
[ARM64] Consecutive loads or stores not coalesced for vector operands #83773

@neon-sunset

Description


It appears that consecutive ldr or str instructions are not coalesced into ldp/stp for vector operands, even though this should be legal IIRC.
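
In isolation, the pattern in question is just two adjacent 16-byte vector stores through the same base reference at offsets 0 and 16. As a distilled sketch (hypothetical StoreTwo helper written against Vector128<int> for concreteness, not part of the original repro):

using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;

static class StorePairSketch
{
    // Two adjacent 16-byte vector stores to consecutive offsets from the same base.
    // The claim in this issue: on ARM64 these become two separate `str q` instructions
    // rather than a single `stp`.
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static void StoreTwo(Span<int> destination, Vector128<int> a, Vector128<int> b)
    {
        ref byte p = ref Unsafe.As<int, byte>(ref MemoryMarshal.GetReference(destination));
        Unsafe.WriteUnaligned(ref p, a);
        Unsafe.WriteUnaligned(ref Unsafe.Add(ref p, 16), b);
    }
}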

Consider the following code (it has also been reformatted so that the vector registers written in the loop are allocated next to each other):

private void InitializeSpanCore(Span<int> destination)
{
    var (direction, start) = _start < _end
        ? (1, _start)
        : (-1, _start - 1);

    var width = Vector<int>.Count;
    var stride = Vector<int>.Count * 2;
    var remainder = destination.Length % stride;

    var initMask = Unsafe.ReadUnaligned<Vector<int>>(
        ref Unsafe.As<int, byte>(ref MemoryMarshal.GetReference(
            (ReadOnlySpan<int>)new int[] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 })));
    var initMask2 = new Vector<int>(width) * direction;

    var mask = new Vector<int>(stride) * direction;
    var value = new Vector<int>(start) + (initMask * direction);
    var value2 = value + initMask2;

    ref var pos = ref MemoryMarshal.GetReference(destination);
    ref var limit = ref Unsafe.Add(ref pos, destination.Length - remainder);
    do
    {
        Unsafe.WriteUnaligned(ref ByteRef(ref pos), value);
        Unsafe.WriteUnaligned(ref ByteRef(ref Unsafe.Add(ref pos, width)), value2);

        value += mask;
        value2 += mask;
        pos = ref Unsafe.Add(ref pos, stride);
    }
    while (Unsafe.IsAddressLessThan(ref pos, ref limit));

    var num = start + ((destination.Length - remainder) * direction);
    limit = ref Unsafe.Add(ref limit, remainder);
    while (Unsafe.IsAddressLessThan(ref pos, ref limit))
    {
        pos = num;
        pos = ref Unsafe.Add(ref pos, 1);
        num += direction;
    }
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static ref byte ByteRef<T>(ref T source) => ref Unsafe.As<T, byte>(ref source);

The resulting codegen is

; Assembly listing for method System.Linq.RangeEnumerable:InitializeSpanCore(System.Span`1[int]):this
; Emitting BLENDED_CODE for generic ARM64 CPU - MacOS
; Tier-1 compilation
; optimized code
; optimized using Dynamic PGO
; fp based frame
; fully interruptible
; with Dynamic PGO: edge weights are valid, and fgCalledCount is 1017911
; 0 inlinees with PGO data; 7 single block inlinees; 0 inlinees without PGO data

G_M000_IG01:                ;; offset=0000H
        A9BF7BFD          stp     fp, lr, [sp, #-0x10]!
        910003FD          mov     fp, sp
 
G_M000_IG02:                ;; offset=0008H
        B9400003          ldr     w3, [x0]
        B9400400          ldr     w0, [x0, #0x04]
        6B00007F          cmp     w3, w0
        5400054A          bge     G_M000_IG08
        52800020          mov     w0, #1
 
G_M000_IG03:                ;; offset=001CH
        12000844          and     w4, w2, #7
        6B0203E5          negs    w5, w2
        120008A5          and     w5, w5, #7
        5A854484          csneg   w4, w4, w5, mi
        4E041C10          ins     v16.s[0], w0
        9C000511          ldr     q17, [@RWD00]
        4F908231          mul     v17.4s, v17.4s, v16.s[0]
        9C000552          ldr     q18, [@RWD16]
        4F908252          mul     v18.4s, v18.4s, v16.s[0]
        9C000593          ldr     q19, [@RWD32]
        4F908270          mul     v16.4s, v19.4s, v16.s[0]
        4E040C73          dup     v19.4s, w3
        4EB08670          add     v16.4s, v19.4s, v16.4s
        4EB18611          add     v17.4s, v16.4s, v17.4s
        4B040042          sub     w2, w2, w4
        937E7C45          sbfiz   x5, x2, #2, #32
        8B0100A5          add     x5, x5, x1
                          align   [0 bytes for IG04]
                          align   [0 bytes]
                          align   [0 bytes]
                          align   [0 bytes]
 
G_M000_IG04:                ;; offset=0060H
        3D800030          str     q16, [x1]
        3D800431          str     q17, [x1, #0x10]
        4EB28610          add     v16.4s, v16.4s, v18.4s
        4EB28631          add     v17.4s, v17.4s, v18.4s
        91008021          add     x1, x1, #32
        EB05003F          cmp     x1, x5
        54FFFF43          blo     G_M000_IG04
 
G_M000_IG05:                ;; offset=007CH
        1B000C43          madd    w3, w2, w0, w3
        937E7C82          sbfiz   x2, x4, #2, #32
        8B0200A5          add     x5, x5, x2
        EB05003F          cmp     x1, x5
        54000142          bhs     G_M000_IG07
        D503201F          align   [4 bytes for IG06]
        D503201F          align   [4 bytes]
        D503201F          align   [4 bytes]
        D503201F          align   [4 bytes]
 
G_M000_IG06:                ;; offset=00A0H
        B9000023          str     w3, [x1]
        91001021          add     x1, x1, #4
        0B000063          add     w3, w3, w0
        EB05003F          cmp     x1, x5
        54FFFF83          blo     G_M000_IG06
 
G_M000_IG07:                ;; offset=00B4H
        A8C17BFD          ldp     fp, lr, [sp], #0x10
        D65F03C0          ret     lr
 
G_M000_IG08:                ;; offset=00BCH
        51000463          sub     w3, w3, #1
        12800000          movn    w0, #0
        17FFFFD6          b       G_M000_IG03
 
RWD00   dq      0000000400000004h, 0000000400000004h
RWD16   dq      0000000800000008h, 0000000800000008h
RWD32   dq      0000000100000000h, 0000000300000002h

; Total bytes of code 200
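
Note that the two str instructions at the top of G_M000_IG04 write q16 to [x1] and q17 to [x1, #0x10], i.e. two 16-byte stores to consecutive offsets from the same base register, so in principle they could be emitted as a single stp q16, q17, [x1].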

Interestingly enough, if instead of storing the vectors as

Unsafe.WriteUnaligned(ref ByteRef(ref pos), value);
Unsafe.WriteUnaligned(ref ByteRef(ref Unsafe.Add(ref pos, width)), value2);

they are stored as Unsafe.WriteUnaligned(ref ByteRef(ref pos), (value, value2)), the codegen changes from

G_M000_IG04:                ;; offset=0060H
        3D800030          str     q16, [x1]
        3D800431          str     q17, [x1, #0x10]
        4EB28610          add     v16.4s, v16.4s, v18.4s
        4EB28631          add     v17.4s, v17.4s, v18.4s
        91008021          add     x1, x1, #32
        EB05003F          cmp     x1, x5
        54FFFF43          blo     G_M000_IG04

to

G_M000_IG04:                ;; offset=0060H
        A9017FBF          stp     xzr, xzr, [fp, #0x10]
        A9027FBF          stp     xzr, xzr, [fp, #0x20]
        3D8007B0          str     q16, [fp, #0x10]
        3D800BB1          str     q17, [fp, #0x20]
        AD40D3B3          ldp     q19, q20, [fp, #0x10]
        AD005033          stp     q19, q20, [x1]
        4EB28610          add     v16.4s, v16.4s, v18.4s
        4EB28631          add     v17.4s, v17.4s, v18.4s
        91008021          add     x1, x1, #32
        EB05003F          cmp     x1, x5
        54FFFEC3          blo     G_M000_IG04

which is overall worse (the two vectors take a detour through a zeroed stack slot before being reloaded and stored as a pair), but it does show that the JIT is capable of emitting ldp/stp for SIMD registers.

Labels: area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI)
