-
Notifications
You must be signed in to change notification settings - Fork 5.3k
Closed
Labels
area-CodeGen-coreclrCLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMICLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
Milestone
Description
It appears that consecutive ldr or str pairs are not coalesced for vector operands while it should be legal IIRC.
Consider the following code (it was also reformated in a way so that vector registers written in a loop are allocated next to each other):
private void InitializeSpanCore(Span<int> destination)
{
var (direction, start) = _start < _end
? (1, _start)
: (-1, _start - 1);
var width = Vector<int>.Count;
var stride = Vector<int>.Count * 2;
var remainder = destination.Length % stride;
var initMask = Unsafe.ReadUnaligned<Vector<int>>(
ref Unsafe.As<int, byte>(ref MemoryMarshal.GetReference(
(ReadOnlySpan<int>)new int[] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 })));
var initMask2 = new Vector<int>(width) * direction;
var mask = new Vector<int>(stride) * direction;
var value = new Vector<int>(start) + (initMask * direction);
var value2 = value + initMask2;
ref var pos = ref MemoryMarshal.GetReference(destination);
ref var limit = ref Unsafe.Add(ref pos, destination.Length - remainder);
do
{
Unsafe.WriteUnaligned(ref ByteRef(ref pos), value);
Unsafe.WriteUnaligned(ref ByteRef(ref Unsafe.Add(ref pos, width)), value2);
value += mask;
value2 += mask;
pos = ref Unsafe.Add(ref pos, stride);
}
while (Unsafe.IsAddressLessThan(ref pos, ref limit));
var num = start + ((destination.Length - remainder) * direction);
limit = ref Unsafe.Add(ref limit, remainder);
while (Unsafe.IsAddressLessThan(ref pos, ref limit))
{
pos = num;
pos = ref Unsafe.Add(ref pos, 1);
num += direction;
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static ref byte ByteRef<T>(ref T source) => ref Unsafe.As<T, byte>(ref source);The resulting codegen is
; Assembly listing for method System.Linq.RangeEnumerable:InitializeSpanCore(System.Span`1[int]):this
; Emitting BLENDED_CODE for generic ARM64 CPU - MacOS
; Tier-1 compilation
; optimized code
; optimized using Dynamic PGO
; fp based frame
; fully interruptible
; with Dynamic PGO: edge weights are valid, and fgCalledCount is 1017911
; 0 inlinees with PGO data; 7 single block inlinees; 0 inlinees without PGO data
G_M000_IG01: ;; offset=0000H
A9BF7BFD stp fp, lr, [sp, #-0x10]!
910003FD mov fp, sp
G_M000_IG02: ;; offset=0008H
B9400003 ldr w3, [x0]
B9400400 ldr w0, [x0, #0x04]
6B00007F cmp w3, w0
5400054A bge G_M000_IG08
52800020 mov w0, #1
G_M000_IG03: ;; offset=001CH
12000844 and w4, w2, #7
6B0203E5 negs w5, w2
120008A5 and w5, w5, #7
5A854484 csneg w4, w4, w5, mi
4E041C10 ins v16.s[0], w0
9C000511 ldr q17, [@RWD00]
4F908231 mul v17.4s, v17.4s, v16.s[0]
9C000552 ldr q18, [@RWD16]
4F908252 mul v18.4s, v18.4s, v16.s[0]
9C000593 ldr q19, [@RWD32]
4F908270 mul v16.4s, v19.4s, v16.s[0]
4E040C73 dup v19.4s, w3
4EB08670 add v16.4s, v19.4s, v16.4s
4EB18611 add v17.4s, v16.4s, v17.4s
4B040042 sub w2, w2, w4
937E7C45 sbfiz x5, x2, #2, #32
8B0100A5 add x5, x5, x1
align [0 bytes for IG04]
align [0 bytes]
align [0 bytes]
align [0 bytes]
G_M000_IG04: ;; offset=0060H
3D800030 str q16, [x1]
3D800431 str q17, [x1, #0x10]
4EB28610 add v16.4s, v16.4s, v18.4s
4EB28631 add v17.4s, v17.4s, v18.4s
91008021 add x1, x1, #32
EB05003F cmp x1, x5
54FFFF43 blo G_M000_IG04
G_M000_IG05: ;; offset=007CH
1B000C43 madd w3, w2, w0, w3
937E7C82 sbfiz x2, x4, #2, #32
8B0200A5 add x5, x5, x2
EB05003F cmp x1, x5
54000142 bhs G_M000_IG07
D503201F align [4 bytes for IG06]
D503201F align [4 bytes]
D503201F align [4 bytes]
D503201F align [4 bytes]
G_M000_IG06: ;; offset=00A0H
B9000023 str w3, [x1]
91001021 add x1, x1, #4
0B000063 add w3, w3, w0
EB05003F cmp x1, x5
54FFFF83 blo G_M000_IG06
G_M000_IG07: ;; offset=00B4H
A8C17BFD ldp fp, lr, [sp], #0x10
D65F03C0 ret lr
G_M000_IG08: ;; offset=00BCH
51000463 sub w3, w3, #1
12800000 movn w0, #0
17FFFFD6 b G_M000_IG03
RWD00 dq 0000000400000004h, 0000000400000004h
RWD16 dq 0000000800000008h, 0000000800000008h
RWD32 dq 0000000100000000h, 0000000300000002h
; Total bytes of code 200Interestingly enough, if instead of storing vectors as
Unsafe.WriteUnaligned(ref ByteRef(ref pos), value);
Unsafe.WriteUnaligned(ref ByteRef(ref Unsafe.Add(ref pos, width)), value2);
They are stored as Unsafe.WriteUnaligned(ref ByteRef(ref pos), (value, value2)), the codegen changes from
G_M000_IG04: ;; offset=0060H
3D800030 str q16, [x1]
3D800431 str q17, [x1, #0x10]
4EB28610 add v16.4s, v16.4s, v18.4s
4EB28631 add v17.4s, v17.4s, v18.4s
91008021 add x1, x1, #32
EB05003F cmp x1, x5
54FFFF43 blo G_M000_IG04to
G_M000_IG04: ;; offset=0060H
A9017FBF stp xzr, xzr, [fp, #0x10]
A9027FBF stp xzr, xzr, [fp, #0x20]
3D8007B0 str q16, [fp, #0x10]
3D800BB1 str q17, [fp, #0x20]
AD40D3B3 ldp q19, q20, [fp, #0x10]
AD005033 stp q19, q20, [x1]
4EB28610 add v16.4s, v16.4s, v18.4s
4EB28631 add v17.4s, v17.4s, v18.4s
91008021 add x1, x1, #32
EB05003F cmp x1, x5
54FFFEC3 blo G_M000_IG04which is overall worse by does show that JIT is capable of emitting ldp/stp for SIMD registers.
Metadata
Metadata
Assignees
Labels
area-CodeGen-coreclrCLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMICLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI