-
Notifications
You must be signed in to change notification settings - Fork 5.4k
Closed
Labels
area-CodeGen-coreclrCLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMICLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMIenhancementProduct code improvement that does NOT require public API changes/additionsProduct code improvement that does NOT require public API changes/additionsoptimization
Milestone
Description
Unsafe.WriteUnaligned<Vector128<T>> performs an unnecessary write to the stack before the actual data is written to the destination.
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace ConsoleApp2
{
// Minimal repro: stores a zero Vector128<T> through a `ref byte` in two different ways
// (Unsafe.WriteUnaligned vs. Unsafe.As + assignment) so the generated code can be compared.
class Program
{
// Allocates a 16-byte stack buffer (the size of one Vector128) and passes a reference
// to its first byte to both store variants, instantiated with T = int.
static void Main(string[] args)
{
Span<byte> destination = stackalloc byte[16];
ref byte dest = ref MemoryMarshal.GetReference(destination);
Write<int>(ref dest);
As<int>(ref dest);
}
// Variant 1: writes the zero vector to `dest` via Unsafe.WriteUnaligned.
// In the listing for Program:Write in this report, this variant has an extra
// `vmovapd xmmword ptr [rsp], xmm0` before the store to the destination.
// NoInlining: presumably so the method is compiled on its own and gets a separate listing.
[MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
private static void Write<T>(ref byte dest) where T : struct
{
Vector128<T> zero = Sse2.SetZeroVector128<T>();
Unsafe.WriteUnaligned(ref dest, zero);
}
// Variant 2: reinterprets `dest` as a ref Vector128<T> via Unsafe.As and assigns the
// zero vector through it. In the listing for Program:As in this report, this variant
// has only the single store to the destination and no stack frame.
[MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
private static void As<T>(ref byte dest) where T : struct
{
Vector128<T> zero = Sse2.SetZeroVector128<T>();
Unsafe.As<byte, Vector128<T>>(ref dest) = zero;
}
}
}
; Assembly listing for method Program:Write(byref)
; Emitting BLENDED_CODE for X64 CPU with AVX - Unix
; Tier-1 compilation
; optimized code
; rsp based frame
; partially interruptible
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 3, 3 ) byref -> rdi
; V01 loc0 [V01,T01] ( 3, 3 ) simd16 -> mm0
;# V02 OutArgs [V02 ] ( 1, 1 ) lclBlk ( 0) [rsp+0x00] "OutgoingArgSpace"
; V03 tmp1 [V03 ] ( 1, 2 ) simd16 -> [rsp+0x00] do-not-enreg[SB] "Inlining Arg"
; V04 tmp2 [V04 ] ( 1, 2 ) long -> [rsp+0x00] do-not-enreg[] V03._00(offs=0x00) P-DEP "field V03._00 (fldOffset=0x0)"
; V05 tmp3 [V05 ] ( 1, 2 ) long -> [rsp+0x08] do-not-enreg[] V03._01(offs=0x08) P-DEP "field V03._01 (fldOffset=0x8)"
;
; Lcl frame size = 24
G_M37822_IG01:
4883EC18 sub rsp, 24
G_M37822_IG02:
C4E179EFC0 vpxor xmm0, xmm0, xmm0
C4E179290424 vmovapd xmmword ptr [rsp], xmm0 ; not necessary
C4E1791107 vmovupd xmmword ptr [rdi], xmm0
G_M37822_IG03:
4883C418 add rsp, 24
C3 ret
; Total bytes of code 25, prolog size 4 for method Program:Write(byref)
; ============================================================
; Assembly listing for method Program:As(byref)
; Emitting BLENDED_CODE for X64 CPU with AVX - Unix
; Tier-1 compilation
; optimized code
; rsp based frame
; partially interruptible
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 3, 3 ) byref -> rdi
; V01 loc0 [V01,T01] ( 2, 2 ) simd16 -> mm0
;# V02 OutArgs [V02 ] ( 1, 1 ) lclBlk ( 0) [rsp+0x00] "OutgoingArgSpace"
;
; Lcl frame size = 0
G_M39603_IG01:
G_M39603_IG02:
C4E179EFC0 vpxor xmm0, xmm0, xmm0
C4E1791107 vmovupd xmmword ptr [rdi], xmm0
G_M39603_IG03:
C3 ret
; Total bytes of code 11, prolog size 0 for method Program:As(byref)
; ============================================================
This is admittedly a contrived example, but I saw the same behavior in production code with other Vector128 values (not only zero).
Vector256<T> is not affected and produces optimal code.
/cc: @tannergooding
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
area-CodeGen-coreclrCLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMICLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMIenhancementProduct code improvement that does NOT require public API changes/additionsProduct code improvement that does NOT require public API changes/additionsoptimization