Skip to content

Unsafe.WriteUnaligned with Vector128<T> does unnecessary stack write #11524

@gfoidl

Description

@gfoidl

Unsafe.WriteUnaligned<Vector128<T>> performs an unnecessary write to the stack before the actual data is written to the destination.

using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace ConsoleApp2
{
    /// <summary>
    /// Minimal repro that stores a <see cref="Vector128{T}"/> through a <c>ref byte</c> in two
    /// different ways (<see cref="Write{T}"/> and <see cref="As{T}"/>) so that the machine code
    /// the JIT generates for each can be compared.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Allocates a 16-byte stack buffer and invokes both store variants on it with
        /// <c>T = int</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            // 16 bytes == 128 bits, i.e. exactly one Vector128<T>.
            Span<byte> destination = stackalloc byte[16];
            ref byte dest = ref MemoryMarshal.GetReference(destination);

            Write<int>(ref dest);
            As<int>(ref dest);
        }

        /// <summary>
        /// Stores a zero <see cref="Vector128{T}"/> to <paramref name="dest"/> via
        /// <c>Unsafe.WriteUnaligned</c>.
        /// </summary>
        /// <remarks>
        /// This is the variant under investigation: the JIT listing for this method shows an
        /// additional <c>vmovapd xmmword ptr [rsp], xmm0</c> (a store to a stack temp) before
        /// the store to <c>[rdi]</c>.
        /// </remarks>
        /// <typeparam name="T">Element type of the vector.</typeparam>
        /// <param name="dest">Reference to the first byte of the destination.</param>
        // NOTE(review): NoInlining presumably keeps this method out of Main so it gets its own
        // listing, and AggressiveOptimization presumably forces fully optimized code — confirm.
        [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
        private static void Write<T>(ref byte dest) where T : struct
        {
            Vector128<T> zero = Sse2.SetZeroVector128<T>();
            Unsafe.WriteUnaligned(ref dest, zero);
        }

        /// <summary>
        /// Stores a zero <see cref="Vector128{T}"/> to <paramref name="dest"/> by reinterpreting
        /// the <c>ref byte</c> as a <c>ref Vector128&lt;T&gt;</c> with <c>Unsafe.As</c> and
        /// assigning through it.
        /// </summary>
        /// <remarks>
        /// This is the baseline for comparison: the JIT listing for this method contains only
        /// the <c>vpxor</c> and the single <c>vmovupd</c> store to <c>[rdi]</c>, with a local
        /// frame size of 0.
        /// </remarks>
        /// <typeparam name="T">Element type of the vector.</typeparam>
        /// <param name="dest">Reference to the first byte of the destination.</param>
        // Same attributes as Write<T> so the two listings are compiled under the same options.
        [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
        private static void As<T>(ref byte dest) where T : struct
        {
            Vector128<T> zero = Sse2.SetZeroVector128<T>();
            Unsafe.As<byte, Vector128<T>>(ref dest) = zero;
        }
    }
}
; Assembly listing for method Program:Write(byref)
; Emitting BLENDED_CODE for X64 CPU with AVX - Unix
; Tier-1 compilation
; optimized code
; rsp based frame
; partially interruptible
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  3,  3   )   byref  ->  rdi
;  V01 loc0         [V01,T01] (  3,  3   )  simd16  ->  mm0
;# V02 OutArgs      [V02    ] (  1,  1   )  lclBlk ( 0) [rsp+0x00]   "OutgoingArgSpace"
;  V03 tmp1         [V03    ] (  1,  2   )  simd16  ->  [rsp+0x00]   do-not-enreg[SB] "Inlining Arg"
;  V04 tmp2         [V04    ] (  1,  2   )    long  ->  [rsp+0x00]   do-not-enreg[] V03._00(offs=0x00) P-DEP "field V03._00 (fldOffset=0x0)"
;  V05 tmp3         [V05    ] (  1,  2   )    long  ->  [rsp+0x08]   do-not-enreg[] V03._01(offs=0x08) P-DEP "field V03._01 (fldOffset=0x8)"
;
; Lcl frame size = 24

G_M37822_IG01:
       4883EC18             sub      rsp, 24

G_M37822_IG02:
       C4E179EFC0           vpxor    xmm0, xmm0, xmm0
       C4E179290424         vmovapd  xmmword ptr [rsp], xmm0            ; not necessary
       C4E1791107           vmovupd  xmmword ptr [rdi], xmm0

G_M37822_IG03:
       4883C418             add      rsp, 24
       C3                   ret

; Total bytes of code 25, prolog size 4 for method Program:Write(byref)
; ============================================================
; Assembly listing for method Program:As(byref)
; Emitting BLENDED_CODE for X64 CPU with AVX - Unix
; Tier-1 compilation
; optimized code
; rsp based frame
; partially interruptible
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  3,  3   )   byref  ->  rdi
;  V01 loc0         [V01,T01] (  2,  2   )  simd16  ->  mm0
;# V02 OutArgs      [V02    ] (  1,  1   )  lclBlk ( 0) [rsp+0x00]   "OutgoingArgSpace"
;
; Lcl frame size = 0

G_M39603_IG01:

G_M39603_IG02:
       C4E179EFC0           vpxor    xmm0, xmm0, xmm0
       C4E1791107           vmovupd  xmmword ptr [rdi], xmm0

G_M39603_IG03:
       C3                   ret

; Total bytes of code 11, prolog size 0 for method Program:As(byref)
; ============================================================

This is actually a really contrived example, but I saw this same behavior in production code with different values for Vector128 (not only with 0).

Vector256<T> is not affected; the JIT produces optimal code for it.

/cc: @tannergooding

Metadata

Metadata

Assignees

No one assigned

    Labels

    area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI) · enhancement (Product code improvement that does NOT require public API changes/additions) · optimization

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions