Skip to content

Add AVX512 BMM API #124804

Merged
tannergooding merged 12 commits into dotnet:main from
alexcovington:avx512bmm
Mar 4, 2026
Merged

Add AVX512 BMM API #124804
tannergooding merged 12 commits into dotnet:main from
alexcovington:avx512bmm

Conversation

@alexcovington
Copy link
Contributor

@alexcovington alexcovington commented Feb 24, 2026

This PR implements the AVX512 BMM API.

namespace System.Runtime.Intrinsics.X86
{
    // API surface proposed by this PR for the AVX512 BMM intrinsics.
    // This is a shape listing (members have no bodies), not compilable source.
    public abstract class Avx512Bmm : Avx512F
    {
        // Declared `new`, so it hides the IsSupported member inherited through Avx512F.
        public static new bool IsSupported { get; }

        // Overloads exist for 128-, 256- and 512-bit byte vectors.
        // The disasm samples in this PR show these lowering to `vbitrev`.
        // NOTE(review): presumably reverses the bit order within each byte element - confirm against the ISA documentation.
        public static Vector128<byte> ReverseBits(Vector128<byte> values);
        public static Vector256<byte> ReverseBits(Vector256<byte> values);
        public static Vector512<byte> ReverseBits(Vector512<byte> values);

        // Only 256- and 512-bit ushort overloads are declared; there is no Vector128 form.
        // The disasm samples in this PR show the Or variants lowering to `vbmacor16x16x16`
        // and the Xor variants lowering to `vbmacxor16x16x16`.
        // NOTE(review): exact matrix layout and the role of `addend` are not shown here - confirm against the ISA documentation.
        public static Vector256<ushort> BitMultiplyMatrix16x16WithOrReduction(Vector256<ushort> addend, Vector256<ushort> left, Vector256<ushort> right);
        public static Vector512<ushort> BitMultiplyMatrix16x16WithOrReduction(Vector512<ushort> addend, Vector512<ushort> left, Vector512<ushort> right);
        public static Vector256<ushort> BitMultiplyMatrix16x16WithXorReduction(Vector256<ushort> addend, Vector256<ushort> left, Vector256<ushort> right);
        public static Vector512<ushort> BitMultiplyMatrix16x16WithXorReduction(Vector512<ushort> addend, Vector512<ushort> left, Vector512<ushort> right);
    }
}

Disasm Samples

BitMultiplyMatrix16x16WithOrReduction
// Sample wrapper: forwards its three 256-bit operands, in order, to the OR-reduction intrinsic.
private static Vector256<ushort> BitMultiplyMatrix16x16WithOrReduction_Vector256(Vector256<ushort> x, Vector256<ushort> y, Vector256<ushort> z)
    => Avx512Bmm.BitMultiplyMatrix16x16WithOrReduction(x, y, z);
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:BitMultiplyMatrix16x16WithOrReduction_Vector256(System.Runtime.Intrinsics.Vector256`1[ushort],System.Runtime.Intrinsics.Vector256`1[ushort],System.Runtime.Intrinsics.Vector256`1[ushort]):System.Runtime.Intrinsics.Vector256`1[ushort] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  ymm0, ymmword ptr [rdx]
       vmovups  ymm1, ymmword ptr [r8]
       vbmacor16x16x16 ymm0, ymm1, ymmword ptr [r9]
       vmovups  ymmword ptr [rcx], ymm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x0016
       vzeroupper
       ret

; Total bytes of code 26
// Sample wrapper: forwards its three 512-bit operands, in order, to the OR-reduction intrinsic.
private static Vector512<ushort> BitMultiplyMatrix16x16WithOrReduction_Vector512(Vector512<ushort> x, Vector512<ushort> y, Vector512<ushort> z)
    => Avx512Bmm.BitMultiplyMatrix16x16WithOrReduction(x, y, z);
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:BitMultiplyMatrix16x16WithOrReduction_Vector512(System.Runtime.Intrinsics.Vector512`1[ushort],System.Runtime.Intrinsics.Vector512`1[ushort],System.Runtime.Intrinsics.Vector512`1[ushort]):System.Runtime.Intrinsics.Vector512`1[ushort] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  zmm0, zmmword ptr [rdx]
       vmovups  zmm1, zmmword ptr [r8]
       vbmacor16x16x16 zmm0, zmm1, zmmword ptr [r9]
       vmovups  zmmword ptr [rcx], zmm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x001B
       vzeroupper
       ret

; Total bytes of code 31
BitMultiplyMatrix16x16WithXorReduction
// Sample wrapper: forwards its three 256-bit operands, in order, to the XOR-reduction intrinsic.
private static Vector256<ushort> BitMultiplyMatrix16x16WithXorReduction_Vector256(Vector256<ushort> x, Vector256<ushort> y, Vector256<ushort> z)
    => Avx512Bmm.BitMultiplyMatrix16x16WithXorReduction(x, y, z);
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:BitMultiplyMatrix16x16WithXorReduction_Vector256(System.Runtime.Intrinsics.Vector256`1[ushort],System.Runtime.Intrinsics.Vector256`1[ushort],System.Runtime.Intrinsics.Vector256`1[ushort]):System.Runtime.Intrinsics.Vector256`1[ushort] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  ymm0, ymmword ptr [rdx]
       vmovups  ymm1, ymmword ptr [r8]
       vbmacxor16x16x16 ymm0, ymm1, ymmword ptr [r9]
       vmovups  ymmword ptr [rcx], ymm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x0016
       vzeroupper
       ret

; Total bytes of code 26
// Sample wrapper: forwards its three 512-bit operands, in order, to the XOR-reduction intrinsic.
private static Vector512<ushort> BitMultiplyMatrix16x16WithXorReduction_Vector512(Vector512<ushort> x, Vector512<ushort> y, Vector512<ushort> z)
    => Avx512Bmm.BitMultiplyMatrix16x16WithXorReduction(x, y, z);
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:BitMultiplyMatrix16x16WithXorReduction_Vector512(System.Runtime.Intrinsics.Vector512`1[ushort],System.Runtime.Intrinsics.Vector512`1[ushort],System.Runtime.Intrinsics.Vector512`1[ushort]):System.Runtime.Intrinsics.Vector512`1[ushort] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  zmm0, zmmword ptr [rdx]
       vmovups  zmm1, zmmword ptr [r8]
       vbmacxor16x16x16 zmm0, zmm1, zmmword ptr [r9]
       vmovups  zmmword ptr [rcx], zmm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x001B
       vzeroupper
       ret

; Total bytes of code 31
ReverseBits
// Sample wrapper: passes the 128-bit byte vector straight to Avx512Bmm.ReverseBits.
private static Vector128<byte> ReverseBits_Vector128(Vector128<byte> values)
    => Avx512Bmm.ReverseBits(values);
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Vector128(System.Runtime.Intrinsics.Vector128`1[byte]):System.Runtime.Intrinsics.Vector128`1[byte] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vbitrev  xmm0, xmmword ptr [rdx]
       vmovups  xmmword ptr [rcx], xmm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x000D
       ret

; Total bytes of code 14
// Sample wrapper: passes the 256-bit byte vector straight to Avx512Bmm.ReverseBits.
private static Vector256<byte> ReverseBits_Vector256(Vector256<byte> values)
    => Avx512Bmm.ReverseBits(values);
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Vector256(System.Runtime.Intrinsics.Vector256`1[byte]):System.Runtime.Intrinsics.Vector256`1[byte] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vbitrev  ymm0, ymmword ptr [rdx]
       vmovups  ymmword ptr [rcx], ymm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x000D
       vzeroupper
       ret

; Total bytes of code 17
// Sample wrapper: passes the 512-bit byte vector straight to Avx512Bmm.ReverseBits.
private static Vector512<byte> ReverseBits_Vector512(Vector512<byte> values)
    => Avx512Bmm.ReverseBits(values);
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Vector512(System.Runtime.Intrinsics.Vector512`1[byte]):System.Runtime.Intrinsics.Vector512`1[byte] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vbitrev  zmm0, zmmword ptr [rdx]
       vmovups  zmmword ptr [rcx], zmm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x000F
       vzeroupper
       ret

; Total bytes of code 19
ReverseBits (Merge Masking)
// Merge-masking sample (128-bit): blends the original `values` with their bit-reversed form under `mask`.
// The listing that follows shows this folded into a single `vbitrev` with a {k1} write mask.
private static Vector128<byte> ReverseBits_Mask_Vector128(Vector128<byte> values, Vector128<byte> mask)
    => Avx512BW.BlendVariable(values, Avx512Bmm.ReverseBits(values), mask);
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Mask_Vector128(System.Runtime.Intrinsics.Vector128`1[byte],System.Runtime.Intrinsics.Vector128`1[byte]):System.Runtime.Intrinsics.Vector128`1[byte] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  xmm0, xmmword ptr [rdx]
       vmovups  xmm1, xmmword ptr [r8]
       vpmovb2m k1, xmm1
       vbitrev  xmm0 {k1}, xmm0
       vmovups  xmmword ptr [rcx], xmm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x001C
       ret

; Total bytes of code 29
// Merge-masking sample (256-bit): blends the original `values` with their bit-reversed form under `mask`.
// The listing that follows shows this folded into a single `vbitrev` with a {k1} write mask.
private static Vector256<byte> ReverseBits_Mask_Vector256(Vector256<byte> values, Vector256<byte> mask)
    => Avx512BW.BlendVariable(values, Avx512Bmm.ReverseBits(values), mask);
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Mask_Vector256(System.Runtime.Intrinsics.Vector256`1[byte],System.Runtime.Intrinsics.Vector256`1[byte]):System.Runtime.Intrinsics.Vector256`1[byte] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  ymm0, ymmword ptr [rdx]
       vmovups  ymm1, ymmword ptr [r8]
       vpmovb2m k1, ymm1
       vbitrev  ymm0 {k1}, ymm0
       vmovups  ymmword ptr [rcx], ymm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x001C
       vzeroupper
       ret

; Total bytes of code 32
// Merge-masking sample (512-bit): blends the original `values` with their bit-reversed form under `mask`.
// The listing that follows shows this folded into a single `vbitrev` with a {k1} write mask.
private static Vector512<byte> ReverseBits_Mask_Vector512(Vector512<byte> values, Vector512<byte> mask)
    => Avx512BW.BlendVariable(values, Avx512Bmm.ReverseBits(values), mask);
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Mask_Vector512(System.Runtime.Intrinsics.Vector512`1[byte],System.Runtime.Intrinsics.Vector512`1[byte]):System.Runtime.Intrinsics.Vector512`1[byte] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  zmm0, zmmword ptr [rdx]
       vmovups  zmm1, zmmword ptr [r8]
       vpmovb2m k1, zmm1
       vbitrev  zmm0 {k1}, zmm0
       vmovups  zmmword ptr [rcx], zmm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x0021
       vzeroupper
       ret

; Total bytes of code 37
ReverseBits (Zero Masking)
// Zero-masking sample (128-bit): blends an all-zero vector with the bit-reversed `values` under `mask`.
// The listing that follows shows this folded into a single `vbitrev` with {k1}{z}.
private static Vector128<byte> ReverseBits_Maskz_Vector128(Vector128<byte> values, Vector128<byte> mask)
    => Avx512BW.BlendVariable(Vector128<byte>.Zero, Avx512Bmm.ReverseBits(values), mask);
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Maskz_Vector128(System.Runtime.Intrinsics.Vector128`1[byte],System.Runtime.Intrinsics.Vector128`1[byte]):System.Runtime.Intrinsics.Vector128`1[byte] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  xmm0, xmmword ptr [r8]
       vpmovb2m k1, xmm0
       vbitrev  xmm0 {k1}{z}, xmmword ptr [rdx]
       vmovups  xmmword ptr [rcx], xmm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x0018
       ret

; Total bytes of code 25
// Zero-masking sample (256-bit): blends an all-zero vector with the bit-reversed `values` under `mask`.
// The listing that follows shows this folded into a single `vbitrev` with {k1}{z}.
private static Vector256<byte> ReverseBits_Maskz_Vector256(Vector256<byte> values, Vector256<byte> mask)
    => Avx512BW.BlendVariable(Vector256<byte>.Zero, Avx512Bmm.ReverseBits(values), mask);
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Maskz_Vector256(System.Runtime.Intrinsics.Vector256`1[byte],System.Runtime.Intrinsics.Vector256`1[byte]):System.Runtime.Intrinsics.Vector256`1[byte] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  ymm0, ymmword ptr [r8]
       vpmovb2m k1, ymm0
       vbitrev  ymm0 {k1}{z}, ymmword ptr [rdx]
       vmovups  ymmword ptr [rcx], ymm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x0018
       vzeroupper
       ret

; Total bytes of code 28
// Zero-masking sample (512-bit): blends an all-zero vector with the bit-reversed `values` under `mask`.
// The listing that follows shows this folded into a single `vbitrev` with {k1}{z}.
private static Vector512<byte> ReverseBits_Maskz_Vector512(Vector512<byte> values, Vector512<byte> mask)
    => Avx512BW.BlendVariable(Vector512<byte>.Zero, Avx512Bmm.ReverseBits(values), mask);
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Maskz_Vector512(System.Runtime.Intrinsics.Vector512`1[byte],System.Runtime.Intrinsics.Vector512`1[byte]):System.Runtime.Intrinsics.Vector512`1[byte] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  zmm0, zmmword ptr [r8]
       vpmovb2m k1, zmm0
       vbitrev  zmm0 {k1}{z}, zmmword ptr [rdx]
       vmovups  zmmword ptr [rcx], zmm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x001B
       vzeroupper
       ret

; Total bytes of code 31

@github-actions github-actions bot added the area-CodeGen-coreclr label (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI) Feb 24, 2026
@dotnet-policy-service dotnet-policy-service bot added the community-contribution label (Indicates that the PR has been added by a community member) Feb 24, 2026
@dotnet-policy-service
Copy link
Contributor

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

@alexcovington alexcovington marked this pull request as draft February 24, 2026 18:12
@tannergooding
Copy link
Member

tannergooding commented Feb 26, 2026

LGTM minus the nit in the CPUID check (and then this can be unmarked as draft as well)

CC. @dotnet/jit-contrib, @EgorBo for secondary review

@tannergooding tannergooding requested a review from EgorBo February 26, 2026 17:19
@alexcovington alexcovington marked this pull request as ready for review February 26, 2026 18:28
Alex Covington (Advanced Micro Devices Inc) added 4 commits March 2, 2026 09:52
Copy link
Member

@kg kg left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM as well

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
community-contribution: Indicates that the PR has been added by a community member

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants