Skip to content

Compiler overwrites wrong memory location in .NET 8 RC1 when using Unsafe.Add and Vector256<uint>[0] #92590

@jwdj

Description

@jwdj

Description

In a release build with .NET 8 RC1 the compiler overwrites memory it shouldn't touch.

Input is a Span<byte> of pixels with layout RGBA, where every fourth byte is an alpha byte set to 255 (index 3, 7, 11, etc).
The code only overwrites the RGB bytes (indexes 0, 1, 2, 4, 5, 6, 8, etc), but the compiler overwrites 3, 7, 11, etc too (with 0).
The overwrite happens when the byte three positions earlier is written, through a ref byte, with the first element of a Vector256<uint>:

Unsafe.Add(ref dst_ptr, 0) = (byte)blurredPixel0[0];    // blurredPixel0 is a Vector256<uint>    dst_ptr is ref byte
//   byte at dst_ptr + 3 is now overwritten with 0

Below is this line in more context. I have tried to shorten the example code but the bug is very specific and is Vector256 related.

Configuration

  • .NET 8.0 RC1
  • OS: Windows 10
  • Only tested with x64

Regression?

Yes, compared to .NET 7.0

Other information

The following configurations work fine:
dotnet run StackBlur --framework net7.0 --configuration Release
dotnet run StackBlur --framework net7.0 --configuration Debug
dotnet run StackBlur --framework net8.0 --configuration Debug

To reproduce the error (from PowerShell):
$env:DOTNET_JitDisasm = 'StackblurPass'
dotnet run StackBlur --framework net8.0 --configuration Release

Here is the project file, StackBlur.csproj:

<Project Sdk="Microsoft.NET.Sdk">
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFrameworks>net7.0;net8.0</TargetFrameworks>
    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>	
  </PropertyGroup>
</Project>

And the source code Program.cs:
The program is rather long; scroll down to the lines that are not indented (the #if NET8_0_OR_GREATER blocks) to find the issue.

using System;
using System.Numerics;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Diagnostics;

namespace Application
{
    class Program
    {
        /// Repro driver: builds an all-0xFF RGBA buffer, blurs a 384x384 area of it
        /// and prints "FAILED" as soon as any alpha byte (every fourth byte) is no longer 255.
        static void Main(string[] args)
        {
            const int ImageWidth = 1920;
            const int ImageHeight = 1080;
            const int BytesPerPixel = sizeof(uint);
            const byte Opaque = 255;

            Console.WriteLine(Environment.Version);

            Span<byte> pixels = new byte[ImageWidth * ImageHeight * BytesPerPixel];

            // Set the alpha byte of every pixel (offset 3 within each 4-byte pixel).
            for (int alphaIndex = BytesPerPixel - 1; alphaIndex < pixels.Length; alphaIndex += BytesPerPixel)
            {
                pixels[alphaIndex] = Opaque;
            }

            // Then set every remaining byte to 255 as well (this also covers the alpha bytes again).
            pixels.Fill(Opaque);

            int rowStride = ImageWidth * BytesPerPixel;
            var area = Vector128.Create(0, 0, 384, 384);
            StackBlur(pixels, ImageWidth, ImageHeight, rowStride, radius: 100, area);

            // The alpha channel must still be fully opaque; stop at the first damaged byte.
            for (int alphaIndex = BytesPerPixel - 1; alphaIndex < pixels.Length; alphaIndex += BytesPerPixel)
            {
                if (pixels[alphaIndex] == Opaque)
                {
                    continue;
                }

                Console.WriteLine("FAILED");
                return;
            }
        }




        // Multiplier lookup table with 255 entries, indexed by the blur radius.
        // StackblurPass reads stackblur_mul[radius] and uses it together with
        // stackblur_shr[radius]: each blurred value is (sum * mul) >> shr.
        static ushort[] stackblur_mul = new ushort[255]
       {
        512,512,456,512,328,456,335,512,405,328,271,456,388,335,292,512,
        454,405,364,328,298,271,496,456,420,388,360,335,312,292,273,512,
        482,454,428,405,383,364,345,328,312,298,284,271,259,496,475,456,
        437,420,404,388,374,360,347,335,323,312,302,292,282,273,265,512,
        497,482,468,454,441,428,417,405,394,383,373,364,354,345,337,328,
        320,312,305,298,291,284,278,271,265,259,507,496,485,475,465,456,
        446,437,428,420,412,404,396,388,381,374,367,360,354,347,341,335,
        329,323,318,312,307,302,297,292,287,282,278,273,269,265,261,512,
        505,497,489,482,475,468,461,454,447,441,435,428,422,417,411,405,
        399,394,389,383,378,373,368,364,359,354,350,345,341,337,332,328,
        324,320,316,312,309,305,301,298,294,291,287,284,281,278,274,271,
        268,265,262,259,257,507,501,496,491,485,480,475,470,465,460,456,
        451,446,442,437,433,428,424,420,416,412,408,404,400,396,392,388,
        385,381,377,374,370,367,363,360,357,354,350,347,344,341,338,335,
        332,329,326,323,320,318,315,312,310,307,304,302,299,297,294,292,
        289,287,285,282,280,278,275,273,271,269,267,265,263,261,259
       };

        // Right-shift lookup table with 255 entries, indexed by the blur radius.
        // StackblurPass reads stackblur_shr[radius] as the shift count applied after
        // multiplying a running sum by stackblur_mul[radius].
        static ushort[] stackblur_shr = new ushort[255]
       {
        9, 11, 12, 13, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17,
        17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 18, 19,
        19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20,
        20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21,
        21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
        21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22,
        22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
        22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23,
        23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
        23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
        23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
        23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
        24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
        24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
        24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
        24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24
       };


        // Runs one stack-blur pass, in place, over the requested area of src.
        //
        // The pass runs along X by default. When vertical is true the X/Y strides and
        // bounds are swapped, so the same loops then walk the image along Y.
        //
        // Two code paths exist:
        //   * AVX2 path: handles 8 lines per outer iteration and only the first three
        //     bytes of each pixel (offsets 0, 1, 2). It never intentionally writes offset 3.
        //   * Fallback path: handles one line per outer iteration and all four bytes
        //     of each pixel (offsets 0..3).
        //
        // radius indexes stackblur_mul / stackblur_shr directly, so it must be below 255
        // (both tables have 255 entries).
        //[MethodImpl(MethodImplOptions.NoOptimization)] // also 'fixes' the issue
        static void StackblurPass(Span<byte> src,              ///< input image data
                          int width,                    ///< image width
                          int height,                   ///< image height
                          int rowStride,
                          uint radius,              ///< blur intensity (should be in 2..254 range)
                          bool vertical,
                          Vector128<int> area
                          )
        {
            // Every blurred value is computed as (sum * mul_sum) >> shr_sum.
            uint mul_sum = stackblur_mul[radius];
            int shr_sum = stackblur_shr[radius];

            // area is laid out as (x, y, width, height).
            var areaX = area[0];
            var areaY = area[1];

            var areaWidth = area[2];
            var areaHeight = area[3];

            // strideX: bytes between neighbouring pixels on a line; strideY: bytes between lines.
            int strideX = 4;
            int strideY = rowStride;
            // Clamp the area's origin to be non-negative.
            int areaMinX = areaX >= 0 ? areaX : 0;
            int areaMinY = areaY >= 0 ? areaY : 0;

            // Clamp the area's far edge to the last valid pixel of the image.
            int maxX = width - 1;
            int maxY = height - 1;
            int areaMaxX = areaX + areaWidth - 1;
            int areaMaxY = areaY + areaHeight - 1;
            if (areaMaxX > maxX) areaMaxX = maxX;
            if (areaMaxY > maxY) areaMaxY = maxY;

            // From here on areaMaxX / areaMaxY are relative to the area's origin.
            areaMaxX -= areaMinX;
            areaMaxY -= areaMinY;

            if (vertical)
            {
                // transpose (swap X and Y)
                (maxX, maxY) = (maxY, maxX);
                (strideX, strideY) = (strideY, strideX);
                (areaMinX, areaMinY) = (areaMinY, areaMinX);
                (areaMaxX, areaMaxY) = (areaMaxY, areaMaxX);
            }

            // 1 + 2 + ... + (radius + 1); multiplied with the first pixel to seed the running sum.
            uint initialSum = (uint)SumNumbers((int)radius + 1);

            // First byte of the pixel at the area's origin.
            ref var sourcePtr = ref src[areaMinX * strideX + areaMinY * strideY];

            if (Avx2.IsSupported && Vector256.IsHardwareAccelerated)
            {
                // Byte offsets of the 24 values handled per step: bytes 0, 1, 2 of the
                // pixel on each of 8 consecutive lines, spread over three 8-lane vectors.
                var idx0 = Vector256.Create(0, 1, 2,
                                            1 * strideY, 1 * strideY + 1, 1 * strideY + 2,
                                            2 * strideY, 2 * strideY + 1);
                var idx1 = Vector256.Create(2 * strideY + 2, 3 * strideY, 3 * strideY + 1,
                                            3 * strideY + 2, 4 * strideY, 4 * strideY + 1,
                                            4 * strideY + 2, 5 * strideY);
                var idx2 = Vector256.Create(5 * strideY + 1, 5 * strideY + 2, 6 * strideY,
                                            6 * strideY + 1, 6 * strideY + 2, 7 * strideY,
                                            7 * strideY + 1, 7 * strideY + 2);
                var lowerByte = Vector256.Create(0xFFU);

                // Loads the 24 values addressed by idx0..idx2 relative to src: each lane is
                // gathered as a 32-bit value at a byte offset (scale 1), then masked down
                // to its lowest byte.
                // NOTE(review): the pointer comes from Unsafe.AsPointer and nothing in this
                // method pins src; presumably unsafe if src refers to movable managed memory - confirm.
                [MethodImpl(MethodImplOptions.AggressiveInlining)]
                unsafe void FillVectorsFromPixels(ref byte src, ref Vector256<uint> data0, ref Vector256<uint> data1, ref Vector256<uint> data2)
                {
                    var src_ptr = (uint*)Unsafe.AsPointer(ref src);

                    data0 = Avx2.GatherVector256(src_ptr, idx0, 1);
                    data1 = Avx2.GatherVector256(src_ptr, idx1, 1);
                    data2 = Avx2.GatherVector256(src_ptr, idx2, 1);
                    data0 = Avx2.And(data0, lowerByte);
                    data1 = Avx2.And(data1, lowerByte);
                    data2 = Avx2.And(data2, lowerByte);
                }

                // One circular stack of radius * 2 + 1 entries per vector.
                Span<Vector256<uint>> stack0 = stackalloc Vector256<uint>[(int)radius * 2 + 1];
                Span<Vector256<uint>> stack1 = stackalloc Vector256<uint>[(int)radius * 2 + 1];
                Span<Vector256<uint>> stack2 = stackalloc Vector256<uint>[(int)radius * 2 + 1];
                // NOTE(review): lines y..y+7 are always read and written, even when y + 7 > areaMaxY;
                // this appears to assume the area spans a multiple of 8 lines - confirm.
                for (int y = 0; y <= areaMaxY; y += 8)
                {
                    var sum_in0 = Vector256<uint>.Zero;
                    var sum_in1 = Vector256<uint>.Zero;
                    var sum_in2 = Vector256<uint>.Zero;

                    var data0 = Vector256<uint>.Zero;
                    var data1 = Vector256<uint>.Zero;
                    var data2 = Vector256<uint>.Zero;

                    ref var src_ptr = ref Unsafe.Add(ref sourcePtr, y * strideY); // start of line (0,y)
                    FillVectorsFromPixels(ref src_ptr, ref data0, ref data1, ref data2);

                    // Seed the sums and the first radius + 1 stack entries from the first pixel.
                    var sum0 = Vector256.Multiply(data0, initialSum);
                    var sum1 = Vector256.Multiply(data1, initialSum);
                    var sum2 = Vector256.Multiply(data2, initialSum);
                    var sum_out0 = Vector256.Multiply(data0, radius + 1);
                    var sum_out1 = Vector256.Multiply(data1, radius + 1);
                    var sum_out2 = Vector256.Multiply(data2, radius + 1);
                    foreach (ref var stack_ptr in stack0.Slice(0, (int)radius + 1))
                    {
                        stack_ptr = data0;
                    }
                    foreach (ref var stack_ptr in stack1.Slice(0, (int)radius + 1))
                    {
                        stack_ptr = data1;
                    }
                    foreach (ref var stack_ptr in stack2.Slice(0, (int)radius + 1))
                    {
                        stack_ptr = data2;
                    }

                    // Fill the remaining stack entries with the next radius pixels; once the
                    // end of the area is reached the last loaded pixel is reused.
                    for (int i = 1; i <= radius; i++)
                    {
                        if (i < areaMaxX)
                        {
                            src_ptr = ref Unsafe.Add(ref src_ptr, strideX);
                            FillVectorsFromPixels(ref src_ptr, ref data0, ref data1, ref data2);
                        }
                        ref var stack_ptr = ref stack0[(int)(i + radius)];
                        stack_ptr = data0;
                        stack_ptr = ref stack1[(int)(i + radius)];
                        stack_ptr = data1;
                        stack_ptr = ref stack2[(int)(i + radius)];
                        stack_ptr = data2;

                        sum0 += data0 * (uint)(radius + 1 - i);
                        sum1 += data1 * (uint)(radius + 1 - i);
                        sum2 += data2 * (uint)(radius + 1 - i);
                        sum_in0 += data0;
                        sum_in1 += data1;
                        sum_in2 += data2;
                    }

                    // sp: current position in the circular stacks; xp: index of the last pixel read.
                    int sp = (int)radius;
                    int xp = (int)radius;
                    if (xp > areaMaxX) xp = areaMaxX;

                    // src_ptr continues reading at pixel xp; dst_ptr writes from the start of the line.
                    src_ptr = ref Unsafe.Add(ref sourcePtr, xp * strideX + y * strideY);
                    ref var dst_ptr = ref Unsafe.Add(ref sourcePtr, y * strideY);
                    for (int x = 0; x <= areaMaxX; x++)
                    {
                        var blurredPixel0 = Vector256.ShiftRightLogical(Vector256.Multiply(sum0, mul_sum), shr_sum);
                        var blurredPixel1 = Vector256.ShiftRightLogical(Vector256.Multiply(sum1, mul_sum), shr_sum);
                        var blurredPixel2 = Vector256.ShiftRightLogical(Vector256.Multiply(sum2, mul_sum), shr_sum);

#if NET8_0_OR_GREATER
Console.WriteLine(Unsafe.Add(ref dst_ptr, 3).ToString("X")); // should print FF
#endif

                        Unsafe.Add(ref dst_ptr, 0) = (byte)blurredPixel0[0];   // bug, should only write a single byte

#if NET8_0_OR_GREATER
Console.WriteLine(Unsafe.Add(ref dst_ptr, 3).ToString("X")); // should print FF, not 00
#endif

                        // Store the 24 lanes back in the same order idx0..idx2 gathered them:
                        // bytes 0, 1, 2 of the pixel on each of the 8 lines.
                        Unsafe.Add(ref dst_ptr, 1) = (byte)blurredPixel0[1];
                        Unsafe.Add(ref dst_ptr, 2) = (byte)blurredPixel0[2];


                        Unsafe.Add(ref dst_ptr, 0 + strideY) = (byte)blurredPixel0[3];
                        Unsafe.Add(ref dst_ptr, 1 + strideY) = (byte)blurredPixel0[4];
                        Unsafe.Add(ref dst_ptr, 2 + strideY) = (byte)blurredPixel0[5];

                        Unsafe.Add(ref dst_ptr, 0 + strideY * 2) = (byte)blurredPixel0[6];
                        Unsafe.Add(ref dst_ptr, 1 + strideY * 2) = (byte)blurredPixel0[7];
                        Unsafe.Add(ref dst_ptr, 2 + strideY * 2) = (byte)blurredPixel1[0];

                        Unsafe.Add(ref dst_ptr, 0 + strideY * 3) = (byte)blurredPixel1[1];
                        Unsafe.Add(ref dst_ptr, 1 + strideY * 3) = (byte)blurredPixel1[2];
                        Unsafe.Add(ref dst_ptr, 2 + strideY * 3) = (byte)blurredPixel1[3];

                        Unsafe.Add(ref dst_ptr, 0 + strideY * 4) = (byte)blurredPixel1[4];
                        Unsafe.Add(ref dst_ptr, 1 + strideY * 4) = (byte)blurredPixel1[5];
                        Unsafe.Add(ref dst_ptr, 2 + strideY * 4) = (byte)blurredPixel1[6];

                        Unsafe.Add(ref dst_ptr, 0 + strideY * 5) = (byte)blurredPixel1[7];
                        Unsafe.Add(ref dst_ptr, 1 + strideY * 5) = (byte)blurredPixel2[0];
                        Unsafe.Add(ref dst_ptr, 2 + strideY * 5) = (byte)blurredPixel2[1];

                        Unsafe.Add(ref dst_ptr, 0 + strideY * 6) = (byte)blurredPixel2[2];
                        Unsafe.Add(ref dst_ptr, 1 + strideY * 6) = (byte)blurredPixel2[3];
                        Unsafe.Add(ref dst_ptr, 2 + strideY * 6) = (byte)blurredPixel2[4];

                        Unsafe.Add(ref dst_ptr, 0 + strideY * 7) = (byte)blurredPixel2[5];
                        Unsafe.Add(ref dst_ptr, 1 + strideY * 7) = (byte)blurredPixel2[6];
                        Unsafe.Add(ref dst_ptr, 2 + strideY * 7) = (byte)blurredPixel2[7];

                        dst_ptr = ref Unsafe.Add(ref dst_ptr, strideX);

                        sum0 -= sum_out0;
                        sum1 -= sum_out1;
                        sum2 -= sum_out2;

                        // Stack slot that is replaced this step (radius + 1 past sp, wrapping around).
                        int stack_start = sp + (int)radius + 1;
                        if (stack_start >= stack0.Length) stack_start -= stack0.Length;

                        ref var stack_ptr0 = ref stack0[stack_start];
                        sum_out0 -= stack_ptr0;

                        ref var stack_ptr1 = ref stack1[stack_start];
                        sum_out1 -= stack_ptr1;

                        ref var stack_ptr2 = ref stack2[stack_start];
                        sum_out2 -= stack_ptr2;

                        // Read the next pixel while there is one; otherwise data0..2 keep the last one.
                        if (xp < areaMaxX)
                        {
                            src_ptr = ref Unsafe.Add(ref src_ptr, strideX);
                            FillVectorsFromPixels(ref src_ptr, ref data0, ref data1, ref data2);
                            ++xp;
                        }

                        stack_ptr0 = data0;
                        sum_in0 += data0;
                        sum0 += sum_in0;

                        stack_ptr1 = data1;
                        sum_in1 += data1;
                        sum1 += sum_in1;

                        stack_ptr2 = data2;
                        sum_in2 += data2;
                        sum2 += sum_in2;

                        // Advance the stack position (wrapping) and move that entry from sum_in to sum_out.
                        ++sp;
                        if (sp >= stack0.Length) sp = 0;
                        stack_ptr0 = ref stack0[sp];
                        sum_out0 += stack_ptr0;
                        sum_in0 -= stack_ptr0;

                        stack_ptr1 = ref stack1[sp];
                        sum_out1 += stack_ptr1;
                        sum_in1 -= stack_ptr1;

                        stack_ptr2 = ref stack2[sp];
                        sum_out2 += stack_ptr2;
                        sum_in2 -= stack_ptr2;
                    }
                }
            }
            else
            {
                // Loads the four bytes of one pixel into the four lanes of data.
                [MethodImpl(MethodImplOptions.AggressiveInlining)]
                static void FillVectorFromPixel(ref Vector128<uint> data, ref byte src_ptr)
                {
                    data = Vector128.Create((uint)src_ptr,
                        (uint)Unsafe.Add(ref src_ptr, 1),
                        (uint)Unsafe.Add(ref src_ptr, 2),
                        (uint)Unsafe.Add(ref src_ptr, 3));
                }

                // Circular stack of radius * 2 + 1 pixels.
                Span<Vector128<uint>> stack = stackalloc Vector128<uint>[(int)radius * 2 + 1];
                for (int y = 0; y <= areaMaxY; y++)
                {
                    var sum_in = Vector128<uint>.Zero;

                    ref byte src_ptr = ref Unsafe.Add(ref sourcePtr, y * strideY); // start of line (0,y)

                    // Seed the sums and the first radius + 1 stack entries from the first pixel.
                    Vector128<uint> data = Vector128<uint>.Zero;
                    FillVectorFromPixel(ref data, ref src_ptr);
                    var sum = Vector128.Multiply(data, initialSum);
                    var sum_out = Vector128.Multiply(data, radius + 1);
                    foreach (ref var stack_ptr in stack.Slice(0, (int)radius + 1))
                    {
                        stack_ptr = data;
                    }

                    // Fill the remaining stack entries with the next radius pixels; once the
                    // end of the area is reached the last loaded pixel is reused.
                    for (int i = 1; i <= radius; i++)
                    {
                        if (i < areaMaxX)
                        {
                            src_ptr = ref Unsafe.Add(ref src_ptr, strideX);
                            FillVectorFromPixel(ref data, ref src_ptr);
                        }
                        ref var stack_ptr = ref stack[(int)(i + radius)];
                        stack_ptr = data;

                        sum += data * (uint)(radius + 1 - i);
                        sum_in += data;
                    }

                    // sp: current position in the circular stack; xp: index of the last pixel read.
                    int sp = (int)radius;
                    int xp = (int)radius;
                    if (xp > areaMaxX) xp = areaMaxX;
                    src_ptr = ref Unsafe.Add(ref sourcePtr, xp * strideX + y * strideY); //   img.pix_ptr(xp, y);
                    ref byte dst_ptr = ref Unsafe.Add(ref sourcePtr, y * strideY); // img.pix_ptr(0, y);
                    for (int x = 0; x <= areaMaxX; x++)
                    {
                        var blurredPixel = Vector128.ShiftRightLogical(Vector128.Multiply(sum, mul_sum), shr_sum);

                        // Unlike the AVX2 path, all four bytes of the pixel are written here.
                        dst_ptr = (byte)blurredPixel[0];
                        Unsafe.Add(ref dst_ptr, 1) = (byte)blurredPixel[1];
                        Unsafe.Add(ref dst_ptr, 2) = (byte)blurredPixel[2];
                        Unsafe.Add(ref dst_ptr, 3) = (byte)blurredPixel[3];
                        dst_ptr = ref Unsafe.Add(ref dst_ptr, strideX);

                        sum -= sum_out;

                        // Stack slot that is replaced this step (radius + 1 past sp, wrapping around).
                        int stack_start = sp + (int)radius + 1;
                        if (stack_start >= stack.Length) stack_start -= stack.Length;
                        ref var stack_ptr = ref stack[stack_start];

                        sum_out -= stack_ptr;

                        // Read the next pixel while there is one; otherwise data keeps the last one.
                        if (xp < areaMaxX)
                        {
                            src_ptr = ref Unsafe.Add(ref src_ptr, strideX);
                            FillVectorFromPixel(ref data, ref src_ptr);
                            ++xp;
                        }

                        stack_ptr = data;
                        sum_in += data;
                        sum += sum_in;

                        // Advance the stack position (wrapping) and move that entry from sum_in to sum_out.
                        ++sp;
                        if (sp >= stack.Length) sp = 0;
                        stack_ptr = ref stack[sp];

                        sum_out += stack_ptr;
                        sum_in -= stack_ptr;
                    }
                }
            }
        }

        /// Returns 0 + 1 + ... + n using the closed form n * (n + 1) / 2 instead of a loop.
        static int SumNumbers(int n)
        {
            int product = n * (n + 1); // always even, so the division below is exact
            return product / 2;
        }

        /// Stack blur over the whole image (algorithm by Mario Klingemann).
        /// Background:
        /// http://www.quasimondo.com/StackBlurForCanvas/StackBlurDemo.html
        /// Based on these C++ implementations:
        /// https://gist.github.com/benjamin9999/3809142
        /// http://www.antigrain.com/__code/include/agg_blur.h.html
        /// Only RGBA pixel data is supported.
        /// Forwards to the area overload with the area (0, 0, w, h).
        static void StackBlur(Span<byte> src,              ///< input image data
                                     int w,                 ///< image width
                                     int h,                 ///< image height
                                     int rowStride,
                                     uint radius                ///< blur intensity (should be in 2..254 range)
                                     )
            => StackBlur(src, w, h, rowStride, radius, Vector128.Create(0, 0, w, h));

        /// Blurs the given area of the image in place: one horizontal pass followed by
        /// one vertical pass. The call does nothing when radius is 0 or greater than 254.
        static void StackBlur(Span<byte> src,              ///< input image data
                                     int w,                 ///< image width
                                     int h,                 ///< image height
                                     int rowStride,
                                     uint radius,               ///< blur intensity (should be in 2..254 range)
                                     Vector128<int> area
                                     )
        {
            bool radiusOutOfRange = radius < 1 || radius > 254;
            if (radiusOutOfRange)
            {
                return;
            }

            // Horizontal pass first, then vertical, both over the same buffer.
            StackblurPass(src, w, h, rowStride, radius, vertical: false, area);
            StackblurPass(src, w, h, rowStride, radius, vertical: true, area);
        }

    }
}

Metadata

Metadata

Assignees

Labels

area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions