-
Notifications
You must be signed in to change notification settings - Fork 5.3k
Description
Description
In a release build with .NET 8 RC1 the compiler overwrites memory it shouldn't touch.
Input is a Span<byte> of pixels with layout RGBA, where every fourth byte is an alpha byte set to 255 (index 3, 7, 11, etc).
The code only overwrites the RGB bytes (indexes 0, 1, 2, 4, 5, 6, 8, etc), but the compiler overwrites 3, 7, 11, etc too (with 0).
This overwrite happens when a ref byte (3 bytes earlier) is assigned the first element of a Vector256:
Unsafe.Add(ref dst_ptr, 0) = (byte)blurredPixel0[0]; // blurredPixel0 is a Vector256<uint> dst_ptr is ref byte
// byte at dst_ptr + 3 is now overwritten with 0
Below is this line in more context. I have tried to shorten the example code, but the bug is very specific and is Vector256-related.
Configuration
- .NET 8.0 RC1
- OS: Windows 10
- Only tested with x64
Regression?
Yes, compared to .NET 7.0
Other information
The following configurations work fine:
dotnet run StackBlur --framework net7.0 --configuration Release
dotnet run StackBlur --framework net7.0 --configuration Debug
dotnet run StackBlur --framework net8.0 --configuration Debug
To reproduce the error (from PowerShell):
$env:DOTNET_JitDisasm = 'StackblurPass'
dotnet run StackBlur --framework net8.0 --configuration Release
Here is the project file StackBlur.csproj:
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFrameworks>net7.0;net8.0</TargetFrameworks>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
</PropertyGroup>
</Project>
And the source code Program.cs:
The program is rather long but scroll down to where it is not indented to find the issue.
using System;
using System.Numerics;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Diagnostics;
namespace Application
{
class Program
{
/// Repro driver: builds a 1920x1080 buffer with 4 bytes per pixel, blurs the
/// top-left 384x384 region with radius 100, then checks that every fourth byte
/// (offset 3 of each pixel) is still 255. Prints "FAILED" on the first mismatch
/// and prints nothing further on success.
static void Main(string[] args)
{
    Console.WriteLine(Environment.Version);

    const int Width = 1920;
    const int Height = 1080;
    const int BytesPerPixel = sizeof(uint);
    const int AlphaOffset = 3;
    const byte Opaque = 255;

    Span<byte> pixels = new byte[Width * Height * BytesPerPixel];

    // fill alpha channel
    int alphaIndex = AlphaOffset;
    while (alphaIndex < pixels.Length)
    {
        pixels[alphaIndex] = Opaque;
        alphaIndex += BytesPerPixel;
    }

    // Every byte becomes 255 here, which also covers the alpha bytes set above.
    pixels.Fill(Opaque);

    Vector128<int> area = Vector128.Create(0, 0, 384, 384);
    StackBlur(pixels, Width, Height, Width * BytesPerPixel, radius: 100, area);

    // Verify that no byte at offset 3 of any pixel was modified by the blur.
    int checkIndex = AlphaOffset;
    while (checkIndex < pixels.Length)
    {
        bool alphaIntact = pixels[checkIndex] == Opaque;
        if (!alphaIntact)
        {
            Console.WriteLine("FAILED");
            return;
        }
        checkIndex += BytesPerPixel;
    }
}
// Per-radius multiplier table, indexed directly by the blur radius (StackblurPass
// reads stackblur_mul[radius]). It has 255 entries, so valid indices are 0..254.
// StackblurPass scales each running sum as (sum * stackblur_mul[radius]) >> stackblur_shr[radius].
// NOTE(review): presumably this multiply/shift pair is a fixed-point reciprocal of the
// total kernel weight for the given radius — confirm against the reference implementation.
static ushort[] stackblur_mul = new ushort[255]
{
512,512,456,512,328,456,335,512,405,328,271,456,388,335,292,512,
454,405,364,328,298,271,496,456,420,388,360,335,312,292,273,512,
482,454,428,405,383,364,345,328,312,298,284,271,259,496,475,456,
437,420,404,388,374,360,347,335,323,312,302,292,282,273,265,512,
497,482,468,454,441,428,417,405,394,383,373,364,354,345,337,328,
320,312,305,298,291,284,278,271,265,259,507,496,485,475,465,456,
446,437,428,420,412,404,396,388,381,374,367,360,354,347,341,335,
329,323,318,312,307,302,297,292,287,282,278,273,269,265,261,512,
505,497,489,482,475,468,461,454,447,441,435,428,422,417,411,405,
399,394,389,383,378,373,368,364,359,354,350,345,341,337,332,328,
324,320,316,312,309,305,301,298,294,291,287,284,281,278,274,271,
268,265,262,259,257,507,501,496,491,485,480,475,470,465,460,456,
451,446,442,437,433,428,424,420,416,412,408,404,400,396,392,388,
385,381,377,374,370,367,363,360,357,354,350,347,344,341,338,335,
332,329,326,323,320,318,315,312,310,307,304,302,299,297,294,292,
289,287,285,282,280,278,275,273,271,269,267,265,263,261,259
};
// Per-radius right-shift table, indexed directly by the blur radius (StackblurPass
// reads stackblur_shr[radius]). It has 255 entries, so valid indices are 0..254.
// It is the shift half of the (sum * stackblur_mul[radius]) >> stackblur_shr[radius] scaling.
static ushort[] stackblur_shr = new ushort[255]
{
9, 11, 12, 13, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17,
17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 18, 19,
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21,
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22,
22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24
};
// Runs ONE directional pass of the stack blur, in place, over the rectangle `area`
// of a 4-bytes-per-pixel buffer.
//
//   src       - pixel buffer; read and written in place.
//   width     - image width in pixels.
//   height    - image height in pixels.
//   rowStride - distance in bytes between the starts of two consecutive rows.
//   radius    - blur radius; used as an index into stackblur_mul / stackblur_shr.
//   vertical  - false: blur along rows; true: blur along columns. The vertical pass is
//               implemented by swapping the X/Y strides and extents, so the code below
//               always "walks along X" and "steps lines along Y".
//   area      - elements 0..3 are x, y, width, height of the rectangle to blur.
//
// Two implementations follow: an AVX2 path that handles 8 lines at a time but only the
// first three bytes of each pixel, and a Vector128 fallback that handles one line at a
// time and all four bytes of each pixel.
//[MethodImpl(MethodImplOptions.NoOptimization)] // also 'fixes' the issue
static void StackblurPass(Span<byte> src, ///< input image data
int width, ///< image width
int height, ///< image height
int rowStride,
uint radius, ///< blur intensity (should be in 2..254 range)
bool vertical,
Vector128<int> area
)
{
// Fixed-point scaling for this radius: output = (sum * mul_sum) >> shr_sum.
uint mul_sum = stackblur_mul[radius];
int shr_sum = stackblur_shr[radius];
var areaX = area[0];
var areaY = area[1];
var areaWidth = area[2];
var areaHeight = area[3];
// Byte distance between neighbouring pixels along X (4 bytes per pixel) and along Y.
int strideX = 4;
int strideY = rowStride;
// Clamp the rectangle to the image: a negative origin becomes 0, the far edge is
// limited to the last pixel index.
int areaMinX = areaX >= 0 ? areaX : 0;
int areaMinY = areaY >= 0 ? areaY : 0;
int maxX = width - 1;
int maxY = height - 1;
int areaMaxX = areaX + areaWidth - 1;
int areaMaxY = areaY + areaHeight - 1;
if (areaMaxX > maxX) areaMaxX = maxX;
if (areaMaxY > maxY) areaMaxY = maxY;
// From here on areaMaxX / areaMaxY are the last indices RELATIVE to the area origin.
areaMaxX -= areaMinX;
areaMaxY -= areaMinY;
if (vertical)
{
// transpose (swap X and Y)
(maxX, maxY) = (maxY, maxX);
(strideX, strideY) = (strideY, strideX);
(areaMinX, areaMinY) = (areaMinY, areaMinX);
(areaMaxX, areaMaxY) = (areaMaxY, areaMaxX);
}
// 1 + 2 + ... + (radius + 1): the combined weight of the radius + 1 copies of the
// first pixel that seed the running sum.
uint initialSum = (uint)SumNumbers((int)radius + 1);
// Reference to the first byte of the area's origin pixel. This is the only access that
// goes through the Span indexer; every later access uses Unsafe.Add on this ref.
ref var sourcePtr = ref src[areaMinX * strideX + areaMinY * strideY];
if (Avx2.IsSupported && Vector256.IsHardwareAccelerated)
{
// Byte offsets (relative to a pixel in the first of 8 consecutive lines) of bytes
// 0, 1 and 2 of the corresponding pixel in each of the 8 lines: 8 lines x 3 bytes =
// 24 offsets, spread over three 8-lane index vectors. Byte 3 of each pixel is never
// part of these offsets.
var idx0 = Vector256.Create(0, 1, 2,
1 * strideY, 1 * strideY + 1, 1 * strideY + 2,
2 * strideY, 2 * strideY + 1);
var idx1 = Vector256.Create(2 * strideY + 2, 3 * strideY, 3 * strideY + 1,
3 * strideY + 2, 4 * strideY, 4 * strideY + 1,
4 * strideY + 2, 5 * strideY);
var idx2 = Vector256.Create(5 * strideY + 1, 5 * strideY + 2, 6 * strideY,
6 * strideY + 1, 6 * strideY + 2, 7 * strideY,
7 * strideY + 1, 7 * strideY + 2);
var lowerByte = Vector256.Create(0xFFU);
// Loads the 24 bytes addressed by idx0..idx2 (relative to `src`) into the 24 uint
// lanes of data0..data2. The gather uses scale 1, so the indices are byte offsets;
// each lane loads a 32-bit value and is then masked down to its low byte.
// NOTE(review): each gather lane reads 4 bytes starting at its offset, so up to 3
// bytes past the addressed byte are read — confirm this cannot run past the buffer end.
// NOTE(review): Unsafe.AsPointer does not pin; if `src` refers into a managed array
// the GC could move it while the raw pointer is in use — confirm.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
unsafe void FillVectorsFromPixels(ref byte src, ref Vector256<uint> data0, ref Vector256<uint> data1, ref Vector256<uint> data2)
{
var src_ptr = (uint*)Unsafe.AsPointer(ref src);
data0 = Avx2.GatherVector256(src_ptr, idx0, 1);
data1 = Avx2.GatherVector256(src_ptr, idx1, 1);
data2 = Avx2.GatherVector256(src_ptr, idx2, 1);
data0 = Avx2.And(data0, lowerByte);
data1 = Avx2.And(data1, lowerByte);
data2 = Avx2.And(data2, lowerByte);
}
// Three parallel circular buffers ("stacks") of 2 * radius + 1 entries, one per
// 8-lane slice of the 24 tracked bytes.
Span<Vector256<uint>> stack0 = stackalloc Vector256<uint>[(int)radius * 2 + 1];
Span<Vector256<uint>> stack1 = stackalloc Vector256<uint>[(int)radius * 2 + 1];
Span<Vector256<uint>> stack2 = stackalloc Vector256<uint>[(int)radius * 2 + 1];
// Process 8 lines per iteration.
// NOTE(review): when (areaMaxY + 1) is not a multiple of 8, the last iteration still
// addresses 8 lines, i.e. lines beyond areaMaxY — confirm callers only pass such areas.
for (int y = 0; y <= areaMaxY; y += 8)
{
var sum_in0 = Vector256<uint>.Zero;
var sum_in1 = Vector256<uint>.Zero;
var sum_in2 = Vector256<uint>.Zero;
var data0 = Vector256<uint>.Zero;
var data1 = Vector256<uint>.Zero;
var data2 = Vector256<uint>.Zero;
ref var src_ptr = ref Unsafe.Add(ref sourcePtr, y * strideY); // start of line (0,y)
FillVectorsFromPixels(ref src_ptr, ref data0, ref data1, ref data2);
// Seed the running sums as if the first pixel were repeated radius + 1 times.
var sum0 = Vector256.Multiply(data0, initialSum);
var sum1 = Vector256.Multiply(data1, initialSum);
var sum2 = Vector256.Multiply(data2, initialSum);
var sum_out0 = Vector256.Multiply(data0, radius + 1);
var sum_out1 = Vector256.Multiply(data1, radius + 1);
var sum_out2 = Vector256.Multiply(data2, radius + 1);
// Stack slots 0..radius all hold the first pixel.
foreach (ref var stack_ptr in stack0.Slice(0, (int)radius + 1))
{
stack_ptr = data0;
}
foreach (ref var stack_ptr in stack1.Slice(0, (int)radius + 1))
{
stack_ptr = data1;
}
foreach (ref var stack_ptr in stack2.Slice(0, (int)radius + 1))
{
stack_ptr = data2;
}
// Stack slots radius+1..2*radius hold the next `radius` pixels along X, weighted
// radius, radius-1, ..., 1 in the running sum.
for (int i = 1; i <= radius; i++)
{
// Only advance and reload while i is below the last area index; otherwise the
// previously loaded pixel is reused.
if (i < areaMaxX)
{
src_ptr = ref Unsafe.Add(ref src_ptr, strideX);
FillVectorsFromPixels(ref src_ptr, ref data0, ref data1, ref data2);
}
ref var stack_ptr = ref stack0[(int)(i + radius)];
stack_ptr = data0;
stack_ptr = ref stack1[(int)(i + radius)];
stack_ptr = data1;
stack_ptr = ref stack2[(int)(i + radius)];
stack_ptr = data2;
sum0 += data0 * (uint)(radius + 1 - i);
sum1 += data1 * (uint)(radius + 1 - i);
sum2 += data2 * (uint)(radius + 1 - i);
sum_in0 += data0;
sum_in1 += data1;
sum_in2 += data2;
}
// sp: current stack position; xp: X index of the most recently read source pixel,
// clamped to the last area index.
int sp = (int)radius;
int xp = (int)radius;
if (xp > areaMaxX) xp = areaMaxX;
src_ptr = ref Unsafe.Add(ref sourcePtr, xp * strideX + y * strideY);
ref var dst_ptr = ref Unsafe.Add(ref sourcePtr, y * strideY);
for (int x = 0; x <= areaMaxX; x++)
{
var blurredPixel0 = Vector256.ShiftRightLogical(Vector256.Multiply(sum0, mul_sum), shr_sum);
var blurredPixel1 = Vector256.ShiftRightLogical(Vector256.Multiply(sum1, mul_sum), shr_sum);
var blurredPixel2 = Vector256.ShiftRightLogical(Vector256.Multiply(sum2, mul_sum), shr_sum);
#if NET8_0_OR_GREATER
Console.WriteLine(Unsafe.Add(ref dst_ptr, 3).ToString("X")); // should print FF
#endif
// Scatter the 24 result lanes back as single bytes, in the same order as
// idx0..idx2: bytes 0, 1, 2 of the pixel in each of the 8 lines. Byte 3 of each
// pixel is not written by this source code.
Unsafe.Add(ref dst_ptr, 0) = (byte)blurredPixel0[0]; // bug, should only write a single byte
#if NET8_0_OR_GREATER
Console.WriteLine(Unsafe.Add(ref dst_ptr, 3).ToString("X")); // should print FF, not 00
#endif
Unsafe.Add(ref dst_ptr, 1) = (byte)blurredPixel0[1];
Unsafe.Add(ref dst_ptr, 2) = (byte)blurredPixel0[2];
Unsafe.Add(ref dst_ptr, 0 + strideY) = (byte)blurredPixel0[3];
Unsafe.Add(ref dst_ptr, 1 + strideY) = (byte)blurredPixel0[4];
Unsafe.Add(ref dst_ptr, 2 + strideY) = (byte)blurredPixel0[5];
Unsafe.Add(ref dst_ptr, 0 + strideY * 2) = (byte)blurredPixel0[6];
Unsafe.Add(ref dst_ptr, 1 + strideY * 2) = (byte)blurredPixel0[7];
Unsafe.Add(ref dst_ptr, 2 + strideY * 2) = (byte)blurredPixel1[0];
Unsafe.Add(ref dst_ptr, 0 + strideY * 3) = (byte)blurredPixel1[1];
Unsafe.Add(ref dst_ptr, 1 + strideY * 3) = (byte)blurredPixel1[2];
Unsafe.Add(ref dst_ptr, 2 + strideY * 3) = (byte)blurredPixel1[3];
Unsafe.Add(ref dst_ptr, 0 + strideY * 4) = (byte)blurredPixel1[4];
Unsafe.Add(ref dst_ptr, 1 + strideY * 4) = (byte)blurredPixel1[5];
Unsafe.Add(ref dst_ptr, 2 + strideY * 4) = (byte)blurredPixel1[6];
Unsafe.Add(ref dst_ptr, 0 + strideY * 5) = (byte)blurredPixel1[7];
Unsafe.Add(ref dst_ptr, 1 + strideY * 5) = (byte)blurredPixel2[0];
Unsafe.Add(ref dst_ptr, 2 + strideY * 5) = (byte)blurredPixel2[1];
Unsafe.Add(ref dst_ptr, 0 + strideY * 6) = (byte)blurredPixel2[2];
Unsafe.Add(ref dst_ptr, 1 + strideY * 6) = (byte)blurredPixel2[3];
Unsafe.Add(ref dst_ptr, 2 + strideY * 6) = (byte)blurredPixel2[4];
Unsafe.Add(ref dst_ptr, 0 + strideY * 7) = (byte)blurredPixel2[5];
Unsafe.Add(ref dst_ptr, 1 + strideY * 7) = (byte)blurredPixel2[6];
Unsafe.Add(ref dst_ptr, 2 + strideY * 7) = (byte)blurredPixel2[7];
dst_ptr = ref Unsafe.Add(ref dst_ptr, strideX);
// Slide the window one pixel along X: drop the outgoing contribution ...
sum0 -= sum_out0;
sum1 -= sum_out1;
sum2 -= sum_out2;
// ... locate the stack slot that is about to be replaced (circular index) and
// remove its old value from the outgoing sum ...
int stack_start = sp + (int)radius + 1;
if (stack_start >= stack0.Length) stack_start -= stack0.Length;
ref var stack_ptr0 = ref stack0[stack_start];
sum_out0 -= stack_ptr0;
ref var stack_ptr1 = ref stack1[stack_start];
sum_out1 -= stack_ptr1;
ref var stack_ptr2 = ref stack2[stack_start];
sum_out2 -= stack_ptr2;
// ... read the next source pixel, unless the last area index has been reached,
// in which case the previously loaded pixel is reused ...
if (xp < areaMaxX)
{
src_ptr = ref Unsafe.Add(ref src_ptr, strideX);
FillVectorsFromPixels(ref src_ptr, ref data0, ref data1, ref data2);
++xp;
}
// ... store it in the freed slot and add it to the incoming and running sums ...
stack_ptr0 = data0;
sum_in0 += data0;
sum0 += sum_in0;
stack_ptr1 = data1;
sum_in1 += data1;
sum1 += sum_in1;
stack_ptr2 = data2;
sum_in2 += data2;
sum2 += sum_in2;
// ... then advance the stack position (wrapping) and move the entry now at sp
// from the incoming sum to the outgoing sum.
++sp;
if (sp >= stack0.Length) sp = 0;
stack_ptr0 = ref stack0[sp];
sum_out0 += stack_ptr0;
sum_in0 -= stack_ptr0;
stack_ptr1 = ref stack1[sp];
sum_out1 += stack_ptr1;
sum_in1 -= stack_ptr1;
stack_ptr2 = ref stack2[sp];
sum_out2 += stack_ptr2;
sum_in2 -= stack_ptr2;
}
}
}
else
{
// Fallback path: one line at a time, with all four bytes of a pixel held in the four
// lanes of a Vector128<uint>. Unlike the AVX2 path, byte 3 is read, blurred and
// written back here.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void FillVectorFromPixel(ref Vector128<uint> data, ref byte src_ptr)
{
data = Vector128.Create((uint)src_ptr,
(uint)Unsafe.Add(ref src_ptr, 1),
(uint)Unsafe.Add(ref src_ptr, 2),
(uint)Unsafe.Add(ref src_ptr, 3));
}
// Circular buffer of the 2 * radius + 1 pixels currently inside the blur window.
Span<Vector128<uint>> stack = stackalloc Vector128<uint>[(int)radius * 2 + 1];
for (int y = 0; y <= areaMaxY; y++)
{
var sum_in = Vector128<uint>.Zero;
ref byte src_ptr = ref Unsafe.Add(ref sourcePtr, y * strideY); // start of line (0,y)
Vector128<uint> data = Vector128<uint>.Zero;
FillVectorFromPixel(ref data, ref src_ptr);
// Seed the sums as if the first pixel were repeated radius + 1 times.
var sum = Vector128.Multiply(data, initialSum);
var sum_out = Vector128.Multiply(data, radius + 1);
foreach (ref var stack_ptr in stack.Slice(0, (int)radius + 1))
{
stack_ptr = data;
}
// Load the next `radius` pixels into slots radius+1..2*radius with weights
// radius, radius-1, ..., 1.
for (int i = 1; i <= radius; i++)
{
if (i < areaMaxX)
{
src_ptr = ref Unsafe.Add(ref src_ptr, strideX);
FillVectorFromPixel(ref data, ref src_ptr);
}
ref var stack_ptr = ref stack[(int)(i + radius)];
stack_ptr = data;
sum += data * (uint)(radius + 1 - i);
sum_in += data;
}
int sp = (int)radius;
int xp = (int)radius;
if (xp > areaMaxX) xp = areaMaxX;
src_ptr = ref Unsafe.Add(ref sourcePtr, xp * strideX + y * strideY); // img.pix_ptr(xp, y);
ref byte dst_ptr = ref Unsafe.Add(ref sourcePtr, y * strideY); // img.pix_ptr(0, y);
for (int x = 0; x <= areaMaxX; x++)
{
var blurredPixel = Vector128.ShiftRightLogical(Vector128.Multiply(sum, mul_sum), shr_sum);
// Write all four bytes of the blurred pixel, then step to the next pixel.
dst_ptr = (byte)blurredPixel[0];
Unsafe.Add(ref dst_ptr, 1) = (byte)blurredPixel[1];
Unsafe.Add(ref dst_ptr, 2) = (byte)blurredPixel[2];
Unsafe.Add(ref dst_ptr, 3) = (byte)blurredPixel[3];
dst_ptr = ref Unsafe.Add(ref dst_ptr, strideX);
// Same sliding-window update as the AVX2 path, for a single vector.
sum -= sum_out;
int stack_start = sp + (int)radius + 1;
if (stack_start >= stack.Length) stack_start -= stack.Length;
ref var stack_ptr = ref stack[stack_start];
sum_out -= stack_ptr;
if (xp < areaMaxX)
{
src_ptr = ref Unsafe.Add(ref src_ptr, strideX);
FillVectorFromPixel(ref data, ref src_ptr);
++xp;
}
stack_ptr = data;
sum_in += data;
sum += sum_in;
++sp;
if (sp >= stack.Length) sp = 0;
stack_ptr = ref stack[sp];
sum_out += stack_ptr;
sum_in -= stack_ptr;
}
}
}
}
/// Returns 0 + 1 + ... + n via the closed form n * (n + 1) / 2 instead of a loop.
static int SumNumbers(int n) => (n + 1) * n / 2;
/// Blurs the whole image in place using the Stackblur algorithm by Mario Klingemann.
/// Algorithm details:
/// http://www.quasimondo.com/StackBlurForCanvas/StackBlurDemo.html
/// Based on these C++ implementations:
/// https://gist.github.com/benjamin9999/3809142
/// http://www.antigrain.com/__code/include/agg_blur.h.html
/// This version works only with RGBA color.
///
/// src       - input image data, modified in place
/// w         - image width
/// h         - image height
/// rowStride - forwarded unchanged to the area overload
/// radius    - blur intensity (should be in 2..254 range)
static void StackBlur(Span<byte> src, int w, int h, int rowStride, uint radius)
{
    // The full image is simply the area starting at (0, 0) with size w x h.
    Vector128<int> wholeImage = Vector128.Create(0, 0, w, h);
    StackBlur(src, w, h, rowStride, radius, wholeImage);
}
/// Blurs the rectangle `area` of the image in place by running a horizontal
/// StackblurPass followed by a vertical one.
///
/// src       - input image data, modified in place
/// w         - image width
/// h         - image height
/// rowStride - forwarded unchanged to StackblurPass
/// radius    - blur intensity (should be in 2..254 range); any value outside
///             1..254 makes this call return without touching the image
/// area      - rectangle to blur, forwarded unchanged to StackblurPass
static void StackBlur(Span<byte> src, int w, int h, int rowStride, uint radius, Vector128<int> area)
{
    bool radiusAccepted = radius >= 1 && radius <= 254;
    if (!radiusAccepted)
    {
        return;
    }

    // Horizontal pass first, then vertical, both over the same area.
    StackblurPass(src, w, h, rowStride, radius, vertical: false, area);
    StackblurPass(src, w, h, rowStride, radius, vertical: true, area);
}
}
}