Optimize IndexOfAnyAsciiSearcher on Arm64 #126678
Conversation
|
@EgorBot -arm -linux_arm using System.Buffers;
using BenchmarkDotNet.Attributes;
public class Benchmarks
{
// Search set: every byte from 0x00 through 0x1F, plus the double quote (") and the backslash (\).
private static readonly SearchValues<byte> s_controlQuoteBackslash = SearchValues.Create(
"\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000A\u000B"u8 +
"\u000C\u000D\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018"u8 +
"\u0019\u001A\u001B\u001C\u001D\u001E\u001F"u8 + "\""u8 + "\\"u8);
// Haystack: plain text whose only byte from the search set is the trailing double quote.
private byte[] _str = "Product description with some text that is a bit longer than usual\""u8.ToArray();
// Measured operation: index of the first byte of _str contained in the search set
// (for this input, the final byte).
[Benchmark]
public int Medium() => _str.AsSpan().IndexOfAny(s_controlQuoteBackslash);
} |
|
@EgorBot -linux_azure_arm -arm using System.Text;
using System.Text.Json;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
BenchmarkSwitcher.FromAssembly(typeof(Benchmarks).Assembly).Run(args);
[MemoryDiagnoser]
public class Benchmarks
{
// ── TokenSerialization fields ────────────────────────────────────────────
private List<object> _tokenObjects;
[ThreadStatic] static Utf8JsonWriter t_writer;
[ThreadStatic] static MemoryStream t_stream;
/// <summary>
/// Populates <c>_tokenObjects</c> with 200 tokens: every third entry (index 0, 3, 6, ...)
/// is a JSON string produced by <c>GenerateRecordJson(1)</c>; every other entry is a small
/// dictionary holding a sequence number, a label and a 100-byte zeroed blob.
/// </summary>
[GlobalSetup]
public void Setup()
{
    const int TokenCount = 200;
    var tokens = new List<object>(TokenCount);

    for (int index = 0; index < TokenCount; index++)
    {
        object token;
        if (index % 3 != 0)
        {
            token = new Dictionary<string, object>
            {
                ["seq"] = index,
                ["label"] = $"item_{index}",
                ["blob"] = new byte[100]
            };
        }
        else
        {
            token = GenerateRecordJson(1);
        }

        tokens.Add(token);
    }

    // TokenSerialization
    _tokenObjects = tokens;
}
/// <summary>
/// Builds the JSON text of a "product" record whose <c>Fields</c> array is filled with
/// synthetic field entries until the text is within 512 characters of the requested size.
/// </summary>
/// <param name="targetSizeKb">Target size, in units of 1024 characters.</param>
/// <returns>The generated JSON object as a string.</returns>
private static string GenerateRecordJson(int targetSizeKb = 150)
{
    int targetBytes = targetSizeKb * 1024;
    var json = new StringBuilder(targetBytes + 512);

    // Fixed record header, ending with the opening of the "Fields" array.
    json.Append('{')
        .Append("\"TypeName\":\"product\",")
        .Append("\"CategoryCode\":1,")
        .Append("\"Label\":\"Product\",")
        .Append("\"IsAction\":false,")
        .Append("\"IsActionMember\":false,")
        .Append("\"IsTrackingEnabled\":true,")
        .Append("\"IsAvailableLocal\":true,")
        .Append("\"IsChildRecord\":false,")
        .Append("\"IsLinksEnabled\":true,")
        .Append("\"IsCustomRecord\":false,")
        .Append("\"PrimaryKeyField\":\"productid\",")
        .Append("\"PrimaryLabelField\":\"title\",")
        .Append("\"Fields\":[");

    // Keep adding field objects until the text reaches the fill limit.
    int fillLimit = targetBytes - 512;
    for (int fieldIndex = 0; json.Length < fillLimit; fieldIndex++)
    {
        // Every field after the first is preceded by a separator.
        if (fieldIndex > 0)
        {
            json.Append(',');
        }

        json.Append('{');
        json.Append($"\"TypeName\":\"field_{fieldIndex}\",");
        json.Append($"\"InternalName\":\"Field_{fieldIndex}\",");
        json.Append("\"FieldType\":\"String\",");
        json.Append($"\"Label\":\"Field {fieldIndex}\",");
        json.Append("\"MaxSize\":100,")
            .Append("\"IsReadable\":true,")
            .Append("\"IsCreatable\":true,")
            .Append("\"IsUpdatable\":true,")
            .Append("\"IsTrackingEnabled\":false,")
            .Append("\"IsPrimaryKey\":false,")
            .Append("\"IsVirtual\":false,")
            .Append("\"Requirement\":\"None\"")
            .Append('}');
    }

    // Close the "Fields" array and the record object.
    json.Append(']').Append('}');
    return json.ToString();
}
// Measured operation: writes every entry of _tokenObjects into a JSON document shaped like
// {"Catalog":[ ... ]}, reusing the thread-static stream and writer between invocations.
[Benchmark]
public void TokenSerialization()
{
// Create the cached stream on first use, then rewind and truncate it so this run starts empty.
var stream = t_stream ??= new MemoryStream(64 * 1024);
stream.Position = 0;
stream.SetLength(0);
// Create the cached writer on first use (validation skipped); otherwise reset it onto the stream.
var writer = t_writer;
if (writer == null)
{
writer = new Utf8JsonWriter(stream, new JsonWriterOptions { SkipValidation = true });
t_writer = writer;
}
else
writer.Reset(stream);
writer.WriteStartObject();
writer.WriteStartArray("Catalog");
foreach (var token in _tokenObjects)
{
// String tokens are written as raw JSON values; empty or null strings are skipped.
if (token is string strToken)
{
if (!string.IsNullOrEmpty(strToken))
writer.WriteRawValue(strToken);
}
// Dictionary tokens become a JSON object: one property per entry, value via JsonSerializer.
else if (token is Dictionary<string, object> dictToken)
{
writer.WriteStartObject();
foreach (var kvp in dictToken)
{
writer.WritePropertyName(kvp.Key);
JsonSerializer.Serialize(writer, kvp.Value);
}
writer.WriteEndObject();
}
}
writer.WriteEndArray();
writer.WriteEndObject();
writer.Flush();
// Sanity check that reads the result: fails if nothing reached the stream after the flush.
if (stream.Length == 0) throw new Exception("unreachable");
}
} |
|
Tagging subscribers to this area: @dotnet/area-system-buffers |
There was a problem hiding this comment.
Pull request overview
This PR adds an Arm64/AdvSimd-specific fast-path for computing the first match index from a Vector128<byte> match result in IndexOfAnyAsciiSearcher, aiming to reduce overhead in IndexOfAny-style searches.
Changes:
- Extended `INegator` with `IndexOfFirstMatch(Vector128<byte> result)` to centralize “find first match” logic.
- Implemented an AdvSimd-based first-match index computation for both `DontNegate` and `Negate`.
- Updated `IndexOfAnyResultMapper`’s `Vector128` code paths to use the new `IndexOfFirstMatch` helper.
src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs
Outdated
Show resolved
Hide resolved
src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs
Outdated
Show resolved
Hide resolved
|
@EgorBot -aws_arm -profiler using System.Buffers;
using BenchmarkDotNet.Attributes;
public class Benchmarks
{
// Search set: every byte from 0x00 through 0x1F, plus the double quote (") and the backslash (\).
private static readonly SearchValues<byte> s_controlQuoteBackslash = SearchValues.Create(
"\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000A\u000B"u8 +
"\u000C\u000D\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018"u8 +
"\u0019\u001A\u001B\u001C\u001D\u001E\u001F"u8 + "\""u8 + "\\"u8);
// Haystack: plain text whose only byte from the search set is the trailing double quote.
private byte[] _str = "Product description with some text that is a bit longer than usual\""u8.ToArray();
// Measured operation: index of the first byte of _str contained in the search set
// (for this input, the final byte).
[Benchmark]
public int Medium() => _str.AsSpan().IndexOfAny(s_controlQuoteBackslash);
} |
|
@EgorBot -aws_amd -profiler using System.Text;
using System.Text.Json;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
BenchmarkSwitcher.FromAssembly(typeof(Benchmarks).Assembly).Run(args);
[MemoryDiagnoser]
public class Benchmarks
{
// ── TokenSerialization fields ────────────────────────────────────────────
private List<object> _tokenObjects;
[ThreadStatic] static Utf8JsonWriter t_writer;
[ThreadStatic] static MemoryStream t_stream;
/// <summary>
/// Populates <c>_tokenObjects</c> with 200 tokens: every third entry (index 0, 3, 6, ...)
/// is a JSON string produced by <c>GenerateRecordJson(1)</c>; every other entry is a small
/// dictionary holding a sequence number, a label and a 100-byte zeroed blob.
/// </summary>
[GlobalSetup]
public void Setup()
{
    const int TokenCount = 200;
    var tokens = new List<object>(TokenCount);

    for (int index = 0; index < TokenCount; index++)
    {
        object token;
        if (index % 3 != 0)
        {
            token = new Dictionary<string, object>
            {
                ["seq"] = index,
                ["label"] = $"item_{index}",
                ["blob"] = new byte[100]
            };
        }
        else
        {
            token = GenerateRecordJson(1);
        }

        tokens.Add(token);
    }

    // TokenSerialization
    _tokenObjects = tokens;
}
/// <summary>
/// Builds the JSON text of a "product" record whose <c>Fields</c> array is filled with
/// synthetic field entries until the text is within 512 characters of the requested size.
/// </summary>
/// <param name="targetSizeKb">Target size, in units of 1024 characters.</param>
/// <returns>The generated JSON object as a string.</returns>
private static string GenerateRecordJson(int targetSizeKb = 150)
{
    int targetBytes = targetSizeKb * 1024;
    var json = new StringBuilder(targetBytes + 512);

    // Fixed record header, ending with the opening of the "Fields" array.
    json.Append('{')
        .Append("\"TypeName\":\"product\",")
        .Append("\"CategoryCode\":1,")
        .Append("\"Label\":\"Product\",")
        .Append("\"IsAction\":false,")
        .Append("\"IsActionMember\":false,")
        .Append("\"IsTrackingEnabled\":true,")
        .Append("\"IsAvailableLocal\":true,")
        .Append("\"IsChildRecord\":false,")
        .Append("\"IsLinksEnabled\":true,")
        .Append("\"IsCustomRecord\":false,")
        .Append("\"PrimaryKeyField\":\"productid\",")
        .Append("\"PrimaryLabelField\":\"title\",")
        .Append("\"Fields\":[");

    // Keep adding field objects until the text reaches the fill limit.
    int fillLimit = targetBytes - 512;
    for (int fieldIndex = 0; json.Length < fillLimit; fieldIndex++)
    {
        // Every field after the first is preceded by a separator.
        if (fieldIndex > 0)
        {
            json.Append(',');
        }

        json.Append('{');
        json.Append($"\"TypeName\":\"field_{fieldIndex}\",");
        json.Append($"\"InternalName\":\"Field_{fieldIndex}\",");
        json.Append("\"FieldType\":\"String\",");
        json.Append($"\"Label\":\"Field {fieldIndex}\",");
        json.Append("\"MaxSize\":100,")
            .Append("\"IsReadable\":true,")
            .Append("\"IsCreatable\":true,")
            .Append("\"IsUpdatable\":true,")
            .Append("\"IsTrackingEnabled\":false,")
            .Append("\"IsPrimaryKey\":false,")
            .Append("\"IsVirtual\":false,")
            .Append("\"Requirement\":\"None\"")
            .Append('}');
    }

    // Close the "Fields" array and the record object.
    json.Append(']').Append('}');
    return json.ToString();
}
// Measured operation: writes every entry of _tokenObjects into a JSON document shaped like
// {"Catalog":[ ... ]}, reusing the thread-static stream and writer between invocations.
[Benchmark]
public void TokenSerialization()
{
// Create the cached stream on first use, then rewind and truncate it so this run starts empty.
var stream = t_stream ??= new MemoryStream(64 * 1024);
stream.Position = 0;
stream.SetLength(0);
// Create the cached writer on first use (validation skipped); otherwise reset it onto the stream.
var writer = t_writer;
if (writer == null)
{
writer = new Utf8JsonWriter(stream, new JsonWriterOptions { SkipValidation = true });
t_writer = writer;
}
else
writer.Reset(stream);
writer.WriteStartObject();
writer.WriteStartArray("Catalog");
foreach (var token in _tokenObjects)
{
// String tokens are written as raw JSON values; empty or null strings are skipped.
if (token is string strToken)
{
if (!string.IsNullOrEmpty(strToken))
writer.WriteRawValue(strToken);
}
// Dictionary tokens become a JSON object: one property per entry, value via JsonSerializer.
else if (token is Dictionary<string, object> dictToken)
{
writer.WriteStartObject();
foreach (var kvp in dictToken)
{
writer.WritePropertyName(kvp.Key);
JsonSerializer.Serialize(writer, kvp.Value);
}
writer.WriteEndObject();
}
}
writer.WriteEndArray();
writer.WriteEndObject();
writer.Flush();
// Sanity check that reads the result: fails if nothing reached the stream after the flush.
if (stream.Length == 0) throw new Exception("unreachable");
}
} |
9dfb601 to
8d992f6
Compare
8d992f6 to
73a99fa
Compare
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs
Outdated
Show resolved
Hide resolved
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs
Outdated
Show resolved
Hide resolved
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs
Show resolved
Hide resolved
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs
Show resolved
Hide resolved
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs
Outdated
Show resolved
Hide resolved
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs
Outdated
Show resolved
Hide resolved
73a99fa to
e150405
Compare
…ndexOf vector APIs
e150405 to
21503a6
Compare
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs
Outdated
Show resolved
Hide resolved
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs
Outdated
Show resolved
Hide resolved
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs
Outdated
Show resolved
Hide resolved
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs
Outdated
Show resolved
Hide resolved
|
CC. @EgorBo Faster on Arm64 across the board. On x64, the pattern can sometimes change codegen to use EVEX kmask registers instead and so perf can be slightly slower or faster depending on hardware. It'd be something to handle separately and avoid kmask creation for V128/256 in this scenario, if we care. |
There was a problem hiding this comment.
I wonder if we could also optimize IndexOfWhereAllBitsSet by just flipping all bits and searching for all zeroes using normal IndexOf. Not fully optimal but better than nothing?
Although, let's see if I can optimize it via #126790
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
b9d28fe to
5474a17
Compare
|
GitHub failed to pick up the applied typo suggestions even though it committed them. I had to force-push to get it to pick them up so the merge was actually possible; there are no other changes. |
This improves the codegen of the dedicated `Count`, `IndexOf`, and `LastIndexOf` APIs on Arm64 and correspondingly updates `IndexOfAnyAsciiSearcher` to consume them instead of `ExtractMostSignificantBits`.
This is notably not strictly the "ideal" codegen, as it still does an extra comparison, but that is something we can address in the JIT, and it should still provide a 5-50% performance increase in workloads using this API as part of their core loop.