Skip to content

Commit 75612fc

Browse files
danmoseleyCopilot
andcommitted
Experiment: defer reduction from parser to FinalOptimize
Parser calls AddChildMinimal (ReduceMinimal: Group unwrap, 0/1-child Concatenation/Alternation unwrap, IgnoreCase strip) instead of full Reduce. Full reduction happens via FinalReduce before and after the optimization passes in FinalOptimize. All tests pass but this is 4ms slower than reduce-during-parse (66ms vs 62ms for 15,817 patterns in Release). The parse phase saves ~7ms by skipping full Reduce, but the added pre-optimization FinalReduce tree walk costs ~12ms. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 0281f05 commit 75612fc

File tree

4 files changed

+95018
-13
lines changed

4 files changed

+95018
-13
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,11 @@ internal RegexNode FinalOptimize()
373373
Debug.Assert(rootNode.Parent is null);
374374
Debug.Assert(rootNode.ChildCount() == 1);
375375

376+
// Reduce the full tree. With deferred reduction (parser uses AddChildMinimal),
377+
// nodes haven't been fully reduced during parsing, so a full reduction pass is needed
378+
// before the optimization passes can operate correctly.
379+
rootNode.FinalReduce();
380+
376381
// Only apply optimization when LTR to avoid needing additional code for the much rarer RTL case.
377382
// Also only apply these optimizations when not using NonBacktracking, as these optimizations are
378383
// all about avoiding things that are impactful for the backtracking engines but nops for non-backtracking.
@@ -629,6 +634,31 @@ internal RegexNode Reduce()
629634
};
630635
}
631636

637+
/// <summary>
638+
/// Minimal reduction for use during parsing when full reduction is deferred to FinalOptimize.
639+
/// Only strips IgnoreCase, unwraps Group nodes, and simplifies 0/1-child Concatenation/Alternation.
640+
/// </summary>
641+
private RegexNode ReduceMinimal()
642+
{
643+
// Remove IgnoreCase option from everything except a Backreference
644+
switch (Kind)
645+
{
646+
default:
647+
Options &= ~RegexOptions.IgnoreCase;
648+
break;
649+
650+
case RegexNodeKind.Backreference:
651+
break;
652+
}
653+
654+
return Kind switch
655+
{
656+
RegexNodeKind.Group => ReduceGroup(),
657+
RegexNodeKind.Alternate or RegexNodeKind.Concatenate => ReplaceNodeIfUnnecessary(),
658+
_ => this,
659+
};
660+
}
661+
632662
/// <summary>Remove an unnecessary Concatenation or Alternation node</summary>
633663
/// <remarks>
634664
/// Simple optimization for a concatenation or alternation:
@@ -3190,6 +3220,30 @@ public void AddChild(RegexNode newChild)
31903220
}
31913221
}
31923222

3223+
/// <summary>
3224+
/// Adds a child with only minimal reduction (Group unwrap, 0/1-child Concatenation/Alternation
3225+
/// unwrap, and IgnoreCase strip). Used by the parser to defer full reduction to FinalOptimize.
3226+
/// </summary>
3227+
internal void AddChildMinimal(RegexNode newChild)
3228+
{
3229+
newChild.Parent = this;
3230+
newChild = newChild.ReduceMinimal();
3231+
newChild.Parent = this;
3232+
3233+
if (Children is null)
3234+
{
3235+
Children = newChild;
3236+
}
3237+
else if (Children is RegexNode currentChild)
3238+
{
3239+
Children = new List<RegexNode>() { currentChild, newChild };
3240+
}
3241+
else
3242+
{
3243+
((List<RegexNode>)Children).Add(newChild);
3244+
}
3245+
}
3246+
31933247
public void InsertChild(int index, RegexNode newChild)
31943248
{
31953249
Debug.Assert(Children is List<RegexNode>);

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -426,7 +426,7 @@ private RegexNode ScanRegex()
426426

427427
if (_pos == _pattern.Length || !(isQuantifier = IsTrueQuantifier()))
428428
{
429-
_concatenation!.AddChild(_unit!);
429+
_concatenation!.AddChildMinimal(_unit!);
430430
_unit = null;
431431
goto ContinueOuterScan;
432432
}
@@ -467,7 +467,7 @@ private RegexNode ScanRegex()
467467

468468
if (startpos == _pos || _pos == _pattern.Length || _pattern[_pos++] != '}')
469469
{
470-
_concatenation!.AddChild(_unit!);
470+
_concatenation!.AddChildMinimal(_unit!);
471471
_unit = null;
472472
_pos = startpos - 1;
473473
goto ContinueOuterScan;
@@ -494,7 +494,7 @@ private RegexNode ScanRegex()
494494
throw MakeException(RegexParseError.ReversedQuantifierRange, SR.ReversedQuantifierRange);
495495
}
496496

497-
_concatenation!.AddChild(_unit!.MakeQuantifier(lazy, min, max));
497+
_concatenation!.AddChildMinimal(_unit!.MakeQuantifier(lazy, min, max));
498498
_unit = null;
499499
}
500500

@@ -535,7 +535,7 @@ private RegexNode ScanReplacement()
535535
if (_pos < _pattern.Length)
536536
{
537537
_pos++;
538-
_concatenation.AddChild(ScanDollar());
538+
_concatenation.AddChildMinimal(ScanDollar());
539539
_unit = null;
540540
}
541541
}
@@ -2044,17 +2044,17 @@ private void AddToConcatenate(int pos, int cch, bool isReplacement)
20442044
return;
20452045

20462046
case 1:
2047-
_concatenation!.AddChild(RegexNode.CreateOneWithCaseConversion(_pattern[pos], isReplacement ? _options & ~RegexOptions.IgnoreCase : _options, _culture, ref _caseBehavior));
2047+
_concatenation!.AddChildMinimal(RegexNode.CreateOneWithCaseConversion(_pattern[pos], isReplacement ? _options & ~RegexOptions.IgnoreCase : _options, _culture, ref _caseBehavior));
20482048
break;
20492049

20502050
case > 1 when (_options & RegexOptions.IgnoreCase) == 0 || isReplacement || !RegexCharClass.ParticipatesInCaseConversion(_pattern.AsSpan(pos, cch)):
2051-
_concatenation!.AddChild(new RegexNode(RegexNodeKind.Multi, _options & ~RegexOptions.IgnoreCase, _pattern.Substring(pos, cch)));
2051+
_concatenation!.AddChildMinimal(new RegexNode(RegexNodeKind.Multi, _options & ~RegexOptions.IgnoreCase, _pattern.Substring(pos, cch)));
20522052
break;
20532053

20542054
default:
20552055
foreach (char c in _pattern.AsSpan(pos, cch))
20562056
{
2057-
_concatenation!.AddChild(RegexNode.CreateOneWithCaseConversion(c, _options, _culture, ref _caseBehavior));
2057+
_concatenation!.AddChildMinimal(RegexNode.CreateOneWithCaseConversion(c, _options, _culture, ref _caseBehavior));
20582058
}
20592059
break;
20602060
}
@@ -2085,7 +2085,7 @@ private void PopGroup()
20852085
throw MakeException(RegexParseError.AlternationHasMalformedCondition, SR.AlternationHasMalformedCondition);
20862086
}
20872087

2088-
_group.AddChild(_unit);
2088+
_group.AddChildMinimal(_unit);
20892089
_unit = null;
20902090
}
20912091
}
@@ -2105,11 +2105,11 @@ private void AddAlternate()
21052105

21062106
if (_group!.Kind is RegexNodeKind.ExpressionConditional or RegexNodeKind.BackreferenceConditional)
21072107
{
2108-
_group.AddChild(_concatenation!.ReverseConcatenationIfRightToLeft());
2108+
_group.AddChildMinimal(_concatenation!.ReverseConcatenationIfRightToLeft());
21092109
}
21102110
else
21112111
{
2112-
_alternation!.AddChild(_concatenation!.ReverseConcatenationIfRightToLeft());
2112+
_alternation!.AddChildMinimal(_concatenation!.ReverseConcatenationIfRightToLeft());
21132113
}
21142114

21152115
_concatenation = new RegexNode(RegexNodeKind.Concatenate, _options);
@@ -2120,7 +2120,7 @@ private void AddGroup()
21202120
{
21212121
if (_group!.Kind is RegexNodeKind.ExpressionConditional or RegexNodeKind.BackreferenceConditional)
21222122
{
2123-
_group.AddChild(_concatenation!.ReverseConcatenationIfRightToLeft());
2123+
_group.AddChildMinimal(_concatenation!.ReverseConcatenationIfRightToLeft());
21242124

21252125
if (_group.Kind == RegexNodeKind.BackreferenceConditional && _group.ChildCount() > 2 || _group.ChildCount() > 3)
21262126
{
@@ -2129,8 +2129,8 @@ private void AddGroup()
21292129
}
21302130
else
21312131
{
2132-
_alternation!.AddChild(_concatenation!.ReverseConcatenationIfRightToLeft());
2133-
_group.AddChild(_alternation);
2132+
_alternation!.AddChildMinimal(_concatenation!.ReverseConcatenationIfRightToLeft());
2133+
_group.AddChildMinimal(_alternation);
21342134
}
21352135

21362136
_unit = _group;
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
// Perf measurement: time regex parse/optimize across all real-world patterns
2+
// Temporary - not to be committed to the PR
3+
4+
using System.Collections.Generic;
5+
using System.Diagnostics;
6+
using System.Globalization;
7+
using System.IO;
8+
using System.Text.Json;
9+
using System.Text.RegularExpressions;
10+
using Xunit;
11+
using Xunit.Abstractions;
12+
13+
namespace System.Text.RegularExpressions.Tests
14+
{
15+
public class RegexPerfMeasurement
16+
{
17+
private readonly ITestOutputHelper _output;
18+
19+
public RegexPerfMeasurement(ITestOutputHelper output) => _output = output;
20+
21+
private string[] LoadPatterns()
22+
{
23+
string jsonPath = Path.Combine(
24+
Path.GetDirectoryName(typeof(RegexPerfMeasurement).Assembly.Location)!,
25+
"Regex_RealWorldPatterns.json");
26+
27+
if (!File.Exists(jsonPath))
28+
{
29+
// Fallback: look in the source tree
30+
jsonPath = @"C:\git\runtime\src\libraries\System.Text.RegularExpressions\tests\UnitTests\Regex_RealWorldPatterns.json";
31+
}
32+
33+
string text = File.ReadAllText(jsonPath);
34+
int arrayStart = text.IndexOf('[');
35+
string json = text.Substring(arrayStart);
36+
37+
using var doc = JsonDocument.Parse(json);
38+
var set = new HashSet<string>();
39+
foreach (var elem in doc.RootElement.EnumerateArray())
40+
{
41+
set.Add(elem.GetProperty("Pattern").GetString()!);
42+
}
43+
44+
string[] patterns = new string[set.Count];
45+
set.CopyTo(patterns);
46+
return patterns;
47+
}
48+
49+
[Fact]
50+
public void MeasureParseTime()
51+
{
52+
string[] patterns = LoadPatterns();
53+
_output.WriteLine($"Loaded {patterns.Length} unique patterns");
54+
55+
// Warmup (5 rounds)
56+
for (int w = 0; w < 5; w++)
57+
{
58+
foreach (string p in patterns)
59+
{
60+
try { RegexParser.Parse(p, RegexOptions.None, CultureInfo.InvariantCulture); } catch { }
61+
}
62+
}
63+
64+
var times = new List<long>();
65+
for (int iter = 0; iter < 7; iter++)
66+
{
67+
var sw = Stopwatch.StartNew();
68+
foreach (string p in patterns)
69+
{
70+
try { RegexParser.Parse(p, RegexOptions.None, CultureInfo.InvariantCulture); } catch { }
71+
}
72+
sw.Stop();
73+
times.Add(sw.ElapsedMilliseconds);
74+
_output.WriteLine($"Iteration {iter + 1}: {sw.ElapsedMilliseconds} ms ({(double)sw.ElapsedMilliseconds / patterns.Length:F4} ms/pattern)");
75+
}
76+
77+
times.Sort();
78+
long median = times[times.Count / 2];
79+
_output.WriteLine($"\nMedian: {median} ms ({(double)median / patterns.Length:F4} ms/pattern) over {patterns.Length} patterns");
80+
}
81+
82+
[Fact]
83+
public void ProfilePhases()
84+
{
85+
string[] patterns = LoadPatterns();
86+
_output.WriteLine($"Loaded {patterns.Length} unique patterns");
87+
88+
// Warmup
89+
for (int w = 0; w < 5; w++)
90+
{
91+
foreach (string p in patterns)
92+
{
93+
try { RegexParser.Parse(p, RegexOptions.None, CultureInfo.InvariantCulture); } catch { }
94+
}
95+
}
96+
97+
// Reset counters
98+
RegexNode.s_findLoopsAtomicTicks = 0;
99+
RegexNode.s_eliminateEndingTicks = 0;
100+
RegexNode.s_finalReduceTicks = 0;
101+
RegexNode.s_preReduceTicks = 0;
102+
103+
var sw = Stopwatch.StartNew();
104+
foreach (string p in patterns)
105+
{
106+
try { RegexParser.Parse(p, RegexOptions.None, CultureInfo.InvariantCulture); } catch { }
107+
}
108+
sw.Stop();
109+
110+
double freq = Stopwatch.Frequency;
111+
double totalMs = sw.ElapsedTicks / freq * 1000;
112+
double findLoopsMs = RegexNode.s_findLoopsAtomicTicks / freq * 1000;
113+
double elimEndMs = RegexNode.s_eliminateEndingTicks / freq * 1000;
114+
double finalReduceMs = RegexNode.s_finalReduceTicks / freq * 1000;
115+
double preReduceMs = RegexNode.s_preReduceTicks / freq * 1000;
116+
double otherMs = totalMs - findLoopsMs - elimEndMs - finalReduceMs - preReduceMs;
117+
118+
_output.WriteLine($"Total parse time: {totalMs:F1} ms");
119+
_output.WriteLine($" PreOptimize FinalReduce: {preReduceMs:F1} ms ({preReduceMs / totalMs * 100:F1}%)");
120+
_output.WriteLine($" FindAndMakeLoopsAtomic: {findLoopsMs:F1} ms ({findLoopsMs / totalMs * 100:F1}%)");
121+
_output.WriteLine($" EliminateEndingBacktracking: {elimEndMs:F1} ms ({elimEndMs / totalMs * 100:F1}%)");
122+
_output.WriteLine($" PostOptimize FinalReduce: {finalReduceMs:F1} ms ({finalReduceMs / totalMs * 100:F1}%)");
123+
_output.WriteLine($" Other (parse + ReduceMinimal): {otherMs:F1} ms ({otherMs / totalMs * 100:F1}%)");
124+
}
125+
}
126+
}

0 commit comments

Comments
 (0)