Skip to content

Commit 3c4319f

Browse files
fix: optimize regex character class with ranges for runtime code
1 parent d571f3f commit 3c4319f

File tree

3 files changed

+116
-10
lines changed

3 files changed

+116
-10
lines changed

.changeset/tender-jars-cheer.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"webpack": patch
3+
---
4+
5+
Optimizing the regular expression character class by specifying ranges for runtime code.

lib/util/compileBooleanMatcher.js

Lines changed: 78 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,79 @@
1111
*/
1212
const quoteMeta = (str) => str.replace(/[-[\]\\/{}()*+?.^$|]/g, "\\$&");
1313

14+
/**
15+
* @param {string} char character to escape for use in character class
16+
* @returns {string} escaped character
17+
*/
18+
const quoteMetaInCharClass = (char) => {
19+
// In character class, only these need escaping: ] \ ^ -
20+
if (char === "]" || char === "\\" || char === "^" || char === "-") {
21+
return `\\${char}`;
22+
}
23+
return char;
24+
};
25+
26+
/**
27+
* Converts an array of single characters into an optimized character class string
28+
* using ranges where possible. E.g., ["1","2","3","4","a"] => "1-4a"
29+
* @param {string[]} chars array of single characters (should be sorted)
30+
* @returns {string} optimized character class content (without the brackets)
31+
*/
32+
const charsToCharClassContent = (chars) => {
33+
if (chars.length === 0) return "";
34+
if (chars.length === 1) return quoteMetaInCharClass(chars[0]);
35+
36+
// Sort by char code
37+
const sorted = [...chars].sort((a, b) => a.charCodeAt(0) - b.charCodeAt(0));
38+
39+
/** @type {string[]} */
40+
const parts = [];
41+
let rangeStart = sorted[0];
42+
let rangeEnd = sorted[0];
43+
44+
for (let i = 1; i < sorted.length; i++) {
45+
const char = sorted[i];
46+
const prevCode = rangeEnd.charCodeAt(0);
47+
const currCode = char.charCodeAt(0);
48+
49+
if (currCode === prevCode + 1) {
50+
// Extend the range
51+
rangeEnd = char;
52+
} else {
53+
// Flush the current range
54+
parts.push(formatRange(rangeStart, rangeEnd));
55+
rangeStart = char;
56+
rangeEnd = char;
57+
}
58+
}
59+
// Flush the last range
60+
parts.push(formatRange(rangeStart, rangeEnd));
61+
62+
return parts.join("");
63+
};
64+
65+
/**
66+
* Formats a range of characters for use in a character class
67+
* @param {string} start start character
68+
* @param {string} end end character
69+
* @returns {string} formatted range
70+
*/
71+
const formatRange = (start, end) => {
72+
const startCode = start.charCodeAt(0);
73+
const endCode = end.charCodeAt(0);
74+
const length = endCode - startCode + 1;
75+
76+
if (length === 1) {
77+
return quoteMetaInCharClass(start);
78+
}
79+
if (length === 2) {
80+
// For 2 chars, just list them (e.g., "ab" instead of "a-b")
81+
return quoteMetaInCharClass(start) + quoteMetaInCharClass(end);
82+
}
83+
// For 3+ chars, use range notation
84+
return `${quoteMetaInCharClass(start)}-${quoteMetaInCharClass(end)}`;
85+
};
86+
1487
/**
1588
* @param {string} str string
1689
* @returns {string} string
@@ -148,19 +221,20 @@ const itemsToRegexp = (itemsArr) => {
148221
}
149222
// special case for only single char items
150223
if (countOfSingleCharItems === itemsArr.length) {
151-
return `[${quoteMeta(itemsArr.sort().join(""))}]`;
224+
return `[${charsToCharClassContent(itemsArr)}]`;
152225
}
153226
/** @type {Set<string>} */
154227
const items = new Set(itemsArr.sort());
155228
if (countOfSingleCharItems > 2) {
156-
let singleCharItems = "";
229+
/** @type {string[]} */
230+
const singleCharItems = [];
157231
for (const item of items) {
158232
if (item.length === 1) {
159-
singleCharItems += item;
233+
singleCharItems.push(item);
160234
items.delete(item);
161235
}
162236
}
163-
finishedItems.push(`[${quoteMeta(singleCharItems)}]`);
237+
finishedItems.push(`[${charsToCharClassContent(singleCharItems)}]`);
164238
}
165239

166240
// special case for 2 items with common prefix/suffix
@@ -227,8 +301,6 @@ const itemsToRegexp = (itemsArr) => {
227301
);
228302
}
229303

230-
// TODO further optimize regexp, i. e.
231-
// use ranges: (1|2|3|4|a) => [1-4a]
232304
/** @type {string[]} */
233305
const conditional = [...finishedItems, ...Array.from(items, quoteMeta)];
234306
if (conditional.length === 1) return conditional[0];

test/compileBooleanMatcher.unittest.js

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,42 @@ describe("itemsToRegexp", () => {
2929
);
3030

3131
expectCompiled("single chars", ["a", "b", "c", "1", "2", "3"], (e) =>
32-
e.toMatchInlineSnapshot("[123abc]")
32+
e.toMatchInlineSnapshot("[1-3a-c]")
33+
);
34+
35+
expectCompiled("single chars with ranges", ["1", "2", "3", "4", "a"], (e) =>
36+
e.toMatchInlineSnapshot("[1-4a]")
37+
);
38+
39+
expectCompiled(
40+
"single chars consecutive range",
41+
["a", "b", "c", "d", "e"],
42+
(e) => e.toMatchInlineSnapshot("[a-e]")
43+
);
44+
45+
expectCompiled(
46+
"single chars multiple ranges",
47+
["1", "2", "3", "a", "b", "c", "x", "y", "z"],
48+
(e) => e.toMatchInlineSnapshot("[1-3a-cx-z]")
49+
);
50+
51+
expectCompiled(
52+
"single chars with gaps",
53+
["a", "c", "e", "g"],
54+
// cspell:ignore aceg
55+
(e) => e.toMatchInlineSnapshot("[aceg]")
56+
);
57+
58+
expectCompiled(
59+
"single chars mixed ranges and singles",
60+
["1", "2", "3", "5", "7", "8", "9"],
61+
(e) => e.toMatchInlineSnapshot("[1-357-9]")
3362
);
3463

3564
expectCompiled(
3665
"prefixes",
3766
["ab1", "ab2", "ab3", "ab4", "de5", "de6", "de7", "ef8", "ef9", "gh0"],
38-
(e) => e.toMatchInlineSnapshot("(ab[1234]|de[567]|ef[89]|gh0)")
67+
(e) => e.toMatchInlineSnapshot("(ab[1-4]|de[5-7]|ef[89]|gh0)")
3968
);
4069

4170
expectCompiled("short prefixes", "a,ab", (e) =>
@@ -49,15 +78,15 @@ describe("itemsToRegexp", () => {
4978
);
5079

5180
expectCompiled("suffixes", "a1,b1,c1,d1,e1,a2,b2,c2", (e) =>
52-
e.toMatchInlineSnapshot("([abcde]1|[abc]2)")
81+
e.toMatchInlineSnapshot("([a-e]1|[a-c]2)")
5382
);
5483

5584
expectCompiled(
5685
"common prod",
5786
"674,542,965,12,942,483,445,943,423,995,434,122,995,248,432,165,436,86,435,221",
5887
(e) =>
5988
e.toMatchInlineSnapshot(
60-
"(1(2|22|65)|4(3[2456]|23|45|83)|9(42|43|65|95)|221|248|542|674|86)"
89+
"(1(2|22|65)|4(3[24-6]|23|45|83)|9(42|43|65|95)|221|248|542|674|86)"
6190
)
6291
);
6392

0 commit comments

Comments
 (0)