Skip to content

Commit 38d2bea

Browse files
committed
fix(parser): fix parsing lone surrogates in StringLiterals (#10180)
Fix 2 edge cases when parsing lone surrogates in `StringLiteral`s:

* Lone surrogate followed by a `\u{...}` escape, e.g. `"\uD800\u{41}"`.
* Escaped lossy replacement character after a lone surrogate, e.g. `"\uD800 \u{FFFD}"`.
1 parent 775abac commit 38d2bea

File tree

4 files changed

+143
-53
lines changed

4 files changed

+143
-53
lines changed

crates/oxc_codegen/tests/integration/unit.rs

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -510,10 +510,16 @@ fn string() {
510510
test_minify("let x = '\\'\\'\\'\"\"\"${}';", "let x=`'''\"\"\"\\${}`;");
511511

512512
// Lossy replacement character
513-
test("let x = \"�\";", "let x = \"�\";\n");
514-
test_minify("let x = \"�\";", "let x=`�`;");
515-
test("let x = \"� ��� �\";", "let x = \"� ��� �\";\n");
516-
test_minify("let x = \"� ��� �\";", "let x=`� ��� �`;");
513+
test("let x = \"�\\u{FFFD}\";", "let x = \"��\";\n");
514+
test_minify("let x = \"�\\u{FFFD}\";", "let x=`��`;");
515+
test(
516+
"let x = \"� ��� \\u{FFFD} \\u{FFFD}\\u{FFFD}\\u{FFFD} �\";",
517+
"let x = \"� ��� � ��� �\";\n",
518+
);
519+
test_minify(
520+
"let x = \"� ��� \\u{FFFD} \\u{FFFD}\\u{FFFD}\\u{FFFD} �\";",
521+
"let x=`� ��� � ��� �`;",
522+
);
517523
// Lone surrogates
518524
test(
519525
"let x = \"\\uD800 \\uDBFF \\uDC00 \\uDFFF\";",
@@ -523,6 +529,8 @@ fn string() {
523529
"let x = \"\\uD800 \\uDBFF \\uDC00 \\uDFFF\";",
524530
"let x=`\\ud800 \\udbff \\udc00 \\udfff`;",
525531
);
532+
test("let x = \"\\uD800\\u{41}\";", "let x = \"\\ud800A\";\n");
533+
test_minify("let x = \"\\uD800\\u{41}\";", "let x=`\\ud800A`;");
526534
// Invalid pairs
527535
test(
528536
"let x = \"\\uD800\\uDBFF \\uDC00\\uDFFF\";",
@@ -534,12 +542,12 @@ fn string() {
534542
);
535543
// Lone surrogates and lossy replacement characters
536544
test(
537-
"let x = \"��\\uD800\\uDBFF��\\uDC00\\uDFFF��\";",
538-
"let x = \"���\\ud800\\udbff���\\udc00\\udfff���\";\n",
545+
"let x = \"��\\u{FFFD}\\u{FFFD}\\uD800\\uDBFF��\\u{FFFD}\\u{FFFD}\\uDC00\\uDFFF��\\u{FFFD}\\u{FFFD}\";",
546+
"let x = \"���\\ud800\\udbff���\\udc00\\udfff���\";\n",
539547
);
540548
test_minify(
541-
"let x = \"��\\uD800\\uDBFF��\\uDC00\\uDFFF��\";",
542-
"let x=`���\\ud800\\udbff���\\udc00\\udfff���`;",
549+
"let x = \"��\\u{FFFD}\\u{FFFD}\\uD800\\uDBFF��\\u{FFFD}\\u{FFFD}\\uDC00\\uDFFF��\\u{FFFD}\\u{FFFD}\";",
550+
"let x=`���\\ud800\\udbff���\\udc00\\udfff���`;",
543551
);
544552

545553
test_minify(

crates/oxc_parser/src/lexer/unicode.rs

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,16 @@ impl<'a> Lexer<'a> {
134134

135135
// For strings and templates, surrogate pairs are valid grammar, e.g. `"\uD83D\uDE00" === 😀`.
136136
match value {
137-
UnicodeEscape::CodePoint(ch) | UnicodeEscape::SurrogatePair(ch) => {
137+
UnicodeEscape::CodePoint(ch) => {
138+
if ch == '\u{FFFD}' && self.token.lone_surrogates {
139+
// Lossy replacement character is being used as an escape marker. Escape it.
140+
text.push_str("\u{FFFD}fffd");
141+
} else {
142+
text.push(ch);
143+
}
144+
}
145+
UnicodeEscape::SurrogatePair(ch) => {
146+
// Surrogate pair is always >= 0x10000, so cannot be 0xFFFD
138147
text.push(ch);
139148
}
140149
UnicodeEscape::LoneSurrogate(code_point) => {
@@ -276,22 +285,23 @@ impl<'a> Lexer<'a> {
276285
self.source.next_byte_unchecked();
277286
}
278287

279-
let low = self.hex_4_digits()?;
280-
281288
// The second code unit of a surrogate pair is always in the range from 0xDC00 to 0xDFFF,
282289
// and is called a low surrogate or a trail surrogate.
283-
// If this isn't a valid pair, rewind to before the 2nd, and return the first only.
284-
// The 2nd could be the first part of a valid pair.
285-
if !(MIN_LOW..=MAX_LOW).contains(&low) {
286-
self.source.set_position(before_second);
287-
return Some(UnicodeEscape::LoneSurrogate(high));
290+
if let Some(low) = self.hex_4_digits() {
291+
if (MIN_LOW..=MAX_LOW).contains(&low) {
292+
let code_point = pair_to_code_point(high, low);
293+
// SAFETY: `high` and `low` have been checked to be in ranges which always yield a `code_point`
294+
// which is a valid `char`
295+
let ch = unsafe { char::from_u32_unchecked(code_point) };
296+
return Some(UnicodeEscape::SurrogatePair(ch));
297+
}
288298
}
289299

290-
let code_point = pair_to_code_point(high, low);
291-
// SAFETY: `high` and `low` have been checked to be in ranges which always yield a `code_point`
292-
// which is a valid `char`
293-
let ch = unsafe { char::from_u32_unchecked(code_point) };
294-
Some(UnicodeEscape::SurrogatePair(ch))
300+
// Not a valid surrogate pair.
301+
// Rewind to before the 2nd, and return the first only.
302+
// The 2nd could be the first part of a valid pair, or a `\u{...}` escape.
303+
self.source.set_position(before_second);
304+
Some(UnicodeEscape::LoneSurrogate(high))
295305
}
296306

297307
// EscapeSequence ::

napi/parser/test/parse-raw.test.ts

Lines changed: 48 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -65,38 +65,6 @@ for (let path of await readdir(ACORN_TEST262_DIR_PATH, { recursive: true })) {
6565
test262FixturePaths.push(path);
6666
}
6767

68-
// Test raw transfer output matches standard (via JSON) output for some large files
69-
describe('fixtures', () => {
70-
it.each(benchFixtures)('%s', (filename, sourceText) => {
71-
const retStandard = parseSync(filename, sourceText);
72-
const { program: programStandard, comments: commentsStandard, module: moduleStandard, errors: errorsStandard } =
73-
retStandard;
74-
75-
// @ts-ignore
76-
const retRaw = parseSync(filename, sourceText, { experimentalRawTransfer: true });
77-
const { program: programRaw, comments: commentsRaw } = retRaw;
78-
// Remove `null` values, to match what NAPI-RS does
79-
const moduleRaw = clean(retRaw.module);
80-
const errorsRaw = clean(retRaw.errors);
81-
82-
// Compare as objects
83-
expect(programRaw).toEqual(programStandard);
84-
expect(commentsRaw).toEqual(commentsStandard);
85-
expect(moduleRaw).toEqual(moduleStandard);
86-
expect(errorsRaw).toEqual(errorsStandard);
87-
88-
// Compare as JSON (to ensure same field order)
89-
const jsonStandard = stringify({
90-
program: programStandard,
91-
comments: commentsStandard,
92-
module: moduleStandard,
93-
errors: errorsStandard,
94-
});
95-
const jsonRaw = stringify({ program: programRaw, comments: commentsRaw, module: moduleRaw, errors: errorsRaw });
96-
expect(jsonRaw).toEqual(jsonStandard);
97-
});
98-
});
99-
10068
// Test raw transfer output matches standard (via JSON) output for Test262 test cases
10169
describe('test262', () => {
10270
it.each(test262FixturePaths)('%s', async (path) => {
@@ -122,6 +90,54 @@ describe('test262', () => {
12290
});
12391
});
12492

93+
// Test raw transfer output matches standard (via JSON) output for edge cases not covered by Test262
94+
describe('edge cases', () => {
95+
it.each([
96+
// `StringLiteral`s containing lone surrogates and/or lossy replacement characters
97+
';"\\uD800\\uDBFF";',
98+
';"�\\u{FFFD}";',
99+
';"�\\u{FFFD}\\uD800\\uDBFF�\\u{FFFD}";',
100+
])('%s', (sourceText) => {
101+
assertRawAndStandardMatch('dummy.js', sourceText);
102+
});
103+
});
104+
105+
// Test raw transfer output matches standard (via JSON) output for some large files
106+
describe('fixtures', () => {
107+
it.each(benchFixtures)('%s', (filename, sourceText) => {
108+
assertRawAndStandardMatch(filename, sourceText);
109+
});
110+
});
111+
112+
function assertRawAndStandardMatch(filename, sourceText) {
113+
const retStandard = parseSync(filename, sourceText);
114+
const { program: programStandard, comments: commentsStandard, module: moduleStandard, errors: errorsStandard } =
115+
retStandard;
116+
117+
// @ts-ignore
118+
const retRaw = parseSync(filename, sourceText, { experimentalRawTransfer: true });
119+
const { program: programRaw, comments: commentsRaw } = retRaw;
120+
// Remove `null` values, to match what NAPI-RS does
121+
const moduleRaw = clean(retRaw.module);
122+
const errorsRaw = clean(retRaw.errors);
123+
124+
// Compare as objects
125+
expect(programRaw).toEqual(programStandard);
126+
expect(commentsRaw).toEqual(commentsStandard);
127+
expect(moduleRaw).toEqual(moduleStandard);
128+
expect(errorsRaw).toEqual(errorsStandard);
129+
130+
// Compare as JSON (to ensure same field order)
131+
const jsonStandard = stringify({
132+
program: programStandard,
133+
comments: commentsStandard,
134+
module: moduleStandard,
135+
errors: errorsStandard,
136+
});
137+
const jsonRaw = stringify({ program: programRaw, comments: commentsRaw, module: moduleRaw, errors: errorsRaw });
138+
expect(jsonRaw).toEqual(jsonStandard);
139+
}
140+
125141
// Stringify to JSON, removing values which are invalid in JSON
126142
function stringify(obj) {
127143
return JSON.stringify(obj, (_key, value) => {

napi/parser/test/parse.test.ts

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,62 @@ describe('parse', () => {
180180
});
181181
});
182182

183+
describe('`StringLiteral`', () => {
184+
it('lone surrogates', () => {
185+
const ret = parseSync('test.js', ';"\\uD800\\uDBFF";');
186+
expect(ret.errors.length).toBe(0);
187+
expect(ret.program.body.length).toBe(2);
188+
expect(ret.program.body[1]).toEqual({
189+
type: 'ExpressionStatement',
190+
start: 1,
191+
end: 16,
192+
expression: {
193+
type: 'Literal',
194+
start: 1,
195+
end: 15,
196+
value: '\ud800\udbff',
197+
raw: '"\\uD800\\uDBFF"',
198+
},
199+
});
200+
});
201+
202+
it('lossy replacement character', () => {
203+
const ret = parseSync('test.js', ';"�\\u{FFFD}";');
204+
expect(ret.errors.length).toBe(0);
205+
expect(ret.program.body.length).toBe(2);
206+
expect(ret.program.body[1]).toEqual({
207+
type: 'ExpressionStatement',
208+
start: 1,
209+
end: 13,
210+
expression: {
211+
type: 'Literal',
212+
start: 1,
213+
end: 12,
214+
value: '��',
215+
raw: '"�\\u{FFFD}"',
216+
},
217+
});
218+
});
219+
220+
it('lone surrogates and lossy replacement characters', () => {
221+
const ret = parseSync('test.js', ';"�\\u{FFFD}\\uD800\\uDBFF�\\u{FFFD}";');
222+
expect(ret.errors.length).toBe(0);
223+
expect(ret.program.body.length).toBe(2);
224+
expect(ret.program.body[1]).toEqual({
225+
type: 'ExpressionStatement',
226+
start: 1,
227+
end: 34,
228+
expression: {
229+
type: 'Literal',
230+
start: 1,
231+
end: 33,
232+
value: '��\ud800\udbff��',
233+
raw: '"�\\u{FFFD}\\uD800\\uDBFF�\\u{FFFD}"',
234+
},
235+
});
236+
});
237+
});
238+
183239
describe('`RegExpLiteral`', () => {
184240
it('has `value` as `RegExp` when valid regexp', () => {
185241
const ret = parseSync('test.js', '/abc/gu');

0 commit comments

Comments
 (0)