fix(parser): fix parsing lone surrogates in StringLiterals (#10180)

overlookmotel · overlookmotel · commit 38d2beaf6017 · 2025-04-02T12:57:52.000Z
Fix 2 edge cases when parsing lone surrogates in `StringLiteral`s:

* Lone surrogate followed by `\u{...}` escape e.g. `"\uD800\u{41}"`.
* Escaped lossy replacement character after lone surrogate e.g. `"\uD800 \u{FFFD}"`.
diff --git a/crates/oxc_codegen/tests/integration/unit.rs b/crates/oxc_codegen/tests/integration/unit.rs
@@ -510,10 +510,16 @@ fn string() {
     test_minify("let x = '\\'\\'\\'\"\"\"${}';", "let x=`'''\"\"\"\\${}`;");
 
     // Lossy replacement character
-    test("let x = \"�\";", "let x = \"�\";\n");
-    test_minify("let x = \"�\";", "let x=`�`;");
-    test("let x = \"� ��� �\";", "let x = \"� ��� �\";\n");
-    test_minify("let x = \"� ��� �\";", "let x=`� ��� �`;");
+    test("let x = \"�\\u{FFFD}\";", "let x = \"��\";\n");
+    test_minify("let x = \"�\\u{FFFD}\";", "let x=`��`;");
+    test(
+        "let x = \"� ��� \\u{FFFD} \\u{FFFD}\\u{FFFD}\\u{FFFD} �\";",
+        "let x = \"� ��� � ��� �\";\n",
+    );
+    test_minify(
+        "let x = \"� ��� \\u{FFFD} \\u{FFFD}\\u{FFFD}\\u{FFFD} �\";",
+        "let x=`� ��� � ��� �`;",
+    );
     // Lone surrogates
     test(
         "let x = \"\\uD800 \\uDBFF \\uDC00 \\uDFFF\";",
@@ -523,6 +529,8 @@ fn string() {
         "let x = \"\\uD800 \\uDBFF \\uDC00 \\uDFFF\";",
         "let x=`\\ud800 \\udbff \\udc00 \\udfff`;",
     );
+    test("let x = \"\\uD800\\u{41}\";", "let x = \"\\ud800A\";\n"); // JS `\u{41}` escape
+    test_minify("let x = \"\\uD800\\u{41}\";", "let x=`\\ud800A`;");
     // Invalid pairs
     test(
         "let x = \"\\uD800\\uDBFF \\uDC00\\uDFFF\";",
@@ -534,12 +542,12 @@ fn string() {
     );
     // Lone surrogates and lossy replacement characters
     test(
-        "let x = \"���\\uD800\\uDBFF���\\uDC00\\uDFFF���\";",
-        "let x = \"���\\ud800\\udbff���\\udc00\\udfff���\";\n",
+        "let x = \"��\\u{FFFD}\\u{FFFD}\\uD800\\uDBFF��\\u{FFFD}\\u{FFFD}\\uDC00\\uDFFF��\\u{FFFD}\\u{FFFD}\";",
+        "let x = \"����\\ud800\\udbff����\\udc00\\udfff����\";\n",
     );
     test_minify(
-        "let x = \"���\\uD800\\uDBFF���\\uDC00\\uDFFF���\";",
-        "let x=`���\\ud800\\udbff���\\udc00\\udfff���`;",
+        "let x = \"��\\u{FFFD}\\u{FFFD}\\uD800\\uDBFF��\\u{FFFD}\\u{FFFD}\\uDC00\\uDFFF��\\u{FFFD}\\u{FFFD}\";",
+        "let x=`����\\ud800\\udbff����\\udc00\\udfff����`;",
     );
 
     test_minify(
diff --git a/crates/oxc_parser/src/lexer/unicode.rs b/crates/oxc_parser/src/lexer/unicode.rs
@@ -134,7 +134,16 @@ impl<'a> Lexer<'a> {
 
         // For strings and templates, surrogate pairs are valid grammar, e.g. `"\uD83D\uDE00" === 😀`.
         match value {
-            UnicodeEscape::CodePoint(ch) | UnicodeEscape::SurrogatePair(ch) => {
+            UnicodeEscape::CodePoint(ch) => {
+                if ch == '\u{FFFD}' && self.token.lone_surrogates {
+                    // Lossy replacement character is being used as an escape marker. Escape it.
+                    text.push_str("\u{FFFD}fffd");
+                } else {
+                    text.push(ch);
+                }
+            }
+            UnicodeEscape::SurrogatePair(ch) => {
+                // Surrogate pair is always >= 0x10000, so cannot be 0xFFFD
                 text.push(ch);
             }
             UnicodeEscape::LoneSurrogate(code_point) => {
@@ -276,22 +285,23 @@ impl<'a> Lexer<'a> {
             self.source.next_byte_unchecked();
         }
 
-        let low = self.hex_4_digits()?;
-
         // The second code unit of a surrogate pair is always in the range from 0xDC00 to 0xDFFF,
         // and is called a low surrogate or a trail surrogate.
-        // If this isn't a valid pair, rewind to before the 2nd, and return the first only.
-        // The 2nd could be the first part of a valid pair.
-        if !(MIN_LOW..=MAX_LOW).contains(&low) {
-            self.source.set_position(before_second);
-            return Some(UnicodeEscape::LoneSurrogate(high));
+        if let Some(low) = self.hex_4_digits() {
+            if (MIN_LOW..=MAX_LOW).contains(&low) {
+                let code_point = pair_to_code_point(high, low);
+                // SAFETY: `high` and `low` have been checked to be in ranges which always yield a `code_point`
+                // which is a valid `char`
+                let ch = unsafe { char::from_u32_unchecked(code_point) };
+                return Some(UnicodeEscape::SurrogatePair(ch));
+            }
         }
 
-        let code_point = pair_to_code_point(high, low);
-        // SAFETY: `high` and `low` have been checked to be in ranges which always yield a `code_point`
-        // which is a valid `char`
-        let ch = unsafe { char::from_u32_unchecked(code_point) };
-        Some(UnicodeEscape::SurrogatePair(ch))
+        // Not a valid surrogate pair.
+        // Rewind to before the 2nd, and return the first only.
+        // The 2nd could be the first part of a valid pair, or a `\u{...}` escape.
+        self.source.set_position(before_second);
+        Some(UnicodeEscape::LoneSurrogate(high))
     }
 
     // EscapeSequence ::
diff --git a/napi/parser/test/parse-raw.test.ts b/napi/parser/test/parse-raw.test.ts
@@ -65,38 +65,6 @@ for (let path of await readdir(ACORN_TEST262_DIR_PATH, { recursive: true })) {
   test262FixturePaths.push(path);
 }
 
-// Test raw transfer output matches standard (via JSON) output for some large files
-describe('fixtures', () => {
-  it.each(benchFixtures)('%s', (filename, sourceText) => {
-    const retStandard = parseSync(filename, sourceText);
-    const { program: programStandard, comments: commentsStandard, module: moduleStandard, errors: errorsStandard } =
-      retStandard;
-
-    // @ts-ignore
-    const retRaw = parseSync(filename, sourceText, { experimentalRawTransfer: true });
-    const { program: programRaw, comments: commentsRaw } = retRaw;
-    // Remove `null` values, to match what NAPI-RS does
-    const moduleRaw = clean(retRaw.module);
-    const errorsRaw = clean(retRaw.errors);
-
-    // Compare as objects
-    expect(programRaw).toEqual(programStandard);
-    expect(commentsRaw).toEqual(commentsStandard);
-    expect(moduleRaw).toEqual(moduleStandard);
-    expect(errorsRaw).toEqual(errorsStandard);
-
-    // Compare as JSON (to ensure same field order)
-    const jsonStandard = stringify({
-      program: programStandard,
-      comments: commentsStandard,
-      module: moduleStandard,
-      errors: errorsStandard,
-    });
-    const jsonRaw = stringify({ program: programRaw, comments: commentsRaw, module: moduleRaw, errors: errorsRaw });
-    expect(jsonRaw).toEqual(jsonStandard);
-  });
-});
-
 // Test raw transfer output matches standard (via JSON) output for Test262 test cases
 describe('test262', () => {
   it.each(test262FixturePaths)('%s', async (path) => {
@@ -122,6 +90,54 @@ describe('test262', () => {
   });
 });
 
+// Test raw transfer output matches standard (via JSON) output for edge cases not covered by Test262
+describe('edge cases', () => {
+  it.each([
+    // `StringLiteral`s containing lone surrogates and/or lossy replacement characters
+    ';"\\uD800\\uDBFF";',
+    ';"�\\u{FFFD}";',
+    ';"�\\u{FFFD}\\uD800\\uDBFF�\\u{FFFD}";',
+  ])('%s', (sourceText) => {
+    assertRawAndStandardMatch('dummy.js', sourceText);
+  });
+});
+
+// Test raw transfer output matches standard (via JSON) output for some large files
+describe('fixtures', () => {
+  it.each(benchFixtures)('%s', (filename, sourceText) => {
+    assertRawAndStandardMatch(filename, sourceText);
+  });
+});
+// Assert that parsing `sourceText` with raw transfer gives the same result as standard (via JSON) parsing
+function assertRawAndStandardMatch(filename, sourceText) {
+  const retStandard = parseSync(filename, sourceText);
+  const { program: programStandard, comments: commentsStandard, module: moduleStandard, errors: errorsStandard } =
+    retStandard;
+
+  // @ts-ignore
+  const retRaw = parseSync(filename, sourceText, { experimentalRawTransfer: true });
+  const { program: programRaw, comments: commentsRaw } = retRaw;
+  // Remove `null` values from raw `module` and `errors`, to match what NAPI-RS does
+  const moduleRaw = clean(retRaw.module);
+  const errorsRaw = clean(retRaw.errors);
+
+  // Compare as objects: `program`, `comments`, `module` and `errors` must each be equal
+  expect(programRaw).toEqual(programStandard);
+  expect(commentsRaw).toEqual(commentsStandard);
+  expect(moduleRaw).toEqual(moduleStandard);
+  expect(errorsRaw).toEqual(errorsStandard);
+
+  // Compare as JSON as well (to ensure same field order)
+  const jsonStandard = stringify({
+    program: programStandard,
+    comments: commentsStandard,
+    module: moduleStandard,
+    errors: errorsStandard,
+  });
+  const jsonRaw = stringify({ program: programRaw, comments: commentsRaw, module: moduleRaw, errors: errorsRaw });
+  expect(jsonRaw).toEqual(jsonStandard);
+}
+
 // Stringify to JSON, removing values which are invalid in JSON
 function stringify(obj) {
   return JSON.stringify(obj, (_key, value) => {
diff --git a/napi/parser/test/parse.test.ts b/napi/parser/test/parse.test.ts
@@ -180,6 +180,62 @@ describe('parse', () => {
     });
   });
 
+  describe('`StringLiteral`', () => {
+    it('lone surrogates', () => {
+      const ret = parseSync('test.js', ';"\\uD800\\uDBFF";');
+      expect(ret.errors.length).toBe(0);
+      expect(ret.program.body.length).toBe(2);
+      expect(ret.program.body[1]).toEqual({
+        type: 'ExpressionStatement',
+        start: 1,
+        end: 16,
+        expression: {
+          type: 'Literal',
+          start: 1,
+          end: 15,
+          value: '\ud800\udbff',
+          raw: '"\\uD800\\uDBFF"',
+        },
+      });
+    });
+
+    it('lossy replacement character', () => {
+      const ret = parseSync('test.js', ';"�\\u{FFFD}";');
+      expect(ret.errors.length).toBe(0);
+      expect(ret.program.body.length).toBe(2);
+      expect(ret.program.body[1]).toEqual({
+        type: 'ExpressionStatement',
+        start: 1,
+        end: 13,
+        expression: {
+          type: 'Literal',
+          start: 1,
+          end: 12,
+          value: '��',
+          raw: '"�\\u{FFFD}"',
+        },
+      });
+    });
+
+    it('lone surrogates and lossy replacement characters', () => {
+      const ret = parseSync('test.js', ';"�\\u{FFFD}\\uD800\\uDBFF�\\u{FFFD}";');
+      expect(ret.errors.length).toBe(0);
+      expect(ret.program.body.length).toBe(2);
+      expect(ret.program.body[1]).toEqual({
+        type: 'ExpressionStatement',
+        start: 1,
+        end: 34,
+        expression: {
+          type: 'Literal',
+          start: 1,
+          end: 33,
+          value: '��\ud800\udbff��',
+          raw: '"�\\u{FFFD}\\uD800\\uDBFF�\\u{FFFD}"',
+        },
+      });
+    });
+  });
+
   describe('`RegExpLiteral`', () => {
     it('has `value` as `RegExp` when valid regexp', () => {
       const ret = parseSync('test.js', '/abc/gu');