Skip to content

Commit 8dee6c3

Browse files
committed
handle invalid utf-16 surrogates on TextDecoder
1 parent ebdbd36 commit 8dee6c3

5 files changed

Lines changed: 50 additions & 19 deletions

File tree

src/workerd/api/BUILD.bazel

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,9 @@ wd_cc_library(
433433
srcs = ["encoding.c++"],
434434
hdrs = ["encoding.h"],
435435
implementation_deps = [
436+
"//src/workerd/io:features",
436437
"//src/workerd/util:strings",
438+
"@simdutf",
437439
],
438440
visibility = ["//visibility:public"],
439441
deps = [
@@ -537,6 +539,7 @@ kj_test(
537539
src = "data-url-test.c++",
538540
deps = [
539541
":data-url",
542+
"//src/workerd/io",
540543
],
541544
)
542545

src/workerd/api/encoding.c++

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@
44

55
#include "encoding.h"
66

7+
#include "simdutf.h"
78
#include "util.h"
89

10+
#include <workerd/io/features.h>
911
#include <workerd/jsg/jsg.h>
1012
#include <workerd/util/strings.h>
1113

@@ -289,7 +291,7 @@ kj::Maybe<IcuDecoder> IcuDecoder::create(Encoding encoding, bool fatal, bool ign
289291
if (U_FAILURE(status)) return kj::none;
290292
}
291293

292-
return IcuDecoder(encoding, inner, ignoreBom);
294+
return IcuDecoder(encoding, inner, fatal, ignoreBom);
293295
}
294296

295297
kj::Maybe<jsg::JsString> IcuDecoder::decode(
@@ -350,7 +352,32 @@ kj::Maybe<jsg::JsString> IcuDecoder::decode(
350352
omitInitialBom = data[0] == 0xfeff;
351353
bomSeen = true;
352354
}
353-
return js.str(data.slice(omitInitialBom ? 1 : 0, data.size()));
355+
356+
auto slice = data.slice(omitInitialBom ? 1 : 0, data.size());
357+
358+
// If pedanticWpt flag is enabled, then we follow the spec and fix invalid
359+
// surrogates on the UTF-16 input.
360+
if (slice.size() == 0 || !FeatureFlags::get(js).getPedanticWpt()) {
361+
return js.str(slice);
362+
}
363+
364+
if (simdutf::validate_utf16(slice.begin(), slice.size())) {
365+
return js.str(slice);
366+
}
367+
368+
if (fatal) {
369+
// In fatal mode, return error for invalid surrogates
370+
return kj::none;
371+
}
372+
373+
// In non-fatal mode, replace invalid surrogates with U+FFFD.
374+
// Output size equals input size because each invalid surrogate (1 code unit)
375+
// is replaced with U+FFFD (also 1 code unit).
376+
// Use stack allocation for small strings (up to 256 code units) to avoid
377+
// heap allocation overhead.
378+
kj::SmallArray<char16_t, 256> fixed(slice.size());
379+
simdutf::to_well_formed_utf16(slice.begin(), slice.size(), fixed.begin());
380+
return js.str(fixed.asPtr());
354381
}
355382
}
356383
}

src/workerd/api/encoding.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,10 @@ class AsciiDecoder final: public Decoder {
9595
// of encodings required by the Encoding specification.
9696
class IcuDecoder final: public Decoder {
9797
public:
98-
IcuDecoder(Encoding encoding, UConverter* converter, bool ignoreBom)
98+
IcuDecoder(Encoding encoding, UConverter* converter, bool fatal, bool ignoreBom)
9999
: encoding(encoding),
100100
inner(converter),
101+
fatal(fatal),
101102
ignoreBom(ignoreBom),
102103
bomSeen(false) {}
103104
IcuDecoder(IcuDecoder&&) = default;
@@ -124,6 +125,7 @@ class IcuDecoder final: public Decoder {
124125
Encoding encoding;
125126
std::unique_ptr<UConverter, ConverterDeleter> inner;
126127

128+
bool fatal;
127129
bool ignoreBom;
128130
bool bomSeen;
129131
};

src/workerd/io/BUILD.bazel

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@ wd_cc_library(
4747
] + ["//src/workerd/api:srcs"],
4848
hdrs = [
4949
"compatibility-date.h",
50-
"features.h",
5150
"hibernation-manager.h",
5251
"io-channels.h",
5352
"io-context.h",
@@ -90,6 +89,7 @@ wd_cc_library(
9089
":actor-storage_capnp",
9190
":cdp_capnp",
9291
":container_capnp",
92+
":features",
9393
":frankenvalue",
9494
":io-gate",
9595
":io-helpers",
@@ -125,6 +125,19 @@ wd_cc_library(
125125
],
126126
)
127127

128+
# Headers only target to avoid having circular dependencies.
129+
wd_cc_library(
130+
name = "features",
131+
hdrs = ["features.h"],
132+
implementation_deps = [
133+
"//src/workerd/jsg",
134+
],
135+
visibility = ["//visibility:public"],
136+
deps = [
137+
":compatibility-date_capnp",
138+
],
139+
)
140+
128141
wd_cc_library(
129142
name = "worker-modules",
130143
srcs = ["worker-modules.c++"],

src/wpt/encoding-test.ts

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -165,21 +165,7 @@ export default {
165165
],
166166
},
167167
'textdecoder-streaming.any.js': {},
168-
'textdecoder-utf16-surrogates.any.js': {
169-
comment: 'Investigate why we are not blocking invalid surrogates',
170-
expectedFailures: [
171-
'utf-16le - lone surrogate lead',
172-
'utf-16le - lone surrogate lead (fatal flag set)',
173-
'utf-16le - lone surrogate trail',
174-
'utf-16le - lone surrogate trail (fatal flag set)',
175-
'utf-16le - unmatched surrogate lead',
176-
'utf-16le - unmatched surrogate lead (fatal flag set)',
177-
'utf-16le - unmatched surrogate trail',
178-
'utf-16le - unmatched surrogate trail (fatal flag set)',
179-
'utf-16le - swapped surrogate pair',
180-
'utf-16le - swapped surrogate pair (fatal flag set)',
181-
],
182-
},
168+
'textdecoder-utf16-surrogates.any.js': {},
183169
'textencoder-constructor-non-utf.any.js': {
184170
comment: 'Investigate this',
185171
expectedFailures: [

0 commit comments

Comments
 (0)