|
4 | 4 |
|
5 | 5 | #include "encoding.h" |
6 | 6 |
|
| 7 | +#include "simdutf.h" |
7 | 8 | #include "util.h" |
8 | 9 |
|
| 10 | +#include <workerd/io/features.h> |
9 | 11 | #include <workerd/jsg/jsg.h> |
10 | 12 | #include <workerd/util/strings.h> |
11 | 13 |
|
@@ -289,7 +291,7 @@ kj::Maybe<IcuDecoder> IcuDecoder::create(Encoding encoding, bool fatal, bool ign |
289 | 291 | if (U_FAILURE(status)) return kj::none; |
290 | 292 | } |
291 | 293 |
|
292 | | - return IcuDecoder(encoding, inner, ignoreBom); |
| 294 | + return IcuDecoder(encoding, inner, fatal, ignoreBom); |
293 | 295 | } |
294 | 296 |
|
295 | 297 | kj::Maybe<jsg::JsString> IcuDecoder::decode( |
@@ -350,7 +352,32 @@ kj::Maybe<jsg::JsString> IcuDecoder::decode( |
350 | 352 | omitInitialBom = data[0] == 0xfeff; |
351 | 353 | bomSeen = true; |
352 | 354 | } |
353 | | - return js.str(data.slice(omitInitialBom ? 1 : 0, data.size())); |
| 355 | + |
| 356 | + auto slice = data.slice(omitInitialBom ? 1 : 0, data.size()); |
| 357 | + |
| 358 | + // If the pedanticWpt flag is enabled, we follow the spec and fix invalid surrogates |
| 359 | + // in the UTF-16 input; otherwise (or for empty input) the data is returned as-is. |
| 360 | + if (slice.size() == 0 || !FeatureFlags::get(js).getPedanticWpt()) { |
| 361 | + return js.str(slice); |
| 362 | + } |
| 363 | + |
| 364 | + if (simdutf::validate_utf16(slice.begin(), slice.size())) { |
| 365 | + return js.str(slice); |
| 366 | + } |
| 367 | + |
| 368 | + if (fatal) { |
| 369 | + // In fatal mode, return error for invalid surrogates |
| 370 | + return kj::none; |
| 371 | + } |
| 372 | + |
| 373 | + // In non-fatal mode, replace invalid surrogates with U+FFFD. |
| 374 | + // Output size equals input size because each invalid surrogate (1 code unit) |
| 375 | + // is replaced with U+FFFD (also 1 code unit). |
| 376 | + // Use stack allocation for small strings (up to 256 code units) to avoid |
| 377 | + // heap allocation overhead. |
| 378 | + kj::SmallArray<char16_t, 256> fixed(slice.size()); |
| 379 | + simdutf::to_well_formed_utf16(slice.begin(), slice.size(), fixed.begin()); |
| 380 | + return js.str(fixed.asPtr()); |
354 | 381 | } |
355 | 382 | } |
356 | 383 | } |
|
0 commit comments