-
Notifications
You must be signed in to change notification settings - Fork 116
Closed
Labels
Description
Summary
Heap-buffer-overflow at utf16_to_utf8.h:141 due to "write-before-check" pattern in the 1-byte ASCII case. The _safe function designed to prevent buffer overflows actually causes one.
Root Cause
Two locations with identical "write-before-check" bug:
Bug 1: ASCII fast path (lines 122-131)
while (pos < final_pos) {
*utf8_output++ = ...; // BUG: Write FIRST
pos++;
if (check_output && size_t(end - utf8_output) == 0) { // Check AFTER
return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, ...);
}
}Bug 2: Single-char ASCII (lines 139-146)
if ((word & 0xFF80) == 0) {
*utf8_output++ = char(word); // BUG: Write FIRST
pos++;
if (check_output && size_t(end - utf8_output) == 0) { // Check AFTER
return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, ...);
}
}Correct pattern (2-byte case, lines 147-156):
if (check_output && size_t(end - utf8_output) < 2) { // Check FIRST
return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, ...);
}
*utf8_output++ = ...; // Write AFTERBoth 1-byte cases write before checking, causing overflow when buffer is exactly full.
PoC
Minimal reproduction
#include "simdutf.h"
int main() {
// 'é' (2 UTF-8 bytes) + 'A' (1 UTF-8 byte) = 3 bytes needed
char16_t input[] = {0x00E9, 'A'};
char output[2]; // Only 2 bytes buffer
// API says utf8_len is "the maximum output length"
// Function should write at most 2 bytes, but writes 3 → overflow!
size_t written = simdutf::convert_utf16_to_utf8_safe(
input, 2, output, 2);
// written == 3, violating the API contract
return 0;
}Fuzzer harness (discovers this bug)
This is a custom fuzzer we wrote to test _safe() functions not covered by existing fuzzers:
// fuzz_find_safe.cpp - Fuzzer for simdutf find() and *_safe() functions
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>
#include <algorithm>
#include "simdutf.h"
constexpr size_t MAX_INPUT_SIZE = 64 * 1024;
constexpr size_t MAX_OUTPUT_SIZE = 4 * 1024 * 1024;
struct FuzzReader {
const uint8_t* data;
size_t size;
size_t pos = 0;
explicit FuzzReader(const uint8_t* d, size_t s) : data(d), size(s) {}
uint8_t read_u8() {
if (pos >= size) return 0;
return data[pos++];
}
uint16_t read_u16() {
if (pos + 2 > size) return 0;
uint16_t val = data[pos] | (data[pos + 1] << 8);
pos += 2;
return val;
}
const char* get_char_data() const {
return reinterpret_cast<const char*>(data + pos);
}
size_t remaining() const {
return (pos < size) ? (size - pos) : 0;
}
};
// Test convert_utf16_to_utf8_safe - this function found the bug
void fuzz_convert_utf16_to_utf8_safe(const std::vector<char16_t>& input,
size_t output_limit) {
if (input.empty()) return;
size_t expected_len = simdutf::utf8_length_from_utf16(input.data(), input.size());
if (expected_len == 0 || expected_len > MAX_OUTPUT_SIZE) return;
size_t actual_limit = std::min(output_limit, expected_len + 1);
if (actual_limit == 0) actual_limit = 1;
std::vector<char> output(actual_limit);
size_t written = simdutf::convert_utf16_to_utf8_safe(
input.data(), input.size(), output.data(), actual_limit);
// Key check: written must not exceed actual_limit
if (written > actual_limit) {
__builtin_trap(); // Buffer overflow detected!
}
}
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
if (size < 4) return 0;
FuzzReader reader(data, size);
uint8_t test_case = reader.read_u8();
reader.read_u8(); // needle_byte (unused here)
uint16_t limit_mod = reader.read_u16();
const char* char_input = reader.get_char_data();
size_t char_len = reader.remaining();
if (char_len == 0 || char_len > MAX_INPUT_SIZE) return 0;
size_t char16_count = char_len / sizeof(char16_t);
std::vector<char16_t> char16_input;
if (char16_count > 0) {
char16_input.resize(char16_count);
std::memcpy(char16_input.data(), char_input, char16_count * sizeof(char16_t));
}
size_t output_limit = (limit_mod == 0) ? 1 : (limit_mod * 64);
if (output_limit > MAX_OUTPUT_SIZE) output_limit = MAX_OUTPUT_SIZE;
if (test_case % 5 == 3 && !char16_input.empty()) {
fuzz_convert_utf16_to_utf8_safe(char16_input, output_limit);
}
return 0;
}Compile and run
# Build fuzzer
clang++ -fsanitize=address,fuzzer -g -std=c++17 -I<simdutf_include> fuzz_find_safe.cpp -o fuzz_find_safe
# Run fuzzer (finds crash within seconds)
./fuzz_find_safeASAN output
==123==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x502000000032
WRITE of size 1 at 0x502000000032 thread T0
#0 in simdutf::scalar::utf16_to_utf8::convert_with_errors<...> utf16_to_utf8.h:141:22
#1 in simdutf::convert_utf16_to_utf8_safe implementation.cpp:1819:7
0x502000000032 is located 0 bytes after 2-byte region [0x502000000030,0x502000000032)
API Contract Violation
The API documentation explicitly states:
@param utf8_len the maximum output length
But the function:
- Writes beyond
utf8_lenbytes - Returns
written > utf8_len(returns 3 when limit is 2)
Both behaviors violate the documented API contract.
Suggested Fix
Two locations need fixing:
Fix 1: ASCII fast path (lines 122-131)
while (pos < final_pos) {
+ if (check_output && size_t(end - utf8_output) < 1) {
+ return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
+ utf8_output - start);
+ }
*utf8_output++ = !match_system(big_endian)
? char(u16_swap_bytes(data[pos]))
: char(data[pos]);
pos++;
- if (check_output && size_t(end - utf8_output) == 0) {
- return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
- utf8_output - start);
- }
}Fix 2: Single-char ASCII (lines 139-146)
if ((word & 0xFF80) == 0) {
// will generate one UTF-8 bytes
+ if (check_output && size_t(end - utf8_output) < 1) {
+ return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
+ utf8_output - start);
+ }
*utf8_output++ = char(word);
pos++;
- if (check_output && size_t(end - utf8_output) == 0) {
- return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
- utf8_output - start);
- }
}Both fixes move the bounds check BEFORE the write operation.