Skip to content

# Heap buffer overflow in convert_utf16_to_utf8_safe #911

@OwenSanzas

Description

@OwenSanzas

Summary

Heap-buffer-overflow at utf16_to_utf8.h:141 due to "write-before-check" pattern in the 1-byte ASCII case. The _safe function designed to prevent buffer overflows actually causes one.

Root Cause

Two locations with identical "write-before-check" bug:

Bug 1: ASCII fast path (lines 122-131)

while (pos < final_pos) {
  *utf8_output++ = ...;   // BUG: Write FIRST
  pos++;
  if (check_output && size_t(end - utf8_output) == 0) {  // Check AFTER
    return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, ...);
  }
}

Bug 2: Single-char ASCII (lines 139-146)

if ((word & 0xFF80) == 0) {
  *utf8_output++ = char(word);   // BUG: Write FIRST
  pos++;
  if (check_output && size_t(end - utf8_output) == 0) {  // Check AFTER
    return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, ...);
  }
}

Correct pattern (2-byte case, lines 147-156):

if (check_output && size_t(end - utf8_output) < 2) {  // Check FIRST
  return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, ...);
}
*utf8_output++ = ...;  // Write AFTER

Both 1-byte cases write before checking, causing overflow when buffer is exactly full.

PoC

Minimal reproduction

#include "simdutf.h"

int main() {
    // 'é' (2 UTF-8 bytes) + 'A' (1 UTF-8 byte) = 3 bytes needed
    char16_t input[] = {0x00E9, 'A'};
    char output[2];  // Only 2 bytes buffer

    // API says utf8_len is "the maximum output length"
    // Function should write at most 2 bytes, but writes 3 → overflow!
    size_t written = simdutf::convert_utf16_to_utf8_safe(
        input, 2, output, 2);

    // written == 3, violating the API contract
    return 0;
}

Fuzzer harness (discovers this bug)

This is a custom fuzzer we wrote to test _safe() functions not covered by existing fuzzers:

// fuzz_find_safe.cpp - Fuzzer for simdutf find() and *_safe() functions
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>
#include <algorithm>
#include "simdutf.h"

constexpr size_t MAX_INPUT_SIZE = 64 * 1024;
constexpr size_t MAX_OUTPUT_SIZE = 4 * 1024 * 1024;

struct FuzzReader {
    const uint8_t* data;
    size_t size;
    size_t pos = 0;

    explicit FuzzReader(const uint8_t* d, size_t s) : data(d), size(s) {}

    uint8_t read_u8() {
        if (pos >= size) return 0;
        return data[pos++];
    }

    uint16_t read_u16() {
        if (pos + 2 > size) return 0;
        uint16_t val = data[pos] | (data[pos + 1] << 8);
        pos += 2;
        return val;
    }

    const char* get_char_data() const {
        return reinterpret_cast<const char*>(data + pos);
    }

    size_t remaining() const {
        return (pos < size) ? (size - pos) : 0;
    }
};

// Test convert_utf16_to_utf8_safe - this function found the bug
void fuzz_convert_utf16_to_utf8_safe(const std::vector<char16_t>& input,
                                      size_t output_limit) {
    if (input.empty()) return;

    size_t expected_len = simdutf::utf8_length_from_utf16(input.data(), input.size());
    if (expected_len == 0 || expected_len > MAX_OUTPUT_SIZE) return;

    size_t actual_limit = std::min(output_limit, expected_len + 1);
    if (actual_limit == 0) actual_limit = 1;

    std::vector<char> output(actual_limit);

    size_t written = simdutf::convert_utf16_to_utf8_safe(
        input.data(), input.size(), output.data(), actual_limit);

    // Key check: written must not exceed actual_limit
    if (written > actual_limit) {
        __builtin_trap();  // Buffer overflow detected!
    }
}

extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
    if (size < 4) return 0;

    FuzzReader reader(data, size);
    uint8_t test_case = reader.read_u8();
    reader.read_u8();  // needle_byte (unused here)
    uint16_t limit_mod = reader.read_u16();

    const char* char_input = reader.get_char_data();
    size_t char_len = reader.remaining();
    if (char_len == 0 || char_len > MAX_INPUT_SIZE) return 0;

    size_t char16_count = char_len / sizeof(char16_t);
    std::vector<char16_t> char16_input;
    if (char16_count > 0) {
        char16_input.resize(char16_count);
        std::memcpy(char16_input.data(), char_input, char16_count * sizeof(char16_t));
    }

    size_t output_limit = (limit_mod == 0) ? 1 : (limit_mod * 64);
    if (output_limit > MAX_OUTPUT_SIZE) output_limit = MAX_OUTPUT_SIZE;

    if (test_case % 5 == 3 && !char16_input.empty()) {
        fuzz_convert_utf16_to_utf8_safe(char16_input, output_limit);
    }

    return 0;
}

Compile and run

# Build fuzzer
clang++ -fsanitize=address,fuzzer -g -std=c++17 -I<simdutf_include> fuzz_find_safe.cpp -o fuzz_find_safe

# Run fuzzer (finds crash within seconds)
./fuzz_find_safe

ASAN output

==123==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x502000000032
WRITE of size 1 at 0x502000000032 thread T0
    #0 in simdutf::scalar::utf16_to_utf8::convert_with_errors<...> utf16_to_utf8.h:141:22
    #1 in simdutf::convert_utf16_to_utf8_safe implementation.cpp:1819:7

0x502000000032 is located 0 bytes after 2-byte region [0x502000000030,0x502000000032)

API Contract Violation

The API documentation explicitly states:

@param utf8_len      the maximum output length

But the function:

  1. Writes beyond utf8_len bytes
  2. Returns written > utf8_len (returns 3 when limit is 2)

Both behaviors violate the documented API contract.

Suggested Fix

Two locations need fixing:

Fix 1: ASCII fast path (lines 122-131)

          while (pos < final_pos) {
+           if (check_output && size_t(end - utf8_output) < 1) {
+             return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
+                                utf8_output - start);
+           }
            *utf8_output++ = !match_system(big_endian)
                                 ? char(u16_swap_bytes(data[pos]))
                                 : char(data[pos]);
            pos++;
-           if (check_output && size_t(end - utf8_output) == 0) {
-             return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
-                                utf8_output - start);
-           }
          }

Fix 2: Single-char ASCII (lines 139-146)

     if ((word & 0xFF80) == 0) {
       // will generate one UTF-8 bytes
+      if (check_output && size_t(end - utf8_output) < 1) {
+        return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
+                           utf8_output - start);
+      }
       *utf8_output++ = char(word);
       pos++;
-      if (check_output && size_t(end - utf8_output) == 0) {
-        return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
-                           utf8_output - start);
-      }
     }

Both fixes move the bounds check BEFORE the write operation.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions