Skip to content

Commit 7a23b2d

Browse files
authored
fix: decode web fetch legacy charsets (#73513)
* fix: decode web fetch legacy charsets
1 parent e4ff7c1 commit 7a23b2d

3 files changed

Lines changed: 206 additions & 6 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ Docs: https://docs.openclaw.ai
1010

1111
### Fixes
1212

13+
- Tools/web_fetch: decode response bodies from raw bytes using declared HTTP, XML, or HTML meta charsets before extraction, so Shift_JIS and other legacy-charset pages no longer return mojibake. Fixes #72916. Thanks @amknight.
14+
- Channels/Discord: bound message read/search REST calls, route those actions through Gateway execution, and fall back to `CommandTargetSessionKey` for inbound hook session keys so Discord reads do not hang and hooks still fire when `SessionKey` is empty. Fixes #73431. (#73521) Thanks @amknight.
1315
- Plugins/media: auto-enable provider plugins referenced by `agents.defaults.imageGenerationModel`, `videoGenerationModel`, and `musicGenerationModel` primary/fallback refs, so configured Google and MiniMax media providers do not stay disabled behind a restrictive plugin allowlist. Thanks @vincentkoc.
1416
- Memory-core/dreaming: retry managed dreaming cron registration after startup when the cron service is not reachable yet, so the scheduled Memory Dreaming Promotion sweep recovers without waiting for heartbeat traffic. Fixes #72841. Thanks @amknight.
1517

src/agents/tools/web-shared.ts

Lines changed: 127 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,114 @@ export type ReadResponseTextResult = {
9494
bytesRead: number;
9595
};
9696

97+
/** Maximum number of leading body bytes scanned for an in-document charset declaration. */
const RESPONSE_CHARSET_SCAN_BYTES = 4096;
// Single-byte decoder used only to make the ASCII markup at the start of a body searchable.
const latin1Decoder = new TextDecoder("latin1");
const utf8Decoder = new TextDecoder("utf-8");

/**
 * Validates a raw charset label.
 *
 * Trims whitespace, strips a leading and/or trailing quote character, and
 * accepts the result only when it is non-empty, at most 64 characters long and
 * made of letters, digits, `.`, `_`, `:` or `-`.
 *
 * @param value Raw label text, possibly quoted or undefined.
 * @returns The cleaned label, or `undefined` when it is missing or malformed.
 */
function normalizeCharset(value: string | undefined): string | undefined {
  const normalized = value?.trim().replace(/^["']|["']$/g, "") ?? "";
  return normalized && normalized.length <= 64 && /^[A-Za-z0-9._:-]+$/.test(normalized)
    ? normalized
    : undefined;
}

/**
 * Extracts the `charset` parameter from a Content-Type style value such as
 * `text/html; charset="utf-8"`.
 *
 * @param value Header or `<meta content>` value; null/undefined is treated as empty.
 * @returns The validated charset label, or `undefined` when none is present.
 */
function readCharsetParam(value: string | null | undefined): string | undefined {
  // The parameter must start the value or follow a `;` so that text such as
  // `xcharset=` is not picked up.
  const match = /(?:^|;)\s*charset\s*=\s*(?:"([^"]+)"|'([^']+)'|([^;\s]+))/i.exec(value ?? "");
  return normalizeCharset(match?.[1] ?? match?.[2] ?? match?.[3]);
}

/**
 * Reads one attribute value out of the source text of a single tag.
 *
 * Attributes are matched in order as whole `name=value` pairs, so a quoted
 * value is consumed in one piece and text inside it is never mistaken for
 * another attribute.
 *
 * @param tag Source text of the tag, e.g. `<meta charset="utf-8">`.
 * @param name Attribute name, compared case-insensitively.
 * @returns The attribute value (possibly empty), or `undefined` when absent.
 */
function readAttribute(tag: string, name: string): string | undefined {
  const target = name.toLowerCase();
  for (const match of tag.matchAll(
    /([A-Za-z0-9:_-]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/g,
  )) {
    if (match[1]?.toLowerCase() === target) {
      return match[2] ?? match[3] ?? match[4] ?? "";
    }
  }
  return undefined;
}

/**
 * Decides whether the body may carry its own charset declaration.
 *
 * @param contentType Raw Content-Type header value, or null when absent.
 * @returns `true` for a missing media type and for HTML/XML media types.
 */
function shouldSniffDocumentCharset(contentType: string | null): boolean {
  const mediaType = contentType?.split(";", 1)[0]?.trim().toLowerCase();
  if (!mediaType) {
    return true;
  }
  return (
    mediaType === "text/html" ||
    mediaType === "application/xhtml+xml" ||
    mediaType === "text/xml" ||
    mediaType === "application/xml" ||
    mediaType.endsWith("+xml")
  );
}

/**
 * Applies the overrides the HTML encoding prescan mandates for labels found
 * inside the document itself.
 *
 * A declaration that could be located by scanning the bytes as ASCII cannot
 * belong to a UTF-16 document, so UTF-16 labels are replaced with `utf-8`;
 * `x-user-defined` is replaced with `windows-1252`. Any other label, including
 * ones the runtime does not recognise, is returned unchanged.
 */
function resolvePrescanCharset(label: string | undefined): string | undefined {
  if (!label) {
    return undefined;
  }
  if (label.toLowerCase() === "x-user-defined") {
    return "windows-1252";
  }
  try {
    // Let the runtime resolve aliases such as "unicode" or "ucs-2" to their canonical name.
    const canonical = new TextDecoder(label).encoding.toLowerCase();
    return canonical === "utf-16le" || canonical === "utf-16be" ? "utf-8" : label;
  } catch {
    // Unknown label: hand it back untouched so the caller's own fallback applies.
    return label;
  }
}

/**
 * Guesses the charset of a response body from its first bytes.
 *
 * Order of precedence: a UTF-8/UTF-16 byte-order mark, then (for HTML/XML or
 * untyped responses only) an XML `encoding` declaration, then the first
 * `<meta charset>` or `<meta http-equiv="Content-Type" content="...charset=...">`
 * found within the first `RESPONSE_CHARSET_SCAN_BYTES` bytes.
 *
 * @param contentType Raw Content-Type header value, or null when absent.
 * @param bytes Raw response body.
 * @returns A charset label, or `undefined` when nothing was declared.
 */
function sniffCharset(contentType: string | null, bytes: Uint8Array): string | undefined {
  if (bytes[0] === 0xef && bytes[1] === 0xbb && bytes[2] === 0xbf) {
    return "utf-8";
  }
  if (bytes[0] === 0xff && bytes[1] === 0xfe) {
    return "utf-16le";
  }
  if (bytes[0] === 0xfe && bytes[1] === 0xff) {
    return "utf-16be";
  }
  if (!shouldSniffDocumentCharset(contentType)) {
    return undefined;
  }

  const head = latin1Decoder.decode(
    bytes.subarray(0, Math.min(bytes.byteLength, RESPONSE_CHARSET_SCAN_BYTES)),
  );
  const xmlEncoding = /<\?xml\s+[^>]*\bencoding\s*=\s*(?:"([^"]+)"|'([^']+)')/i.exec(head);
  if (xmlEncoding) {
    return resolvePrescanCharset(normalizeCharset(xmlEncoding[1] ?? xmlEncoding[2]));
  }

  for (const match of head.matchAll(/<meta\b[^>]*>/gi)) {
    const tag = match[0];
    const charset = resolvePrescanCharset(normalizeCharset(readAttribute(tag, "charset")));
    if (charset) {
      return charset;
    }
    // Only a Content-Type pragma may declare a charset through `content`;
    // other metas (e.g. a description) can mention "charset=" as plain text.
    if (/^content-type$/i.test(readAttribute(tag, "http-equiv") ?? "")) {
      const contentCharset = resolvePrescanCharset(
        readCharsetParam(readAttribute(tag, "content")),
      );
      if (contentCharset) {
        return contentCharset;
      }
    }
  }
  return undefined;
}
176+
177+
/**
 * Joins byte chunks into one contiguous array of at most `totalBytes` bytes.
 *
 * When `totalBytes` disagrees with the chunks, the result is clamped to the
 * bytes that actually exist: surplus chunk data is dropped instead of raising
 * a RangeError, and a too-large `totalBytes` does not pad the result with
 * zero bytes.
 *
 * @param parts Chunks in arrival order.
 * @param totalBytes Expected combined length of `parts`.
 * @returns The single chunk itself when it already is the whole payload,
 *   otherwise a newly allocated array.
 */
function concatBytes(parts: Uint8Array[], totalBytes: number): Uint8Array {
  const first = parts[0];
  if (parts.length === 1 && first !== undefined && first.byteLength === totalBytes) {
    // Fast path: nothing to merge, so avoid copying.
    return first;
  }

  const available = parts.reduce((sum, part) => sum + part.byteLength, 0);
  // `|| 0` turns a NaN request into an empty result rather than an invalid length.
  const size = Math.min(available, Math.max(0, Math.trunc(totalBytes) || 0));
  const bytes = new Uint8Array(size);
  let offset = 0;
  for (const part of parts) {
    const remaining = size - offset;
    if (remaining <= 0) {
      break;
    }
    const slice = part.byteLength > remaining ? part.subarray(0, remaining) : part;
    bytes.set(slice, offset);
    offset += slice.byteLength;
  }
  return bytes;
}
189+
190+
/**
 * Returns the response's Content-Type header value.
 *
 * The header bag is probed structurally, so a Response-like object without
 * `headers` or without `headers.get` yields null instead of throwing.
 */
function responseContentType(res: Response): string | null {
  const headers = (res as { headers?: { get?: (name: string) => string | null } }).headers;
  return typeof headers?.get === "function" ? headers.get("content-type") : null;
}

/** Identifies a UTF-8 or UTF-16 byte-order mark at the start of `bytes`. */
function sniffBomCharset(bytes: Uint8Array): string | undefined {
  if (bytes[0] === 0xef && bytes[1] === 0xbb && bytes[2] === 0xbf) {
    return "utf-8";
  }
  if (bytes[0] === 0xff && bytes[1] === 0xfe) {
    return "utf-16le";
  }
  if (bytes[0] === 0xfe && bytes[1] === 0xff) {
    return "utf-16be";
  }
  return undefined;
}

/**
 * Decodes a raw response body to text.
 *
 * Charset precedence: a byte-order mark in the body, then the `charset`
 * parameter of the Content-Type header, then a declaration sniffed from the
 * document itself, and finally UTF-8. The byte-order mark is checked first
 * because it describes the bytes actually received, whereas a header charset
 * can be wrong.
 *
 * @param res Response (or Response-like object) the bytes were read from.
 * @param bytes Raw body bytes.
 * @returns The decoded text. An unsupported charset label falls back to UTF-8
 *   decoding instead of throwing.
 */
function decodeResponseBytes(res: Response, bytes: Uint8Array): string {
  const contentType = responseContentType(res);
  const charset =
    sniffBomCharset(bytes) ?? readCharsetParam(contentType) ?? sniffCharset(contentType, bytes);
  try {
    return new TextDecoder(charset ?? "utf-8").decode(bytes);
  } catch {
    // TextDecoder rejects labels it does not know; UTF-8 is the best-effort default.
    return utf8Decoder.decode(bytes);
  }
}
204+
97205
export async function readResponseText(
98206
res: Response,
99207
options?: { maxBytes?: number },
@@ -113,10 +221,9 @@ export async function readResponseText(
113221
typeof (body as { getReader: () => unknown }).getReader === "function"
114222
) {
115223
const reader = (body as ReadableStream<Uint8Array>).getReader();
116-
const decoder = new TextDecoder();
117224
let bytesRead = 0;
118225
let truncated = false;
119-
const parts: string[] = [];
226+
const parts: Uint8Array[] = [];
120227

121228
try {
122229
while (true) {
@@ -140,15 +247,15 @@ export async function readResponseText(
140247
}
141248

142249
bytesRead += chunk.byteLength;
143-
parts.push(decoder.decode(chunk, { stream: true }));
250+
parts.push(chunk);
144251

145252
if (truncated || bytesRead >= maxBytes) {
146253
truncated = true;
147254
break;
148255
}
149256
}
150257
} catch {
151-
// Best-effort: return whatever we decoded so far.
258+
// Best-effort: return whatever we read so far.
152259
} finally {
153260
if (truncated) {
154261
// Some mocked or non-compliant streams never settle cancel(); do not
@@ -157,8 +264,22 @@ export async function readResponseText(
157264
}
158265
}
159266

160-
parts.push(decoder.decode());
161-
return { text: parts.join(""), truncated, bytesRead };
267+
const bytes = concatBytes(parts, bytesRead);
268+
return { text: decodeResponseBytes(res, bytes), truncated, bytesRead };
269+
}
270+
271+
const readBytes = (res as { arrayBuffer?: () => Promise<ArrayBuffer> }).arrayBuffer;
272+
if (typeof readBytes === "function") {
273+
try {
274+
const bytes = new Uint8Array(await readBytes.call(res));
275+
return {
276+
text: decodeResponseBytes(res, bytes),
277+
truncated: false,
278+
bytesRead: bytes.byteLength,
279+
};
280+
} catch {
281+
// Fall back to text() for lightweight Response-like mocks that do not expose bytes.
282+
}
162283
}
163284

164285
try {

src/agents/tools/web-tools.fetch.test.ts

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,83 @@ describe("web_fetch extraction fallbacks", () => {
231231
expect(details.truncated).toBe(true);
232232
});
233233

234+
// The mocked body is the bytes for "caf" followed by 0xE9, which only becomes
// "é" when the iso-8859-1 charset declared in Content-Type is honored; the
// second assertion rejects output containing the replacement character.
it("decodes response bytes with a charset from Content-Type", async () => {
235+
installMockFetch((input: RequestInfo | URL) => {
236+
const response = new Response(new Uint8Array([0x63, 0x61, 0x66, 0xe9]), {
237+
status: 200,
238+
headers: { "content-type": "text/plain; charset=iso-8859-1" },
239+
});
240+
Object.defineProperty(response, "url", { value: resolveRequestUrl(input) });
241+
return Promise.resolve(response);
242+
});
243+
244+
const tool = createFetchTool({ firecrawl: { enabled: false } });
245+
const result = await executeFetch(tool, {
246+
url: "https://example.com/latin1",
247+
extractMode: "text",
248+
});
249+
const details = result?.details as { text?: string };
250+
251+
expect(details.text).toContain("café");
252+
expect(details.text).not.toContain("caf�");
253+
});
254+
255+
// Content-Type carries no charset here, so the encoding has to come from the
// <meta http-equiv="Content-Type"> tag in the body; the six raw bytes placed in
// the title and paragraph are expected to come out as "日本語".
it("decodes HTML using a meta http-equiv charset before extraction", async () => {
256+
const encoder = new TextEncoder();
257+
const japanese = new Uint8Array([0x93, 0xfa, 0x96, 0x7b, 0x8c, 0xea]);
258+
const responseBytes = new Uint8Array([
259+
...encoder.encode(
260+
'<!doctype html><html><head><meta http-equiv="Content-Type" content="text/html; charset=Shift_JIS"><title>',
261+
),
262+
...japanese,
263+
...encoder.encode("</title></head><body><p>"),
264+
...japanese,
265+
...encoder.encode("</p></body></html>"),
266+
]);
267+
installMockFetch((input: RequestInfo | URL) => {
268+
const response = new Response(responseBytes, {
269+
status: 200,
270+
headers: { "content-type": "text/html" },
271+
});
272+
Object.defineProperty(response, "url", { value: resolveRequestUrl(input) });
273+
return Promise.resolve(response);
274+
});
275+
276+
const tool = createFetchTool({ firecrawl: { enabled: false } });
277+
const result = await executeFetch(tool, {
278+
url: "https://example.com/shift-jis",
279+
extractMode: "text",
280+
});
281+
const details = result?.details as { text?: string; title?: string };
282+
const output = `${details.title ?? ""}\n${details.text ?? ""}`;
283+
284+
expect(output).toContain("日本語");
285+
expect(output).not.toContain("�");
286+
});
287+
288+
// The body is encoded with TextEncoder (UTF-8) and mentions "charset=Shift_JIS"
// only inside a description meta's content attribute; the Japanese text must
// still be present in the output, i.e. that mention must not select the charset.
it("ignores charset text in unrelated meta content", async () => {
289+
const body =
290+
'<!doctype html><html><head><meta name="description" content="charset=Shift_JIS"><title>日本語</title></head><body>日本語</body></html>';
291+
installMockFetch((input: RequestInfo | URL) => {
292+
const response = new Response(new TextEncoder().encode(body), {
293+
status: 200,
294+
headers: { "content-type": "text/html" },
295+
});
296+
Object.defineProperty(response, "url", { value: resolveRequestUrl(input) });
297+
return Promise.resolve(response);
298+
});
299+
300+
const tool = createFetchTool({ firecrawl: { enabled: false } });
301+
const result = await executeFetch(tool, {
302+
url: "https://example.com/content-only-charset",
303+
extractMode: "text",
304+
});
305+
const details = result?.details as { text?: string; title?: string };
306+
const output = `${details.title ?? ""}\n${details.text ?? ""}`;
307+
308+
expect(output).toContain("日本語");
309+
});
310+
234311
it("caps response bytes and does not hang on endless streams", async () => {
235312
const chunk = new TextEncoder().encode("<html><body><div>hi</div></body></html>");
236313
const stream = new ReadableStream<Uint8Array>({

0 commit comments

Comments
 (0)