Skip to content

Commit 654cf20

Browse files
committed
fix(media): strip control and bidi characters from outbound filenames
Outbound media filenames flow from two untrusted sources into channel adapters (Telegram, Discord, Matrix, WhatsApp): HTTP `Content-Disposition` headers parsed in `fetchRemoteMedia`, and URL pathnames extracted in `loadWebMediaInternal`. Both paths previously called `path.basename(...)` with no further sanitization, leaving C0/C1 control bytes, zero-width joiners, and bidi formatting characters in the filename that downstream channels forwarded verbatim. This adds a `stripFilenameControlChars` helper in `src/media/outbound-filename.ts` that removes the union of those invisible ranges (U+0000-U+001F, U+007F-U+009F, U+200B-U+200D, U+202A-U+202E, U+2066-U+2069, U+FEFF) and applies it at the two root extraction sites. All other Unicode is preserved, so legitimate Japanese, Cyrillic, Arabic, etc. filenames pass through unchanged. Defense-in-depth: the operating systems on the receiving end already enforce the real extension via content sniffing, so this only closes the visual-display gap. The new file is named to avoid colliding with the three existing `sanitize{File,Filename}` helpers in `store.ts`, `file-context.ts`, and `server.ts`, each of which has a different purpose (cross-platform path safety, LLM prompt attribute escaping, HTTP header attachment escaping).
1 parent f20a295 commit 654cf20

5 files changed

Lines changed: 112 additions & 4 deletions

File tree

src/media/fetch.test.ts

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,4 +320,28 @@ describe("fetchRemoteMedia", () => {
320320
}),
321321
);
322322
});
323+
324+
it("strips invisible / bidi characters from Content-Disposition filenames", async () => {
  // Header values are restricted to ByteString, so the raw code points cannot
  // be placed in the header directly. The RFC 5987 `filename*` form carries
  // them instead as UTF-8 percent-escapes: %E2%80%AE is the bidi RLO (U+202E)
  // and %E2%80%8B is the zero-width space (U+200B).
  const spoofedDisposition = "attachment; filename*=UTF-8''report%E2%80%AEgpj%E2%80%8B.exe";

  const fetchImpl = vi.fn(async () => {
    const body = makeStream([new Uint8Array([1, 2, 3])]);
    return new Response(body, {
      status: 200,
      headers: {
        "content-disposition": spoofedDisposition,
      },
    });
  });

  const { fileName } = await fetchRemoteMedia({
    url: "https://example.com/file.bin",
    fetchImpl,
    lookupFn: makeLookupFn(),
    maxBytes: 1024,
  });

  // Both invisible characters are gone; the remaining visible bytes are kept.
  expect(fileName).toBe("reportgpj.exe");
});
323347
});

src/media/fetch.ts

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import {
88
import type { LookupFn, PinnedDispatcherPolicy, SsrFPolicy } from "../infra/net/ssrf.js";
99
import { redactSensitiveText } from "../logging/redact.js";
1010
import { detectMime, extensionForMime } from "./mime.js";
11+
import { stripFilenameControlChars } from "./outbound-filename.js";
1112
import { readResponseTextSnippet, readResponseWithLimit } from "./read-response-with-limit.js";
1213

1314
type FetchMediaResult = {
@@ -69,14 +70,14 @@ function parseContentDispositionFileName(header?: string | null): string | undef
6970
const cleaned = stripQuotes(starMatch[1].trim());
7071
const encoded = cleaned.split("''").slice(1).join("''") || cleaned;
7172
try {
72-
return path.basename(decodeURIComponent(encoded));
73+
return stripFilenameControlChars(path.basename(decodeURIComponent(encoded)));
7374
} catch {
74-
return path.basename(encoded);
75+
return stripFilenameControlChars(path.basename(encoded));
7576
}
7677
}
7778
const match = /filename\s*=\s*([^;]+)/i.exec(header);
7879
if (match?.[1]) {
79-
return path.basename(stripQuotes(match[1].trim()));
80+
return stripFilenameControlChars(path.basename(stripQuotes(match[1].trim())));
8081
}
8182
return undefined;
8283
}
src/media/outbound-filename.test.ts

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import { describe, expect, it } from "vitest";
2+
import { stripFilenameControlChars } from "./outbound-filename.js";
3+
4+
/** Builds a single-code-unit string from a UTF-16 code unit value. */
function charOf(codeUnit: number): string {
  return String.fromCharCode(codeUnit);
}

// Every code unit the helper is expected to remove, labelled for the
// parameterized test titles below.
const STRIPPED_CODE_UNITS = [
  { name: "C0 control NUL", code: 0x0000 },
  { name: "C0 control TAB", code: 0x0009 },
  { name: "C0 control LF", code: 0x000a },
  { name: "C0 control CR", code: 0x000d },
  { name: "C0 control US", code: 0x001f },
  { name: "DEL", code: 0x007f },
  { name: "C1 control PAD", code: 0x0080 },
  { name: "C1 control APC", code: 0x009f },
  { name: "ZWSP", code: 0x200b },
  { name: "ZWNJ", code: 0x200c },
  { name: "ZWJ", code: 0x200d },
  { name: "LRE", code: 0x202a },
  { name: "RLE", code: 0x202b },
  { name: "PDF", code: 0x202c },
  { name: "LRO", code: 0x202d },
  { name: "RLO", code: 0x202e },
  { name: "LRI", code: 0x2066 },
  { name: "RLI", code: 0x2067 },
  { name: "FSI", code: 0x2068 },
  { name: "PDI", code: 0x2069 },
  { name: "BOM / ZWNBSP", code: 0xfeff },
] as const;

describe("stripFilenameControlChars", () => {
  it("returns plain ASCII filenames unchanged", () => {
    for (const plainName of ["report.pdf", "a.b-c_d.tar.gz"]) {
      expect(stripFilenameControlChars(plainName)).toBe(plainName);
    }
  });

  it("preserves non-ASCII letters and digits", () => {
    for (const localizedName of ["日本語_2025.pdf", "отчет.docx", "ملف.pdf"]) {
      expect(stripFilenameControlChars(localizedName)).toBe(localizedName);
    }
  });

  it.each(STRIPPED_CODE_UNITS)("strips $name", ({ code }) => {
    const withInvisible = ["pre", charOf(code), "post.txt"].join("");
    expect(stripFilenameControlChars(withInvisible)).toBe("prepost.txt");
  });

  it("collapses bidi-spoofed extensions to their visible byte order", () => {
    // On bidi-aware clients "report" + RLO + "gpj.exe" renders as
    // "reportexe.jpg", even though the underlying bytes end in .exe. Once the
    // RLO is removed, what is displayed matches what is stored.
    const spoofed = "report" + charOf(0x202e) + "gpj.exe";
    expect(stripFilenameControlChars(spoofed)).toBe("reportgpj.exe");
  });

  it("returns an empty string when every character is stripped", () => {
    const onlyInvisible = [0x0000, 0x202e, 0xfeff, 0x200b].map((code) => charOf(code)).join("");
    expect(stripFilenameControlChars(onlyInvisible)).toBe("");
  });

  it("returns an empty string for empty input", () => {
    expect(stripFilenameControlChars("")).toBe("");
  });

  it("leaves printable Unicode outside the strip ranges intact", () => {
    // U+200E (LRM), U+200F (RLM), and U+202F (NARROW NO-BREAK SPACE) border
    // the stripped ranges without belonging to them. Keeping them pins the
    // helper's scope so it does not quietly grow into general filename
    // normalization.
    const neighbours = ["a", charOf(0x200e), "b", charOf(0x200f), "c", charOf(0x202f), "d"].join(
      "",
    );
    expect(stripFilenameControlChars(neighbours)).toBe(neighbours);
  });
});

src/media/outbound-filename.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// Strips invisible / formatting characters from filenames carried over
// untrusted boundaries (HTTP Content-Disposition, URL pathname). The pattern
// is built via the RegExp constructor so the source file stays plain ASCII.
// - C0/C1 control:   U+0000-U+001F, U+007F-U+009F
// - Zero-width:      U+200B-U+200D, U+FEFF
// - Line separators: U+2028-U+2029 (LINE SEPARATOR, PARAGRAPH SEPARATOR)
// - Bidi format:     U+202A-U+202E, U+2066-U+2069
// U+2028/U+2029 are invisible line breaks, the same class of character as the
// CR/LF/NEL controls above, and are contiguous with the bidi embedding range,
// so the two are expressed as the single range U+2028-U+202E.
const FILENAME_INVISIBLE_CONTROL_PATTERN =
  "[\\u0000-\\u001F\\u007F-\\u009F\\u200B-\\u200D\\u2028-\\u202E\\u2066-\\u2069\\uFEFF]";
const FILENAME_INVISIBLE_CONTROL_RE = new RegExp(FILENAME_INVISIBLE_CONTROL_PATTERN, "g");

/**
 * Removes invisible control, zero-width, line-separator, and bidi formatting
 * characters from a filename.
 *
 * Matching is per UTF-16 code unit; every stripped range lies in the BMP, so
 * surrogate pairs (e.g. emoji) are never split. All other characters,
 * including U+200E (LRM), U+200F (RLM), and U+202F (NARROW NO-BREAK SPACE),
 * are preserved.
 *
 * @param value - Filename to clean.
 * @returns The filename with the listed characters removed. This is the empty
 *   string when `value` is empty or consists solely of stripped characters.
 */
export function stripFilenameControlChars(value: string): string {
  // String.prototype.replace with a global regex always scans from index 0,
  // so sharing one module-level RegExp instance across calls is safe.
  return value.replace(FILENAME_INVISIBLE_CONTROL_RE, "");
}

src/media/web-media.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import {
2828
mimeTypeFromFilePath,
2929
normalizeMimeType,
3030
} from "./mime.js";
31+
import { stripFilenameControlChars } from "./outbound-filename.js";
3132

3233
export { getDefaultLocalRoots, LocalMediaAccessError };
3334
export type { LocalMediaAccessErrorCode };
@@ -552,7 +553,7 @@ async function loadWebMediaInternal(
552553
buffer: data,
553554
});
554555
}
555-
let fileName = path.basename(mediaUrl) || undefined;
556+
let fileName = stripFilenameControlChars(path.basename(mediaUrl)) || undefined;
556557
if (fileName && !path.extname(fileName) && mime) {
557558
const ext = extensionForMime(mime);
558559
if (ext) {

0 commit comments

Comments
 (0)