@@ -36,12 +36,10 @@ const MEDIA_URL_CAP = 25;
// Hard timeout for the gateway-side `tar -xzf` unpack process.
const TAR_UNPACK_TIMEOUT_MS = 60_000;

// Cap on number of entries pre-validated. The compressed tar is already
// capped at DIR_FETCH_HARD_MAX_BYTES upstream, and we walk the unpacked
// tree to compute hashes — TAR_UNPACK_MAX_ENTRIES bounds how much work
// that walk can do.
const TAR_UNPACK_MAX_ENTRIES = 5000;
4745const DirFetchToolSchema = Type . Object ( {
@@ -90,26 +88,28 @@ async function computeFileSha256(filePath: string): Promise<string> {
9088}
9189
9290/**
93- * Run `tar -tzvf -` against the buffer to enumerate entries with their
94- * type letter (regular file / symlink / hardlink / dir) and size BEFORE
95- * we extract anything. Rejects:
96- * - any entry whose path is absolute (escapes destDir on -P-using tar)
97- * - any entry with ".." segments after normalization
98- * - any entry that is a symlink ("l"), hardlink ("h"), or unknown type
99- * - cumulative uncompressed size > TAR_UNPACK_MAX_UNCOMPRESSED_BYTES
100- * - entry count > TAR_UNPACK_MAX_ENTRIES
91+ * Run two passes against the buffer to enumerate entries BEFORE we extract:
10192 *
102- * BSD tar's -tvzf produces a `ls -l`-style line; we parse the leading
103- * type char and the size column.
93+ * 1. `tar -tf -` produces names ONLY, one per line. This is whitespace-safe
94+ * because each line is exactly one path; no parsing of fixed columns.
95+ * Used to validate paths (reject absolute, '..' traversal).
96+ * 2. `tar -tvf -` adds type info via the `ls -l`-style perm prefix.
97+ * Used ONLY to detect symlinks / hardlinks / non-regular entries via
98+ * the FIRST CHARACTER of each line, never the path column.
99+ *
100+ * Size limits are enforced at the *extraction* step instead — the tar
101+ * unpack process is bounded by the maxBytes we already pass through, and
102+ * the post-extract walkDir is hard-capped by TAR_UNPACK_MAX_ENTRIES.
103+ * Trying to parse uncompressed sizes from `tar -tvf` output is fragile
104+ * (filenames with whitespace shift the columns) and Aisle flagged that
105+ * shape as a bypass primitive — drop it.
104106 */
105- async function preValidateTarball (
107+ async function listTarPaths (
106108 tarBuffer : Buffer ,
107- ) : Promise < { ok : true } | { ok : false ; reason : string } > {
109+ ) : Promise < { ok : true ; paths : string [ ] } | { ok : false ; reason : string } > {
108110 return new Promise ( ( resolve ) => {
109111 const tarBin = process . platform !== "win32" ? "/usr/bin/tar" : "tar" ;
110- const child = spawn ( tarBin , [ "-tzvf" , "-" ] , {
111- stdio : [ "pipe" , "pipe" , "pipe" ] ,
112- } ) ;
112+ const child = spawn ( tarBin , [ "-tzf" , "-" ] , { stdio : [ "pipe" , "pipe" , "pipe" ] } ) ;
113113 let stdout = "" ;
114114 let stderr = "" ;
115115 let aborted = false ;
@@ -120,10 +120,20 @@ async function preValidateTarball(
120120 } catch {
121121 /* gone */
122122 }
123- resolve ( { ok : false , reason : "tar -tvzf timed out" } ) ;
123+ resolve ( { ok : false , reason : "tar -tzf timed out" } ) ;
124124 } , 30_000 ) ;
125125 child . stdout . on ( "data" , ( c : Buffer ) => {
126126 stdout += c . toString ( ) ;
127+ if ( stdout . length > 32 * 1024 * 1024 ) {
128+ aborted = true ;
129+ try {
130+ child . kill ( "SIGKILL" ) ;
131+ } catch {
132+ /* gone */
133+ }
134+ clearTimeout ( watchdog ) ;
135+ resolve ( { ok : false , reason : "tar -tzf output too large" } ) ;
136+ }
127137 } ) ;
128138 child . stderr . on ( "data" , ( c : Buffer ) => {
129139 stderr += c . toString ( ) ;
@@ -134,80 +144,142 @@ async function preValidateTarball(
134144 return ;
135145 }
136146 if ( code !== 0 ) {
137- resolve ( { ok : false , reason : `tar -tvzf exited ${ code } : ${ stderr . slice ( 0 , 200 ) } ` } ) ;
147+ resolve ( { ok : false , reason : `tar -tzf exited ${ code } : ${ stderr . slice ( 0 , 200 ) } ` } ) ;
138148 return ;
139149 }
140- const lines = stdout . split ( "\n" ) . filter ( ( l ) => l . trim ( ) . length > 0 ) ;
141- if ( lines . length > TAR_UNPACK_MAX_ENTRIES ) {
142- resolve ( {
143- ok : false ,
144- reason : `archive contains ${ lines . length } entries; limit ${ TAR_UNPACK_MAX_ENTRIES } ` ,
145- } ) ;
146- return ;
150+ // tar -tf emits one path per line with literal newlines as record
151+ // separators. Filenames containing newlines are exotic enough that
152+ // refusing them is safer than trying to parse around them.
153+ const paths = stdout . split ( "\n" ) . filter ( ( l ) => l . length > 0 ) ;
154+ resolve ( { ok : true , paths } ) ;
155+ } ) ;
156+ child . on ( "error" , ( e ) => {
157+ clearTimeout ( watchdog ) ;
158+ if ( ! aborted ) {
159+ resolve ( { ok : false , reason : `tar -tzf error: ${ String ( e ) } ` } ) ;
147160 }
148- let totalBytes = 0 ;
149- for ( const line of lines ) {
150- // BSD tar -tvzf format:
151- // "drwxr-xr-x 0 user staff 0 Mar 14 ... ./"
152- // "-rw-r--r-- 0 user staff 12 Mar 14 ... ./hello.txt"
153- // "lrwxr-xr-x 0 user staff 0 Mar 14 ... link -> target"
154- // First char of the perm field is the type.
155- const typeChar = line . charAt ( 0 ) ;
156- if ( typeChar === "l" || typeChar === "h" ) {
157- resolve ( { ok : false , reason : `archive contains link entry: ${ line . slice ( 0 , 120 ) } ` } ) ;
158- return ;
159- }
160- if ( typeChar !== "-" && typeChar !== "d" ) {
161- resolve ( { ok : false , reason : `archive contains non-regular entry type '${ typeChar } '` } ) ;
162- return ;
163- }
164- // Path is everything after the date/time field; cheapest is to
165- // grab the last whitespace-delimited token. For " name -> target"
166- // (symlink) we already rejected above.
167- const tokens = line . trim ( ) . split ( / \s + / u) ;
168- if ( tokens . length < 2 ) {
169- continue ;
170- }
171- const entryPath = tokens . at ( - 1 ) ?? "" ;
172- if ( path . isAbsolute ( entryPath ) ) {
173- resolve ( { ok : false , reason : `archive contains absolute path: ${ entryPath } ` } ) ;
174- return ;
175- }
176- const norm = path . posix . normalize ( entryPath ) ;
177- if ( norm === ".." || norm . startsWith ( "../" ) || norm . includes ( "/../" ) ) {
178- resolve ( { ok : false , reason : `archive contains '..' traversal: ${ entryPath } ` } ) ;
179- return ;
180- }
181- // Size column: tar -tvzf output has size in bytes at a stable
182- // position. tokens[2] is owner, tokens[3] is group, tokens[4] is
183- // size on BSD; on GNU it's tokens[2] (perm/owner combined).
184- // Be permissive — find the first all-digit token as size.
185- for ( const t of tokens . slice ( 1 ) ) {
186- if ( / ^ \d + $ / u. test ( t ) ) {
187- totalBytes += Number . parseInt ( t , 10 ) ;
188- break ;
189- }
161+ } ) ;
162+ child . stdin . end ( tarBuffer ) ;
163+ } ) ;
164+ }
165+
166+ async function listTarTypeChars (
167+ tarBuffer : Buffer ,
168+ ) : Promise < { ok : true ; typeChars : string [ ] } | { ok : false ; reason : string } > {
169+ return new Promise ( ( resolve ) => {
170+ const tarBin = process . platform !== "win32" ? "/usr/bin/tar" : "tar" ;
171+ const child = spawn ( tarBin , [ "-tzvf" , "-" ] , { stdio : [ "pipe" , "pipe" , "pipe" ] } ) ;
172+ let stdout = "" ;
173+ let stderr = "" ;
174+ let aborted = false ;
175+ const watchdog = setTimeout ( ( ) => {
176+ aborted = true ;
177+ try {
178+ child . kill ( "SIGKILL" ) ;
179+ } catch {
180+ /* gone */
181+ }
182+ resolve ( { ok : false , reason : "tar -tzvf timed out" } ) ;
183+ } , 30_000 ) ;
184+ child . stdout . on ( "data" , ( c : Buffer ) => {
185+ stdout += c . toString ( ) ;
186+ if ( stdout . length > 32 * 1024 * 1024 ) {
187+ aborted = true ;
188+ try {
189+ child . kill ( "SIGKILL" ) ;
190+ } catch {
191+ /* gone */
190192 }
193+ clearTimeout ( watchdog ) ;
194+ resolve ( { ok : false , reason : "tar -tzvf output too large" } ) ;
191195 }
192- if ( totalBytes > TAR_UNPACK_MAX_UNCOMPRESSED_BYTES ) {
193- resolve ( {
194- ok : false ,
195- reason : `uncompressed size ${ totalBytes } exceeds ${ TAR_UNPACK_MAX_UNCOMPRESSED_BYTES } ` ,
196- } ) ;
196+ } ) ;
197+ child . stderr . on ( "data" , ( c : Buffer ) => {
198+ stderr += c . toString ( ) ;
199+ } ) ;
200+ child . on ( "close" , ( code ) => {
201+ clearTimeout ( watchdog ) ;
202+ if ( aborted ) {
203+ return ;
204+ }
205+ if ( code !== 0 ) {
206+ resolve ( { ok : false , reason : `tar -tzvf exited ${ code } : ${ stderr . slice ( 0 , 200 ) } ` } ) ;
197207 return ;
198208 }
199- resolve ( { ok : true } ) ;
209+ // Take only the first character of each line — the entry type.
210+ // We don't touch the rest of the line (path/size/etc) so filenames
211+ // with whitespace can't shift our parser.
212+ const typeChars = stdout
213+ . split ( "\n" )
214+ . filter ( ( l ) => l . length > 0 )
215+ . map ( ( l ) => l . charAt ( 0 ) ) ;
216+ resolve ( { ok : true , typeChars } ) ;
200217 } ) ;
201218 child . on ( "error" , ( e ) => {
202219 clearTimeout ( watchdog ) ;
203220 if ( ! aborted ) {
204- resolve ( { ok : false , reason : `tar -tvzf error: ${ String ( e ) } ` } ) ;
221+ resolve ( { ok : false , reason : `tar -tzvf error: ${ String ( e ) } ` } ) ;
205222 }
206223 } ) ;
207224 child . stdin . end ( tarBuffer ) ;
208225 } ) ;
209226}
210227
228+ async function preValidateTarball (
229+ tarBuffer : Buffer ,
230+ ) : Promise < { ok : true } | { ok : false ; reason : string } > {
231+ const namesResult = await listTarPaths ( tarBuffer ) ;
232+ if ( ! namesResult . ok ) {
233+ return namesResult ;
234+ }
235+ const paths = namesResult . paths ;
236+ if ( paths . length > TAR_UNPACK_MAX_ENTRIES ) {
237+ return {
238+ ok : false ,
239+ reason : `archive contains ${ paths . length } entries; limit ${ TAR_UNPACK_MAX_ENTRIES } ` ,
240+ } ;
241+ }
242+
243+ const typesResult = await listTarTypeChars ( tarBuffer ) ;
244+ if ( ! typesResult . ok ) {
245+ return typesResult ;
246+ }
247+ const typeChars = typesResult . typeChars ;
248+ // The two passes should report the same number of entries; if they
249+ // don't, something exotic is going on (filenames with newlines, etc.)
250+ // and we refuse defensively.
251+ if ( typeChars . length !== paths . length ) {
252+ return {
253+ ok : false ,
254+ reason : `tar -tzf and tar -tzvf disagree on entry count (${ paths . length } vs ${ typeChars . length } ); refusing` ,
255+ } ;
256+ }
257+
258+ for ( let i = 0 ; i < paths . length ; i ++ ) {
259+ const entryPath = paths [ i ] ;
260+ const t = typeChars [ i ] ;
261+ if ( t === "l" || t === "h" ) {
262+ return { ok : false , reason : `archive contains link entry: ${ entryPath } ` } ;
263+ }
264+ if ( t !== "-" && t !== "d" ) {
265+ return { ok : false , reason : `archive contains non-regular entry type '${ t } ': ${ entryPath } ` } ;
266+ }
267+ if ( path . isAbsolute ( entryPath ) ) {
268+ return { ok : false , reason : `archive contains absolute path: ${ entryPath } ` } ;
269+ }
270+ const norm = path . posix . normalize ( entryPath ) ;
271+ if ( norm === ".." || norm . startsWith ( "../" ) || norm . includes ( "/../" ) ) {
272+ return { ok : false , reason : `archive contains '..' traversal: ${ entryPath } ` } ;
273+ }
274+ // Reject backslash-containing names too — refuses Windows-style
275+ // traversal in archives produced by an attacker on a Windows node.
276+ if ( entryPath . includes ( "\\" ) ) {
277+ return { ok : false , reason : `archive contains backslash in path: ${ entryPath } ` } ;
278+ }
279+ }
280+ return { ok : true } ;
281+ }
282+
211283type UnpackedFileEntry = {
212284 relPath : string ;
213285 size : number ;
0 commit comments