Skip to content

Commit ef0b40d

Browse files
jmarshall authored and valeriuo committed
Partially recognise bzip2-compressed data
We currently don't do anything with such data, and unfortunately we can't peek at the contents. But we can print "Unknown bzip2-compressed data" rather than "Unknown data" for e.g. bzip2-ed FASTQ files, which have been seen in the wild. Also, for gzip compression, set compression_level according to the XFL (extra flags) header field when CM (compression method) is DEFLATE; BGZF does not set XFL, but plain gzip does.
1 parent 46843e5 commit ef0b40d

2 files changed

Lines changed: 17 additions & 1 deletion

File tree

hts.c

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -216,8 +216,23 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt)
216216
// Determine which, and decompress the first few bytes.
217217
fmt->compression = (len >= 18 && (s[3] & 4) &&
218218
memcmp(&s[12], "BC\2\0", 4) == 0)? bgzf : gzip;
219+
if (len >= 9 && s[2] == 8)
220+
fmt->compression_level = (s[8] == 2)? 9 : (s[8] == 4)? 1 : -1;
221+
219222
len = decompress_peek(hfile, s, sizeof s);
220223
}
224+
else if (len >= 10 && memcmp(s, "BZh", 3) == 0 &&
225+
(memcmp(&s[4], "\x31\x41\x59\x26\x53\x59", 6) == 0 ||
226+
memcmp(&s[4], "\x17\x72\x45\x38\x50\x90", 6) == 0)) {
227+
fmt->compression = bzip2_compression;
228+
fmt->compression_level = s[3] - '0';
229+
// Decompressing via libbz2 produces no output until it has a whole
230+
// block (of size 100 kB x level), which is too large for peeking.
231+
// So unfortunately we can recognise bzip2 but not the contents,
232+
// except that \x1772... magic indicates the stream is empty.
233+
if (s[4] == '\x31') return 0;
234+
else len = 0;
235+
}
221236
else {
222237
len = hpeek(hfile, s, sizeof s);
223238
}
@@ -342,6 +357,7 @@ char *hts_format_description(const htsFormat *format)
342357
}
343358

344359
switch (format->compression) {
360+
case bzip2_compression: kputs(" bzip2-compressed", &str); break;
345361
case custom: kputs(" compressed", &str); break;
346362
case gzip: kputs(" gzip-compressed", &str); break;
347363
case bgzf:

htslib/hts.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -191,7 +191,7 @@ enum htsExactFormat {
191191
};
192192

193193
enum htsCompression {
194-
no_compression, gzip, bgzf, custom,
194+
no_compression, gzip, bgzf, custom, bzip2_compression,
195195
compression_maximum = 32767
196196
};
197197

0 commit comments

Comments
 (0)