Skip to content

Commit d42b34a

Browse files
fix: handle invalid UTF-8 in Ruby and Vue preprocessors (#19588)
## Summary

This PR fixes a panic that occurs when the Ruby or Vue preprocessors encounter files with invalid UTF-8 bytes.

**The issue:**

- `ruby.rs:37` and `vue.rs:18` used `std::str::from_utf8(content).unwrap()`
- This panics when processing files containing invalid UTF-8 bytes

**Error message:**

```
thread panicked at crates/oxide/src/extractor/pre_processors/ruby.rs:37:59:
called `Result::unwrap()` on an `Err` value: Utf8Error { valid_up_to: 45, error_len: Some(1) }
```

**The fix:**

- Wrap UTF-8 conversion in `if let Ok(...)` to gracefully handle invalid UTF-8
- Skip regex-based template extraction when UTF-8 conversion fails
- Allow byte-level processing to continue (in Ruby's case)

This can happen in Rails projects when:

- Binary files are inadvertently scanned
- Files contain non-UTF-8 encodings
- Files are truncated at multi-byte character boundaries during parallel processing

## Test plan

- [x] Added `test_invalid_utf8_does_not_panic` test for Ruby preprocessor
- [x] Added `test_valid_utf8_with_multibyte_chars` test for Ruby preprocessor
- [x] Added `test_invalid_utf8_does_not_panic` test for Vue preprocessor
- [x] All existing tests pass (`cargo test pre_processors` - 43 tests)

---------

Co-authored-by: Robin Malfait <malfait.robin@gmail.com>
1 parent 0612ddc commit d42b34a

3 files changed

Lines changed: 81 additions & 43 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2525
- Ensure `@tailwindcss/cli` in `--watch` mode recovers when a tracked dependency is deleted and restored ([#20137](https://github.com/tailwindlabs/tailwindcss/pull/20137))
2626
- Ensure standalone `@tailwindcss/cli` binaries are ignored when scanning for class candidates ([#20139](https://github.com/tailwindlabs/tailwindcss/pull/20139))
2727
- Ensure class candidates are extracted from Twig `addClass(…)` and `removeClass(…)` calls ([#20198](https://github.com/tailwindlabs/tailwindcss/pull/20198))
28+
- Don't crash in the Ruby or Vue preprocessors when scanning files containing invalid UTF-8 bytes ([#19588](https://github.com/tailwindlabs/tailwindcss/pull/19588))
2829

2930
### Changed
3031

crates/oxide/src/extractor/pre_processors/ruby.rs

Lines changed: 58 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -34,45 +34,47 @@ impl PreProcessor for Ruby {
3434

3535
// Extract embedded template languages
3636
// https://viewcomponent.org/guide/templates.html#interpolations
37-
let content_as_str = std::str::from_utf8(content).unwrap();
38-
39-
let starts = TEMPLATE_START_REGEX
40-
.captures_iter(content_as_str)
41-
.collect::<Vec<_>>();
42-
let ends = TEMPLATE_END_REGEX
43-
.captures_iter(content_as_str)
44-
.collect::<Vec<_>>();
45-
46-
for start in starts.iter() {
47-
// The language for this block
48-
let lang = start.get(1).unwrap().as_str();
49-
50-
// The HEREDOC delimiter
51-
let delimiter_start = start.get(2).unwrap().as_str();
52-
53-
// Where the "body" starts for the HEREDOC block
54-
let body_start = start.get(0).unwrap().end();
55-
56-
// Look through all of the ends to find a matching language
57-
for end in ends.iter() {
58-
// 1. This must appear after the start
59-
let body_end = end.get(0).unwrap().start();
60-
if body_end < body_start {
61-
continue;
62-
}
37+
// Only process if content is valid UTF-8, otherwise skip HEREDOC extraction
38+
// but still perform the byte-level Ruby processing below
39+
if let Ok(content_as_str) = std::str::from_utf8(content) {
40+
let starts = TEMPLATE_START_REGEX
41+
.captures_iter(content_as_str)
42+
.collect::<Vec<_>>();
43+
let ends = TEMPLATE_END_REGEX
44+
.captures_iter(content_as_str)
45+
.collect::<Vec<_>>();
46+
47+
for start in starts.iter() {
48+
// The language for this block
49+
let lang = start.get(1).unwrap().as_str();
50+
51+
// The HEREDOC delimiter
52+
let delimiter_start = start.get(2).unwrap().as_str();
53+
54+
// Where the "body" starts for the HEREDOC block
55+
let body_start = start.get(0).unwrap().end();
56+
57+
// Look through all of the ends to find a matching language
58+
for end in ends.iter() {
59+
// 1. This must appear after the start
60+
let body_end = end.get(0).unwrap().start();
61+
if body_end < body_start {
62+
continue;
63+
}
6364

64-
// The languages must match otherwise we haven't found the end
65-
let delimiter_end = end.get(1).unwrap().as_str();
66-
if delimiter_end != delimiter_start {
67-
continue;
68-
}
65+
// The languages must match otherwise we haven't found the end
66+
let delimiter_end = end.get(1).unwrap().as_str();
67+
if delimiter_end != delimiter_start {
68+
continue;
69+
}
6970

70-
let body = &content_as_str[body_start..body_end];
71-
let replaced =
72-
pre_process_input(body.as_bytes().to_vec(), &lang.to_ascii_lowercase());
71+
let body = &content_as_str[body_start..body_end];
72+
let replaced =
73+
pre_process_input(body.as_bytes().to_vec(), &lang.to_ascii_lowercase());
7374

74-
result.replace_range(body_start..body_end, replaced);
75-
break;
75+
result.replace_range(body_start..body_end, replaced);
76+
break;
77+
}
7678
}
7779
}
7880

@@ -444,4 +446,24 @@ mod tests {
444446
vec!["text-amber-600", "text-sky-500", "text-green-500"],
445447
);
446448
}
449+
450+
#[test]
451+
fn test_invalid_utf8_does_not_panic() {
452+
// Invalid UTF-8 sequence: 0x80 is a continuation byte without a leading byte
453+
let invalid_utf8: &[u8] = &[0x80, 0x81, 0x82];
454+
455+
let processor = Ruby::default();
456+
457+
// Should not panic, just return the input unchanged
458+
let result = processor.process(invalid_utf8);
459+
assert_eq!(result, invalid_utf8);
460+
}
461+
462+
#[test]
463+
fn test_valid_utf8_with_multibyte_chars() {
464+
// Test that valid UTF-8 with multi-byte characters (like em-dashes) works
465+
let input = "# Comment with em—dash\n%w[flex px-2.5]";
466+
467+
Ruby::test_extract_contains(input, vec!["flex", "px-2.5"]);
468+
}
447469
}

crates/oxide/src/extractor/pre_processors/vue.rs

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,16 @@ pub struct Vue;
1414
impl PreProcessor for Vue {
1515
fn process(&self, content: &[u8]) -> Vec<u8> {
1616
let mut result = content.to_vec();
17-
let content_as_str = std::str::from_utf8(content).unwrap();
18-
for (_, [lang, body]) in TEMPLATE_REGEX
19-
.captures_iter(content_as_str)
20-
.map(|c| c.extract())
21-
{
22-
let replaced = pre_process_input(body.as_bytes().to_vec(), lang);
23-
result = result.replace(body, replaced);
17+
18+
// Only process template tags if content is valid UTF-8
19+
if let Ok(content_as_str) = std::str::from_utf8(content) {
20+
for (_, [lang, body]) in TEMPLATE_REGEX
21+
.captures_iter(content_as_str)
22+
.map(|c| c.extract())
23+
{
24+
let replaced = pre_process_input(body.as_bytes().to_vec(), lang);
25+
result = result.replace(body, replaced);
26+
}
2427
}
2528

2629
result
@@ -42,4 +45,16 @@ mod tests {
4245

4346
Vue::test_extract_contains(input, vec!["bg-neutral-900", "text-red-500"]);
4447
}
48+
49+
#[test]
50+
fn test_invalid_utf8_does_not_panic() {
51+
// Invalid UTF-8 sequence: 0x80 is a continuation byte without a leading byte
52+
let invalid_utf8: &[u8] = &[0x80, 0x81, 0x82];
53+
54+
let processor = Vue::default();
55+
56+
// Should not panic, just return the input unchanged
57+
let result = processor.process(invalid_utf8);
58+
assert_eq!(result, invalid_utf8);
59+
}
4560
}

0 commit comments

Comments
 (0)