Skip to content

Commit f2daa22

Browse files
authored
fix(tokenizer): align edge-case parsing behavior with HTML spec (#2382)
1 parent 24ca601 commit f2daa22

6 files changed

Lines changed: 611 additions & 18 deletions

File tree

src/Parser.events.spec.ts

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -164,6 +164,18 @@ describe("Events", () => {
164164

165165
it("Scripts ending with <", () => runTest("<script><</script>"));
166166

167+
it("Special end tags ending with /> in script", () =>
168+
runTest("<script>safe</script/><img>"));
169+
170+
it("Special end tags ending with /> in style", () =>
171+
runTest("<style>safe</style/><img>"));
172+
173+
it("Special end tags ending with /> in title", () =>
174+
runTest("<title>safe</title/><img>"));
175+
176+
it("Special end tags ending with /> in textarea", () =>
177+
runTest("<textarea>safe</textarea/><img>"));
178+
167179
it("CDATA more edge-cases", () =>
168180
runTest("<![CDATA[foo]bar]>baz]]>", { recognizeCDATA: true }));
169181

src/Tokenizer.spec.ts

Lines changed: 63 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,10 @@ import { describe, expect, it } from "vitest";
22
import { Tokenizer } from "./index.js";
33
import type { Callbacks } from "./Tokenizer.js";
44

5-
function tokenize(data: string, options = {}) {
5+
function tokenize(
6+
data: string | ((tokenizer: Tokenizer, log: unknown[][]) => void),
7+
options = {},
8+
) {
69
const log: unknown[][] = [];
710
const tokenizer = new Tokenizer(
811
options,
@@ -17,8 +20,12 @@ function tokenize(data: string, options = {}) {
1720
) as Callbacks,
1821
);
1922

20-
tokenizer.write(data);
21-
tokenizer.end();
23+
if (typeof data === "function") {
24+
data(tokenizer, log);
25+
} else {
26+
tokenizer.write(data);
27+
tokenizer.end();
28+
}
2229

2330
return log;
2431
}
@@ -82,6 +89,23 @@ describe("Tokenizer", () => {
8289
});
8390
});
8491

92+
describe("should close special tags on end tags ending with />", () => {
93+
it("for script tag", () => {
94+
expect(tokenize("<script>safe</script/><img>")).toMatchSnapshot();
95+
});
96+
it("for style tag", () => {
97+
expect(tokenize("<style>safe</style/><img>")).toMatchSnapshot();
98+
});
99+
it("for title tag", () => {
100+
expect(tokenize("<title>safe</title/><img>")).toMatchSnapshot();
101+
});
102+
it("for textarea tag", () => {
103+
expect(
104+
tokenize("<textarea>safe</textarea/><img>"),
105+
).toMatchSnapshot();
106+
});
107+
});
108+
85109
describe("should correctly mark attributes", () => {
86110
it("for no value attribute", () => {
87111
expect(tokenize("<div aaaaaaa >")).toMatchSnapshot();
@@ -128,6 +152,42 @@ describe("Tokenizer", () => {
128152
expect(tokenize("&NotGreaterFullEqual;")).toMatchSnapshot());
129153
});
130154

155+
it("should close comments on --!>", () => {
156+
expect(
157+
tokenize("<!-- --!><img src=x onerror=alert(1)>-->"),
158+
).toMatchSnapshot();
159+
});
160+
161+
it.each([
162+
"script",
163+
"style",
164+
"title",
165+
"textarea",
166+
])("should reset after an unclosed %s tag", (tag) => {
167+
expect(
168+
tokenize((tokenizer, events) => {
169+
tokenizer.write(`<${tag}>body{color:red}`);
170+
tokenizer.end();
171+
events.length = 0;
172+
tokenizer.reset();
173+
tokenizer.write("<div>hello</div>");
174+
tokenizer.end();
175+
}).map(([event]) => event),
176+
).toEqual([
177+
"onopentagname",
178+
"onopentagend",
179+
"ontext",
180+
"onclosetag",
181+
"onend",
182+
]);
183+
});
184+
185+
it("should terminate XML processing instructions on ?>", () => {
186+
expect(
187+
tokenize("<?target data > injected ?>", { xmlMode: true }),
188+
).toMatchSnapshot();
189+
});
190+
131191
it("should not lose data when pausing", () => {
132192
const log: unknown[][] = [];
133193
const tokenizer = new Tokenizer(

src/Tokenizer.ts

Lines changed: 44 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -138,7 +138,7 @@ const Sequences = {
138138
Empty: new Uint8Array(0),
139139
Cdata: new Uint8Array([0x43, 0x44, 0x41, 0x54, 0x41, 0x5b]), // CDATA[
140140
CdataEnd: new Uint8Array([0x5d, 0x5d, 0x3e]), // ]]>
141-
CommentEnd: new Uint8Array([0x2d, 0x2d, 0x3e]), // `-->`
141+
CommentEnd: new Uint8Array([0x2d, 0x2d, 0x21, 0x3e]), // `--!>`
142142
ScriptEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74]), // `</script`
143143
StyleEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x74, 0x79, 0x6c, 0x65]), // `</style`
144144
TitleEnd: new Uint8Array([0x3c, 0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65]), // `</title`
@@ -196,7 +196,9 @@ export default class Tokenizer {
196196
this.sectionStart = 0;
197197
this.index = 0;
198198
this.baseState = State.Text;
199+
this.isSpecial = false;
199200
this.currentSequence = Sequences.Empty;
201+
this.sequenceIndex = 0;
200202
this.running = true;
201203
this.offset = 0;
202204
}
@@ -265,7 +267,7 @@ export default class Tokenizer {
265267
*/
266268
private stateInSpecialTag(c: number): void {
267269
if (this.sequenceIndex === this.currentSequence.length) {
268-
if (c === CharCodes.Gt || isWhitespace(c)) {
270+
if (isEndOfTagSection(c)) {
269271
const endOfText = this.index - this.currentSequence.length;
270272

271273
if (this.sectionStart < endOfText) {
@@ -352,12 +354,29 @@ export default class Tokenizer {
352354
* @param c Current character code point.
353355
*/
354356
private stateInCommentLike(c: number): void {
355-
if (c === this.currentSequence[this.sequenceIndex]) {
357+
if (
358+
this.currentSequence === Sequences.CommentEnd &&
359+
this.sequenceIndex === 2 &&
360+
c === CharCodes.Gt
361+
) {
362+
// `!` is optional here, so the same sequence also accepts `-->`.
363+
this.cbs.oncomment(this.sectionStart, this.index, 2);
364+
365+
this.sequenceIndex = 0;
366+
this.sectionStart = this.index + 1;
367+
this.state = State.Text;
368+
} else if (
369+
this.currentSequence === Sequences.CommentEnd &&
370+
this.sequenceIndex === this.currentSequence.length - 1 &&
371+
c !== CharCodes.Gt
372+
) {
373+
this.sequenceIndex = Number(c === CharCodes.Dash);
374+
} else if (c === this.currentSequence[this.sequenceIndex]) {
356375
if (++this.sequenceIndex === this.currentSequence.length) {
357376
if (this.currentSequence === Sequences.CdataEnd) {
358377
this.cbs.oncdata(this.sectionStart, this.index, 2);
359378
} else {
360-
this.cbs.oncomment(this.sectionStart, this.index, 2);
379+
this.cbs.oncomment(this.sectionStart, this.index, 3);
361380
}
362381

363382
this.sequenceIndex = 0;
@@ -399,6 +418,7 @@ export default class Tokenizer {
399418
this.sectionStart = this.index + 1;
400419
} else if (c === CharCodes.Questionmark) {
401420
this.state = State.InProcessingInstruction;
421+
this.sequenceIndex = 0;
402422
this.sectionStart = this.index + 1;
403423
} else if (this.isTagStartChar(c)) {
404424
const lower = c | 0x20;
@@ -443,7 +463,7 @@ export default class Tokenizer {
443463
}
444464
}
445465
private stateInClosingTagName(c: number): void {
446-
if (c === CharCodes.Gt || isWhitespace(c)) {
466+
if (isEndOfTagSection(c)) {
447467
this.cbs.onclosetag(this.sectionStart, this.index);
448468
this.sectionStart = -1;
449469
this.state = State.AfterClosingTagName;
@@ -574,7 +594,25 @@ export default class Tokenizer {
574594
}
575595
}
576596
private stateInProcessingInstruction(c: number): void {
577-
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
597+
if (this.xmlMode) {
598+
if (c === CharCodes.Questionmark) {
599+
// Remember that we just consumed `?`, so the next `>` closes the PI.
600+
this.sequenceIndex = 1;
601+
} else if (c === CharCodes.Gt && this.sequenceIndex === 1) {
602+
this.cbs.onprocessinginstruction(
603+
this.sectionStart,
604+
this.index - 1,
605+
);
606+
this.sequenceIndex = 0;
607+
this.state = State.Text;
608+
this.sectionStart = this.index + 1;
609+
} else {
610+
// Keep scanning for the next `?`, which can start a closing `?>`.
611+
this.sequenceIndex = Number(
612+
this.fastForwardTo(CharCodes.Questionmark),
613+
);
614+
}
615+
} else if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
578616
this.cbs.onprocessinginstruction(this.sectionStart, this.index);
579617
this.state = State.Text;
580618
this.sectionStart = this.index + 1;

0 commit comments

Comments
 (0)