Skip to content

Commit 0b800d7

Browse files
committed
HTML API: Fix splitting single text node.
When `next_token()` was introduced, it brought a subtle bug. When encountering a `<` in the HTML stream which did not lead to a tag or comment or other token, it was treating the full text span to that point as one text node, and the following span as another text node. The entire span should be one text node. In this patch the Tag Processor properly detects this scenario and combines the spans into one text node. Follow-up to [57348]. Props jonsurrell. Fixes #60385. git-svn-id: https://develop.svn.wordpress.org/trunk@57489 602fd350-edb4-49c9-b593-d223f7449a82
1 parent a172e31 commit 0b800d7

2 files changed

Lines changed: 43 additions & 10 deletions

File tree

src/wp-includes/html-api/class-wp-html-tag-processor.php

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1512,16 +1512,6 @@ private function parse_next_tag() {
15121512
while ( false !== $at && $at < $doc_length ) {
15131513
$at = strpos( $html, '<', $at );
15141514

1515-
if ( $at > $was_at ) {
1516-
$this->parser_state = self::STATE_TEXT_NODE;
1517-
$this->token_starts_at = $was_at;
1518-
$this->token_length = $at - $was_at;
1519-
$this->text_starts_at = $was_at;
1520-
$this->text_length = $this->token_length;
1521-
$this->bytes_already_parsed = $at;
1522-
return true;
1523-
}
1524-
15251515
/*
15261516
* This does not imply an incomplete parse; it indicates that there
15271517
* can be nothing left in the document other than a #text node.
@@ -1536,6 +1526,37 @@ private function parse_next_tag() {
15361526
return true;
15371527
}
15381528

1529+
if ( $at > $was_at ) {
1530+
/*
1531+
* A "<" has been found in the document. That may be the start of another node, or
1532+
* it may be an "invalid-first-character-of-tag-name" error. If this is not the start
1533+
* of another node the "<" should be included in this text node and another
1534+
* termination point should be found for the text node.
1535+
*
1536+
* @see https://html.spec.whatwg.org/#tag-open-state
1537+
*/
1538+
if ( strlen( $html ) > $at + 1 ) {
1539+
$next_character = $html[ $at + 1 ];
1540+
$at_another_node =
1541+
'!' === $next_character ||
1542+
'/' === $next_character ||
1543+
'?' === $next_character ||
1544+
( 'A' <= $next_character && $next_character <= 'z' );
1545+
if ( ! $at_another_node ) {
1546+
++$at;
1547+
continue;
1548+
}
1549+
}
1550+
1551+
$this->parser_state = self::STATE_TEXT_NODE;
1552+
$this->token_starts_at = $was_at;
1553+
$this->token_length = $at - $was_at;
1554+
$this->text_starts_at = $was_at;
1555+
$this->text_length = $this->token_length;
1556+
$this->bytes_already_parsed = $at;
1557+
return true;
1558+
}
1559+
15391560
$this->token_starts_at = $at;
15401561

15411562
if ( $at + 1 < $doc_length && '/' === $this->html[ $at + 1 ] ) {

tests/phpunit/tests/html-api/wpHtmlTagProcessor.php

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2715,4 +2715,16 @@ public function test_handles_malformed_taglike_close_short_html() {
27152715
$result = $p->next_tag();
27162716
$this->assertFalse( $result, 'Did not handle "</ " html properly.' );
27172717
}
2718+
2719+
/**
2720+
* Ensures that non-tag syntax starting with `<` is consumed inside a text node.
2721+
*
2722+
* @ticket 60385
2723+
*/
2724+
public function test_single_text_node_with_taglike_text() {
2725+
$p = new WP_HTML_Tag_Processor( 'test< /A>' );
2726+
$p->next_token();
2727+
$this->assertSame( '#text', $p->get_token_type(), 'Did not find text node.' );
2728+
$this->assertSame( 'test< /A>', $p->get_modifiable_text(), 'Did not find complete text node.' );
2729+
}
27182730
}

0 commit comments

Comments
 (0)