Skip to content

Commit c756dfe

Browse files
committed
KSES: Preserve some additional invalid HTML comment syntaxes.
When `wp_kses_split` processes a document it attempts to leave HTML comments alone. It makes minor adjustments, but leaves the comments in the document in its output. Unfortunately it only recognizes one kind of HTML comment and rejects many others. This patch makes a minor adjustment to the algorithm in `wp_kses_split` to recognize and preserve an additional kind of HTML comment: closing tags with an invalid tag name, e.g. `</%dolly>`. These invalid closing tags must be interpreted as comments by a browser. This bug fix aligns the implementation of `wp_kses_split()` more closely with its stated goal of leaving HTML comments as comments. It doesn't attempt to fully fix the mis-parsed comments, but it does propose a minor fix that hopefully won't break any existing code or projects. Developed in #6395 Discussed in https://core.trac.wordpress.org/ticket/61009 Props ellatrix, dmsnell, joemcgill, jorbin, westonruter, zieladam. See #61009. git-svn-id: https://develop.svn.wordpress.org/trunk@58418 602fd350-edb4-49c9-b593-d223f7449a82
1 parent 738c03d commit c756dfe

2 files changed

Lines changed: 82 additions & 3 deletions

File tree

src/wp-includes/kses.php

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -963,6 +963,7 @@ function wp_kses_version() {
963963
* It also matches stray `>` characters.
964964
*
965965
* @since 1.0.0
966+
* @since 6.6.0 Recognize additional forms of invalid HTML which convert into comments.
966967
*
967968
* @global array[]|string $pass_allowed_html An array of allowed HTML elements and attributes,
968969
* or a context name such as 'post'.
@@ -981,7 +982,18 @@ function wp_kses_split( $content, $allowed_html, $allowed_protocols ) {
981982
$pass_allowed_html = $allowed_html;
982983
$pass_allowed_protocols = $allowed_protocols;
983984

984-
return preg_replace_callback( '%(<!--.*?(-->|$))|(<[^>]*(>|$)|>)%', '_wp_kses_split_callback', $content );
985+
$token_pattern = <<<REGEX
986+
~
987+
( # Detect comments of various flavors before attempting to find tags.
988+
(<!--.*?(-->|$)) # - Normative HTML comments.
989+
|
990+
</[^a-zA-Z][^>]*> # - Closing tags with invalid tag names.
991+
)
992+
|
993+
(<[^>]*(>|$)|>) # Tag-like spans of text.
994+
~x
995+
REGEX;
996+
return preg_replace_callback( $token_pattern, '_wp_kses_split_callback', $content );
985997
}
986998

987999
/**
@@ -1069,23 +1081,61 @@ function _wp_kses_split_callback( $matches ) {
10691081
* @access private
10701082
* @ignore
10711083
* @since 1.0.0
1084+
* @since 6.6.0 Recognize additional forms of invalid HTML which convert into comments.
10721085
*
10731086
* @param string $content Content to filter.
10741087
* @param array[]|string $allowed_html An array of allowed HTML elements and attributes,
10751088
* or a context name such as 'post'. See wp_kses_allowed_html()
10761089
* for the list of accepted context names.
10771090
* @param string[] $allowed_protocols Array of allowed URL protocols.
1091+
*
10781092
* @return string Fixed HTML element
10791093
*/
10801094
function wp_kses_split2( $content, $allowed_html, $allowed_protocols ) {
10811095
$content = wp_kses_stripslashes( $content );
10821096

1083-
// It matched a ">" character.
1097+
/*
1098+
* The regex pattern used to split HTML into chunks attempts
1099+
* to split on HTML token boundaries. This function should
1100+
* thus receive chunks that _either_ start with meaningful
1101+
* syntax tokens, like a tag `<div>` or a comment `<!-- ... -->`, _or_ consist of a lone `>` character.
1102+
*
1103+
* If the first character of the `$content` chunk _isn't_ one
1104+
* of these syntax elements, which always start with `<`, then
1105+
* the match had to be for the final alternation of `>`. In such
1106+
* case, it's probably standing on its own and could be encoded
1107+
* with a character reference to remove ambiguity.
1108+
*
1109+
* In other words, if this chunk isn't from a match of a syntax
1110+
* token, it's just a plaintext greater-than (`>`) sign.
1111+
*/
10841112
if ( ! str_starts_with( $content, '<' ) ) {
10851113
return '&gt;';
10861114
}
10871115

1088-
// Allow HTML comments.
1116+
/*
1117+
* When a closing tag appears with a name that isn't a valid tag name,
1118+
* it must be interpreted as an HTML comment. It extends until the
1119+
* first `>` character after the initial opening `</`.
1120+
*
1121+
* Preserve these comments and do not treat them like tags.
1122+
*/
1123+
if ( 1 === preg_match( '~^</[^a-zA-Z][^>]*>$~', $content ) ) {
1124+
$content = substr( $content, 2, -1 );
1125+
$transformed = null;
1126+
1127+
while ( $transformed !== $content ) {
1128+
$transformed = wp_kses( $content, $allowed_html, $allowed_protocols );
1129+
$content = $transformed;
1130+
}
1131+
1132+
return "</{$transformed}>";
1133+
}
1134+
1135+
/*
1136+
* Normative HTML comments should be handled separately as their
1137+
* parsing rules differ from those for tags and text nodes.
1138+
*/
10891139
if ( str_starts_with( $content, '<!--' ) ) {
10901140
$content = str_replace( array( '<!--', '-->' ), '', $content );
10911141

tests/phpunit/tests/kses.php

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1931,6 +1931,35 @@ public function filter_wp_kses_object_added_in_html_filter( $tags, $context ) {
19311931
return $tags;
19321932
}
19331933

1934+
/**
 * Ensures that `wp_kses()` preserves various kinds of HTML comments, both valid and invalid.
 *
 * The method name lacks the `test` prefix, so it is registered with PHPUnit
 * through the test annotation below; without it (and without the data provider
 * annotation) the assertions in this method would never be executed.
 *
 * @test
 * @ticket 61009
 *
 * @dataProvider data_html_containing_various_kinds_of_html_comments
 *
 * @param string $html_comment    HTML containing a comment: either a normative HTML comment
 *                                or invalid syntax which a browser interprets as a comment.
 * @param string $expected_output How `wp_kses()` ought to transform the comment.
 */
public function wp_kses_preserves_html_comments( $html_comment, $expected_output ) {
	// An empty allow-list is used on purpose: no tags are permitted, so anything
	// that survives must have been recognized as a comment rather than as a tag.
	$this->assertSame(
		$expected_output,
		wp_kses( $html_comment, array() ),
		'Failed to properly preserve HTML comment.'
	);
}
1950+
1951+
/**
 * Data provider.
 *
 * Each dataset pairs an input HTML string with the output expected for it,
 * keyed by a human-readable dataset name.
 *
 * @return array[] Dataset name => array( input HTML, expected output ).
 */
public static function data_html_containing_various_kinds_of_html_comments() {
	$normative_comment   = 'before<!-- this is a comment -->after';
	$invalid_closing_tag = 'before<//not a tag>after';

	// In both datasets the expected output is identical to the input.
	return array(
		'Normative HTML comment'            => array( $normative_comment, $normative_comment ),
		'Closing tag with invalid tag name' => array( $invalid_closing_tag, $invalid_closing_tag ),
	);
}
1962+
19341963
/**
19351964
* Test that attributes with a list of allowed values are filtered correctly.
19361965
*

0 commit comments

Comments
 (0)