From 2ce9263366600a123095099ff3d105565cb4a7b1 Mon Sep 17 00:00:00 2001 From: "Christoph M. Becker" Date: Thu, 22 Oct 2020 11:37:51 +0200 Subject: [PATCH 1/4] Fix #80268: loadHTML() truncates at NUL bytes libxml2 has no issues parsing HTML strings with NUL bytes; these are just ignored. Particularly, `::loadHTMLFile()` already supports NUL bytes, so `::loadHTML()` should as well. --- ext/dom/document.c | 1 - ext/dom/tests/bug80268.phpt | 13 +++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 ext/dom/tests/bug80268.phpt diff --git a/ext/dom/document.c b/ext/dom/document.c index 22bb90d5d88db..0e15e7a110652 100644 --- a/ext/dom/document.c +++ b/ext/dom/document.c @@ -2024,7 +2024,6 @@ static void dom_load_html(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{ */ } ctxt = htmlCreateFileParserCtxt(source, NULL); } else { - source_len = xmlStrlen((xmlChar *) source); if (ZEND_SIZE_T_INT_OVFL(source_len)) { php_error_docref(NULL, E_WARNING, "Input string is too long"); RETURN_FALSE; diff --git a/ext/dom/tests/bug80268.phpt b/ext/dom/tests/bug80268.phpt new file mode 100644 index 0000000000000..d1ee3760b31c7 --- /dev/null +++ b/ext/dom/tests/bug80268.phpt @@ -0,0 +1,13 @@ +--TEST-- +Bug #80268 (loadHTML() truncates at NUL bytes) +--SKIPIF-- + +--FILE-- +loadHTML("

foobar

"); +$html = $doc->saveHTML(); +var_dump(strpos($html, 'foobar') !== false); +?> +--EXPECT-- +bool(true) From b63f093d1e4ff8d155dd94c0d8e0961338067bba Mon Sep 17 00:00:00 2001 From: "Christoph M. Becker" Date: Thu, 22 Oct 2020 17:41:47 +0200 Subject: [PATCH 2/4] Correct analysis of behavior and adapt test case accordingly Actually, libxml does not replace NUL bytes with spaces, but rather truncates text content at NUL bytes, but generally continues parsing. Anyhow, `loadHTML()` and `loadHTMLFile()` should behave the same in this regard. --- ext/dom/tests/bug80268.phpt | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ext/dom/tests/bug80268.phpt b/ext/dom/tests/bug80268.phpt index d1ee3760b31c7..db34b4e6bc35b 100644 --- a/ext/dom/tests/bug80268.phpt +++ b/ext/dom/tests/bug80268.phpt @@ -5,9 +5,16 @@ Bug #80268 (loadHTML() truncates at NUL bytes) --FILE-- loadHTML("

foobar

"); +$doc->loadHTML("

foo\0bar

"); $html = $doc->saveHTML(); -var_dump(strpos($html, 'foobar') !== false); +var_dump(strpos($html, '

foo

') !== false); + +file_put_contents(__DIR__ . '/80268.html', "

foo\0bar

"); +$doc = new DOMDocument; +$doc->loadHTMLFile(__DIR__ . '/80268.html'); +$html = $doc->saveHTML(); +var_dump(strpos($html, '

foo

') !== false); ?> --EXPECT-- bool(true) +bool(true) From 3bbc2a3b154ea76b53ad4b81d4db28178589711f Mon Sep 17 00:00:00 2001 From: "Christoph M. Becker" Date: Thu, 22 Oct 2020 18:04:54 +0200 Subject: [PATCH 3/4] Clean up --- ext/dom/tests/bug80268.phpt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ext/dom/tests/bug80268.phpt b/ext/dom/tests/bug80268.phpt index db34b4e6bc35b..a48027caea3b4 100644 --- a/ext/dom/tests/bug80268.phpt +++ b/ext/dom/tests/bug80268.phpt @@ -15,6 +15,10 @@ $doc->loadHTMLFile(__DIR__ . '/80268.html'); $html = $doc->saveHTML(); var_dump(strpos($html, '

foo

') !== false); ?> +--CLEAN-- + --EXPECT-- bool(true) bool(true) From 86a9e2158c5795d5f50b3831ce14313f1f2a290a Mon Sep 17 00:00:00 2001 From: "Christoph M. Becker" Date: Fri, 23 Oct 2020 11:06:30 +0200 Subject: [PATCH 4/4] Don't use short tags --- ext/dom/tests/bug80268.phpt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/dom/tests/bug80268.phpt b/ext/dom/tests/bug80268.phpt index a48027caea3b4..0fe50b85e8611 100644 --- a/ext/dom/tests/bug80268.phpt +++ b/ext/dom/tests/bug80268.phpt @@ -16,7 +16,7 @@ $html = $doc->saveHTML(); var_dump(strpos($html, '

foo

') !== false); ?> --CLEAN-- - --EXPECT--