Skip to content

Commit 99f6eb5

Browse files
committed
Merge branch 'lxml-4.6'
2 parents add0d3d + a3eacbc commit 99f6eb5

File tree

4 files changed

+167
-15
lines changed

4 files changed

+167
-15
lines changed

CHANGES.txt

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,19 @@ Other changes
3131
* Wheels include libxml2 2.9.12+ and libxslt 1.1.34 (also on Windows).
3232

3333

34+
4.6.5 (2021-12-12)
35+
==================
36+
37+
Bugs fixed
38+
----------
39+
40+
* A vulnerability (GHSL-2021-1038) in the HTML cleaner allowed sneaking script
41+
content through SVG images.
42+
43+
* A vulnerability (GHSL-2021-1037) in the HTML cleaner allowed sneaking script
44+
content through CSS imports and other crafted constructs.
45+
46+
3447
4.6.4 (2021-11-01)
3548
==================
3649

doc/main.txt

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -159,8 +159,8 @@ Index <http://pypi.python.org/pypi/lxml/>`_ (PyPI). It has the source
159159
that compiles on various platforms. The source distribution is signed
160160
with `this key <pubkey.asc>`_.
161161

162-
The latest version is `lxml 4.6.4`_, released 2021-11-01
163-
(`changes for 4.6.4`_). `Older versions <#old-versions>`_
162+
The latest version is `lxml 4.6.5`_, released 2021-12-12
163+
(`changes for 4.6.5`_). `Older versions <#old-versions>`_
164164
are listed below.
165165

166166
Please take a look at the
@@ -256,7 +256,9 @@ See the websites of lxml
256256
..
257257
and the `latest in-development version <http://lxml.de/dev/>`_.
258258

259-
.. _`PDF documentation`: lxmldoc-4.6.4.pdf
259+
.. _`PDF documentation`: lxmldoc-4.6.5.pdf
260+
261+
* `lxml 4.6.5`_, released 2021-12-12 (`changes for 4.6.5`_)
260262

261263
* `lxml 4.6.4`_, released 2021-11-01 (`changes for 4.6.4`_)
262264

@@ -284,6 +286,7 @@ See the websites of lxml
284286

285287
* `older releases <http://lxml.de/4.3/#old-versions>`_
286288

289+
.. _`lxml 4.6.5`: /files/lxml-4.6.5.tgz
287290
.. _`lxml 4.6.4`: /files/lxml-4.6.4.tgz
288291
.. _`lxml 4.6.3`: /files/lxml-4.6.3.tgz
289292
.. _`lxml 4.6.2`: /files/lxml-4.6.2.tgz
@@ -297,6 +300,7 @@ See the websites of lxml
297300
.. _`lxml 4.4.1`: /files/lxml-4.4.1.tgz
298301
.. _`lxml 4.4.0`: /files/lxml-4.4.0.tgz
299302

303+
.. _`changes for 4.6.5`: /changes-4.6.5.html
300304
.. _`changes for 4.6.4`: /changes-4.6.4.html
301305
.. _`changes for 4.6.3`: /changes-4.6.3.html
302306
.. _`changes for 4.6.2`: /changes-4.6.2.html

src/lxml/html/clean.py

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -75,18 +75,23 @@
7575

7676
# All kinds of schemes besides just javascript: that can cause
7777
# execution:
78-
_is_image_dataurl = re.compile(
79-
r'^data:image/.+;base64', re.I).search
80-
_is_possibly_malicious_scheme = re.compile(
81-
r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
82-
re.I).search
83-
def _is_javascript_scheme(s):
84-
if _is_image_dataurl(s):
85-
return None
86-
return _is_possibly_malicious_scheme(s)
78+
_find_image_dataurls = re.compile(
79+
r'data:image/(.+);base64,', re.I).findall
80+
_possibly_malicious_schemes = re.compile(
81+
r'(javascript|jscript|livescript|vbscript|data|about|mocha):',
82+
re.I).findall
83+
# SVG images can contain script content
84+
_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).search
85+
86+
def _has_javascript_scheme(s):
87+
safe_image_urls = 0
88+
for image_type in _find_image_dataurls(s):
89+
if _is_unsafe_image_type(image_type):
90+
return True
91+
safe_image_urls += 1
92+
return len(_possibly_malicious_schemes(s)) > safe_image_urls
8793

8894
_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
89-
# FIXME: should data: be blocked?
9095

9196
# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
9297
_conditional_comment_re = re.compile(
@@ -515,7 +520,7 @@ def _kill_elements(self, doc, condition, iterate=None):
515520
def _remove_javascript_link(self, link):
516521
# links like "j a v a s c r i p t:" might be interpreted in IE
517522
new = _substitute_whitespace('', unquote_plus(link))
518-
if _is_javascript_scheme(new):
523+
if _has_javascript_scheme(new):
519524
# FIXME: should this be None to delete?
520525
return ''
521526
return link
@@ -537,10 +542,12 @@ def _has_sneaky_javascript(self, style):
537542
style = style.replace('\\', '')
538543
style = _substitute_whitespace('', style)
539544
style = style.lower()
540-
if 'javascript:' in style:
545+
if _has_javascript_scheme(style):
541546
return True
542547
if 'expression(' in style:
543548
return True
549+
if '@import' in style:
550+
return True
544551
if '</noscript' in style:
545552
# e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
546553
return True

src/lxml/html/tests/test_clean.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import base64
2+
import gzip
13
import unittest
24
from lxml.tests.common_imports import make_doctest
35

@@ -123,6 +125,132 @@ def test_sneaky_js_in_math_style(self):
123125
b'<math><style>/* deleted */</style></math>',
124126
lxml.html.tostring(clean_html(s)))
125127

128+
def test_sneaky_import_in_style(self):
129+
# Prevent "@@importimport" -> "@import" replacement etc.
130+
style_codes = [
131+
"@@importimport(extstyle.css)",
132+
"@ @ import import(extstyle.css)",
133+
"@ @ importimport(extstyle.css)",
134+
"@@ import import(extstyle.css)",
135+
"@ @import import(extstyle.css)",
136+
"@@importimport()",
137+
"@@importimport() ()",
138+
"@/* ... */import()",
139+
"@im/* ... */port()",
140+
"@ @import/* ... */import()",
141+
"@ /* ... */ import()",
142+
]
143+
for style_code in style_codes:
144+
html = '<style>%s</style>' % style_code
145+
s = lxml.html.fragment_fromstring(html)
146+
147+
cleaned = lxml.html.tostring(clean_html(s))
148+
self.assertEqual(
149+
b'<style>/* deleted */</style>',
150+
cleaned,
151+
"%s -> %s" % (style_code, cleaned))
152+
153+
def test_sneaky_schemes_in_style(self):
154+
style_codes = [
155+
"javasjavascript:cript:",
156+
"javascriptjavascript::",
157+
"javascriptjavascript:: :",
158+
"vbjavascript:cript:",
159+
]
160+
for style_code in style_codes:
161+
html = '<style>%s</style>' % style_code
162+
s = lxml.html.fragment_fromstring(html)
163+
164+
cleaned = lxml.html.tostring(clean_html(s))
165+
self.assertEqual(
166+
b'<style>/* deleted */</style>',
167+
cleaned,
168+
"%s -> %s" % (style_code, cleaned))
169+
170+
def test_sneaky_urls_in_style(self):
171+
style_codes = [
172+
"url(data:image/svg+xml;base64,...)",
173+
"url(javasjavascript:cript:)",
174+
"url(javasjavascript:cript: ::)",
175+
"url(vbjavascript:cript:)",
176+
"url(vbjavascript:cript: :)",
177+
]
178+
for style_code in style_codes:
179+
html = '<style>%s</style>' % style_code
180+
s = lxml.html.fragment_fromstring(html)
181+
182+
cleaned = lxml.html.tostring(clean_html(s))
183+
self.assertEqual(
184+
b'<style>url()</style>',
185+
cleaned,
186+
"%s -> %s" % (style_code, cleaned))
187+
188+
def test_svg_data_links(self):
189+
# Remove SVG images with potentially insecure content.
190+
svg = b'<svg onload="alert(123)" />'
191+
svgz = gzip.compress(svg)
192+
svg_b64 = base64.b64encode(svg).decode('ASCII')
193+
svgz_b64 = base64.b64encode(svgz).decode('ASCII')
194+
urls = [
195+
"data:image/svg+xml;base64," + svg_b64,
196+
"data:image/svg+xml-compressed;base64," + svgz_b64,
197+
]
198+
for url in urls:
199+
html = '<img src="%s">' % url
200+
s = lxml.html.fragment_fromstring(html)
201+
202+
cleaned = lxml.html.tostring(clean_html(s))
203+
self.assertEqual(
204+
b'<img src="">',
205+
cleaned,
206+
"%s -> %s" % (url, cleaned))
207+
208+
def test_image_data_links(self):
209+
data = b'123'
210+
data_b64 = base64.b64encode(data).decode('ASCII')
211+
urls = [
212+
"data:image/jpeg;base64," + data_b64,
213+
"data:image/apng;base64," + data_b64,
214+
"data:image/png;base64," + data_b64,
215+
"data:image/gif;base64," + data_b64,
216+
"data:image/webp;base64," + data_b64,
217+
"data:image/bmp;base64," + data_b64,
218+
"data:image/tiff;base64," + data_b64,
219+
"data:image/x-icon;base64," + data_b64,
220+
]
221+
for url in urls:
222+
html = '<img src="%s">' % url
223+
s = lxml.html.fragment_fromstring(html)
224+
225+
cleaned = lxml.html.tostring(clean_html(s))
226+
self.assertEqual(
227+
html.encode("UTF-8"),
228+
cleaned,
229+
"%s -> %s" % (url, cleaned))
230+
231+
def test_image_data_links_in_style(self):
232+
data = b'123'
233+
data_b64 = base64.b64encode(data).decode('ASCII')
234+
urls = [
235+
"data:image/jpeg;base64," + data_b64,
236+
"data:image/apng;base64," + data_b64,
237+
"data:image/png;base64," + data_b64,
238+
"data:image/gif;base64," + data_b64,
239+
"data:image/webp;base64," + data_b64,
240+
"data:image/bmp;base64," + data_b64,
241+
"data:image/tiff;base64," + data_b64,
242+
"data:image/x-icon;base64," + data_b64,
243+
]
244+
for url in urls:
245+
html = '<style> url(%s) </style>' % url
246+
s = lxml.html.fragment_fromstring(html)
247+
248+
cleaned = lxml.html.tostring(clean_html(s))
249+
self.assertEqual(
250+
html.encode("UTF-8"),
251+
cleaned,
252+
"%s -> %s" % (url, cleaned))
253+
126254
def test_formaction_attribute_in_button_input(self):
127255
# The formaction attribute overrides the form's action and should be
128256
# treated as a malicious link attribute

0 commit comments

Comments
 (0)