Skip to content

Commit 77b14a6

Browse files
authored
gh-69426: HTMLParser: only unescape properly terminated character entities in attribute values (GH-95215)
According to the HTML5 spec, a named character reference in an attribute value that is not terminated by a semicolon must be left unprocessed if it is followed by an ASCII alphanumeric or an equals sign; semicolon-terminated named references and numeric references are always processed. https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
1 parent 3dfed23 commit 77b14a6

File tree

3 files changed

+57
-9
lines changed

3 files changed

+57
-9
lines changed

Lib/html/parser.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import _markupbase
1313

1414
from html import unescape
15+
from html.entities import html5 as html5_entities
1516

1617

1718
__all__ = ['HTMLParser']
@@ -23,6 +24,7 @@
2324

2425
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
2526
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
27+
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
2628

2729
starttagopen = re.compile('<[a-zA-Z]')
2830
piclose = re.compile('>')
@@ -57,6 +59,22 @@
5759
# </ and the tag name, so maybe this should be fixed
5860
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
5961

62+
# Character reference processing logic specific to attribute values
63+
# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
64+
def _replace_attr_charref(match):
65+
ref = match.group(0)
66+
# Numeric / hex char refs must always be unescaped
67+
if ref.startswith('&#'):
68+
return unescape(ref)
69+
# Named character / entity references must only be unescaped
70+
# if they are an exact match, and they are not followed by an equals sign
71+
if not ref.endswith('=') and ref[1:] in html5_entities:
72+
return unescape(ref)
73+
# Otherwise do not unescape
74+
return ref
75+
76+
def _unescape_attrvalue(s):
77+
return attr_charref.sub(_replace_attr_charref, s)
6078

6179

6280
class HTMLParser(_markupbase.ParserBase):
@@ -323,7 +341,7 @@ def parse_starttag(self, i):
323341
attrvalue[:1] == '"' == attrvalue[-1:]:
324342
attrvalue = attrvalue[1:-1]
325343
if attrvalue:
326-
attrvalue = unescape(attrvalue)
344+
attrvalue = _unescape_attrvalue(attrvalue)
327345
attrs.append((attrname.lower(), attrvalue))
328346
k = m.end()
329347

Lib/test/test_htmlparser.py

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -348,18 +348,16 @@ def test_convert_charrefs(self):
348348
collector = lambda: EventCollectorCharrefs()
349349
self.assertTrue(collector().convert_charrefs)
350350
charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
351-
# check charrefs in the middle of the text/attributes
352-
expected = [('starttag', 'a', [('href', 'foo"zar')]),
353-
('data', 'a"z'), ('endtag', 'a')]
351+
# check charrefs in the middle of the text
352+
expected = [('starttag', 'a', []), ('data', 'a"z'), ('endtag', 'a')]
354353
for charref in charrefs:
355-
self._run_check('<a href="https://github.com/foo%7B0%7Dzar">a{0}z</a>'.format(charref),
354+
self._run_check('<a>a{0}z</a>'.format(charref),
356355
expected, collector=collector())
357-
# check charrefs at the beginning/end of the text/attributes
358-
expected = [('data', '"'),
359-
('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
356+
# check charrefs at the beginning/end of the text
357+
expected = [('data', '"'), ('starttag', 'a', []),
360358
('data', '"'), ('endtag', 'a'), ('data', '"')]
361359
for charref in charrefs:
362-
self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
360+
self._run_check('{0}<a>'
363361
'{0}</a>{0}'.format(charref),
364362
expected, collector=collector())
365363
# check charrefs in <script>/<style> elements
@@ -382,6 +380,35 @@ def test_convert_charrefs(self):
382380
self._run_check('no charrefs here', [('data', 'no charrefs here')],
383381
collector=collector())
384382

383+
def test_convert_charrefs_in_attribute_values(self):
384+
# default value for convert_charrefs is now True
385+
collector = lambda: EventCollectorCharrefs()
386+
self.assertTrue(collector().convert_charrefs)
387+
388+
# always unescape terminated entity refs, numeric and hex char refs:
389+
# - regardless whether they are at start, middle, end of attribute
390+
# - or followed by alphanumeric, non-alphanumeric, or equals char
391+
charrefs = ['&cent;', '&#xa2;', '&#xa2', '&#162;', '&#162']
392+
expected = [('starttag', 'a',
393+
[('x', '¢'), ('x', 'z¢'), ('x', '¢z'),
394+
('x', 'z¢z'), ('x', '¢ z'), ('x', '¢=z')]),
395+
('endtag', 'a')]
396+
for charref in charrefs:
397+
self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
398+
' x="z{0}z" x="{0} z" x="{0}=z"></a>'
399+
.format(charref), expected, collector=collector())
400+
401+
# only unescape unterminated entity matches if they are not followed by
402+
# an alphanumeric or an equals sign
403+
charref = '&cent'
404+
expected = [('starttag', 'a',
405+
[('x', '¢'), ('x', 'z¢'), ('x', '&centz'),
406+
('x', 'z&centz'), ('x', '¢ z'), ('x', '&cent=z')]),
407+
('endtag', 'a')]
408+
self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
409+
' x="z{0}z" x="{0} z" x="{0}=z"></a>'
410+
.format(charref), expected, collector=collector())
411+
385412
# the remaining tests were for the "tolerant" parser (which is now
386413
# the default), and check various kind of broken markup
387414
def test_tolerant_parsing(self):
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix :class:`html.parser.HTMLParser` to not unescape named character references
2+
in attribute values if they lack a terminating semicolon and are followed by
3+
an ASCII alphanumeric or an equals sign.

0 commit comments

Comments
 (0)