Skip to content

Commit e71b8f2

Browse files
committed
Merge pull request scrapy#676 from csalazar/fix_xxe_flaw
Fixed XXE flaw in sitemap reader
2 parents f3d1508 + 554102f commit e71b8f2

File tree

4 files changed

+34
-3
lines changed

4 files changed

+34
-3
lines changed

scrapy/selector/unified.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,17 @@
1515

1616
__all__ = ['Selector', 'SelectorList']
1717

18+
19+
class SafeXMLParser(etree.XMLParser):
20+
def __init__(self, *args, **kwargs):
21+
kwargs.setdefault('resolve_entities', False)
22+
super(SafeXMLParser, self).__init__(*args, **kwargs)
23+
1824
_ctgroup = {
1925
'html': {'_parser': etree.HTMLParser,
2026
'_csstranslator': ScrapyHTMLTranslator(),
2127
'_tostring_method': 'html'},
22-
'xml': {'_parser': etree.XMLParser,
28+
'xml': {'_parser': SafeXMLParser,
2329
'_csstranslator': ScrapyGenericTranslator(),
2430
'_tostring_method': 'xml'},
2531
}

scrapy/tests/test_selector.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,16 @@ class SmartStringsSelector(Selector):
332332
div_class = x.xpath('//div/@class')
333333
self.assertTrue(all(map(lambda e: hasattr(e._root, 'getparent'), div_class)))
334334

335+
def test_xml_entity_expansion(self):
336+
malicious_xml = '<?xml version="1.0" encoding="ISO-8859-1"?>'\
337+
'<!DOCTYPE foo [ <!ELEMENT foo ANY > <!ENTITY xxe SYSTEM '\
338+
'"file:///etc/passwd" >]><foo>&xxe;</foo>'
339+
340+
response = XmlResponse('http://example.com', body=malicious_xml)
341+
sel = self.sscls(response=response)
342+
343+
self.assertEqual(sel.extract(), '<foo>&xxe;</foo>')
344+
335345

336346
class DeprecatedXpathSelectorTest(unittest.TestCase):
337347

scrapy/tests/test_utils_sitemap.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,13 +188,28 @@ def test_alternate(self):
188188
<xhtml:link rel="alternate" hreflang="en"/><!-- wrong tag without href -->
189189
</url>
190190
</urlset>""")
191-
191+
192192
self.assertEqual(list(s), [
193193
{'loc': 'http://www.example.com/english/',
194194
'alternate': ['http://www.example.com/deutsch/', 'http://www.example.com/schweiz-deutsch/', 'http://www.example.com/english/']
195195
}
196196
])
197197

198+
def test_xml_entity_expansion(self):
199+
s = Sitemap("""<?xml version="1.0" encoding="utf-8"?>
200+
<!DOCTYPE foo [
201+
<!ELEMENT foo ANY >
202+
<!ENTITY xxe SYSTEM "file:///etc/passwd" >
203+
]>
204+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
205+
<url>
206+
<loc>http://127.0.0.1:8000/&xxe;</loc>
207+
</url>
208+
</urlset>
209+
""")
210+
211+
self.assertEqual(list(s), [{'loc': 'http://127.0.0.1:8000/'}])
212+
198213

199214
if __name__ == '__main__':
200215
unittest.main()

scrapy/utils/sitemap.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class Sitemap(object):
1212
(type=sitemapindex) files"""
1313

1414
def __init__(self, xmltext):
15-
xmlp = lxml.etree.XMLParser(recover=True, remove_comments=True)
15+
xmlp = lxml.etree.XMLParser(recover=True, remove_comments=True, resolve_entities=False)
1616
self._root = lxml.etree.fromstring(xmltext, parser=xmlp)
1717
rt = self._root.tag
1818
self.type = self._root.tag.split('}', 1)[1] if '}' in rt else rt

0 commit comments

Comments
 (0)