Skip to content

Commit ce8137b

Browse files
committed
Replace unknown characters in sgml link extractor, to deal more gracefully with encoding errors in the page. Closes scrapy#309
1 parent 181d1c0 commit ce8137b

File tree

5 files changed

+19
-6
lines changed

5 files changed

+19
-6
lines changed

scrapy/contrib/linkextractors/lxmlparser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def _extract_links(self, response_text, response_url, response_encoding):
3333
for link in links:
3434
link.url = urljoin_rfc(base_url, link.url, response_encoding)
3535
link.url = safe_url_string(link.url, response_encoding)
36-
link.text = str_to_unicode(link.text, response_encoding)
36+
link.text = str_to_unicode(link.text, response_encoding, errors='replace')
3737
ret.append(link)
3838

3939
return ret

scrapy/contrib/linkextractors/sgml.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def _extract_links(self, response_text, response_url, response_encoding, base_ur
3333
for link in self.links:
3434
link.url = urljoin_rfc(base_url, link.url, response_encoding)
3535
link.url = safe_url_string(link.url, response_encoding)
36-
link.text = str_to_unicode(link.text, response_encoding)
36+
link.text = str_to_unicode(link.text, response_encoding, errors='replace')
3737
ret.append(link)
3838

3939
return ret

scrapy/tests/test_contrib_linkextractors.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,13 @@ def test_base_url(self):
5151
self.assertEqual(lx.extract_links(response),
5252
[Link(url='https://noschemedomain.com/path/to/item/12.html', text='Item 12')])
5353

54+
def test_link_text_wrong_encoding(self):
55+
html = """<body><p><a href="item/12.html">Wrong: \xed</a></p></body></html>"""
56+
response = HtmlResponse("http://www.example.com", body=html, encoding='utf-8')
57+
lx = BaseSgmlLinkExtractor()
58+
self.assertEqual(lx.extract_links(response),
59+
[Link(url='http://www.example.com/item/12.html', text=u'Wrong: \ufffd')])
60+
5461
def test_extraction_encoding(self):
5562
body = get_testdata('link_extractor', 'linkextractor_noenc.html')
5663
response_utf8 = HtmlResponse(url='http://example.com/utf8', body=body, headers={'Content-Type': ['text/html; charset=utf-8']})

scrapy/tests/test_utils_python.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ def test_str_to_unicode(self):
2222
# converting a strange object should raise TypeError
2323
self.assertRaises(TypeError, str_to_unicode, 423)
2424

25+
# check errors argument works
26+
self.assertEqual(str_to_unicode('a\xedb', 'utf-8', errors='replace'), u'a\ufffdb')
27+
2528
def test_unicode_to_str(self):
2629
# converting a unicode object to an utf-8 encoded string
2730
self.assertEqual(unicode_to_str(u'\xa3 49'), '\xc2\xa3 49')
@@ -35,6 +38,9 @@ def test_unicode_to_str(self):
3538
# converting a strange object should raise TypeError
3639
self.assertRaises(TypeError, unicode_to_str, unittest)
3740

41+
# check errors argument works
42+
self.assertEqual(unicode_to_str(u'a\ufffdb', 'latin-1', errors='replace'), 'a?b')
43+
3844
def test_memoizemethod_noargs(self):
3945
class A(object):
4046

scrapy/utils/python.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def unique(list_, key=lambda x: x):
6363
return result
6464

6565

66-
def str_to_unicode(text, encoding=None):
66+
def str_to_unicode(text, encoding=None, errors='strict'):
6767
"""Return the unicode representation of text in the given encoding. Unlike
6868
.decode(encoding) this function can be applied directly to a unicode
6969
object without the risk of double-decoding problems (which can happen if
@@ -73,13 +73,13 @@ def str_to_unicode(text, encoding=None):
7373
if encoding is None:
7474
encoding = 'utf-8'
7575
if isinstance(text, str):
76-
return text.decode(encoding)
76+
return text.decode(encoding, errors)
7777
elif isinstance(text, unicode):
7878
return text
7979
else:
8080
raise TypeError('str_to_unicode must receive a str or unicode object, got %s' % type(text).__name__)
8181

82-
def unicode_to_str(text, encoding=None):
82+
def unicode_to_str(text, encoding=None, errors='strict'):
8383
"""Return the str representation of text in the given encoding. Unlike
8484
.encode(encoding) this function can be applied directly to a str
8585
object without the risk of double-decoding problems (which can happen if
@@ -89,7 +89,7 @@ def unicode_to_str(text, encoding=None):
8989
if encoding is None:
9090
encoding = 'utf-8'
9191
if isinstance(text, unicode):
92-
return text.encode(encoding)
92+
return text.encode(encoding, errors)
9393
elif isinstance(text, str):
9494
return text
9595
else:

0 commit comments

Comments
 (0)