java2man
diff --git a/scrapy/contrib/linkextractors/lxmlparser.py b/scrapy/contrib/linkextractors/lxmlparser.py
Lines changed: 1 addition & 1 deletion
diff --git a/scrapy/contrib/linkextractors/sgml.py b/scrapy/contrib/linkextractors/sgml.py
Lines changed: 1 addition & 1 deletion
diff --git a/scrapy/tests/test_contrib_linkextractors.py b/scrapy/tests/test_contrib_linkextractors.py
Lines changed: 7 additions & 0 deletions
diff --git a/scrapy/tests/test_utils_python.py b/scrapy/tests/test_utils_python.py
Lines changed: 6 additions & 0 deletions
diff --git a/scrapy/utils/python.py b/scrapy/utils/python.py
Lines changed: 4 additions & 4 deletions
@@ -33,7 +33,7 @@ def _extract_links(self, response_text, response_url, response_encoding):
         for link in links:
             link.url = urljoin_rfc(base_url, link.url, response_encoding)
             link.url = safe_url_string(link.url, response_encoding)
-            link.text = str_to_unicode(link.text, response_encoding)
+            link.text = str_to_unicode(link.text, response_encoding, errors='replace')
             ret.append(link)
 
         return ret
 
@@ -33,7 +33,7 @@ def _extract_links(self, response_text, response_url, response_encoding, base_ur
         for link in self.links:
             link.url = urljoin_rfc(base_url, link.url, response_encoding)
             link.url = safe_url_string(link.url, response_encoding)
-            link.text = str_to_unicode(link.text, response_encoding)
+            link.text = str_to_unicode(link.text, response_encoding, errors='replace')
             ret.append(link)
 
         return ret
 
@@ -51,6 +51,13 @@ def test_base_url(self):
         self.assertEqual(lx.extract_links(response),
                          [Link(url='https://noschemedomain.com/path/to/item/12.html', text='Item 12')])
 
+    def test_link_text_wrong_encoding(self):
+        html = """<body><p><a href="item/12.html">Wrong: \xed</a></p></body></html>"""
+        response = HtmlResponse("http://www.example.com", body=html, encoding='utf-8')
+        lx = BaseSgmlLinkExtractor()
+        self.assertEqual(lx.extract_links(response),
+             [Link(url='http://www.example.com/item/12.html', text=u'Wrong: \ufffd')])
+
     def test_extraction_encoding(self):
         body = get_testdata('link_extractor', 'linkextractor_noenc.html')
         response_utf8 = HtmlResponse(url='http://example.com/utf8', body=body, headers={'Content-Type': ['text/html; charset=utf-8']})
 
@@ -22,6 +22,9 @@ def test_str_to_unicode(self):
         # converting a strange object should raise TypeError
         self.assertRaises(TypeError, str_to_unicode, 423)
 
+        # check errors argument works
+        self.assertEqual(str_to_unicode('a\xedb', 'utf-8', errors='replace'), u'a\ufffdb')
+
     def test_unicode_to_str(self):
         # converting a unicode object to an utf-8 encoded string
         self.assertEqual(unicode_to_str(u'\xa3 49'), '\xc2\xa3 49')
@@ -35,6 +38,9 @@ def test_unicode_to_str(self):
         # converting a strange object should raise TypeError
         self.assertRaises(TypeError, unicode_to_str, unittest)
 
+        # check errors argument works
+        self.assertEqual(unicode_to_str(u'a\ufffdb', 'latin-1', errors='replace'), 'a?b')
+
     def test_memoizemethod_noargs(self):
         class A(object):
 
 
@@ -63,7 +63,7 @@ def unique(list_, key=lambda x: x):
     return result
 
 
-def str_to_unicode(text, encoding=None):
+def str_to_unicode(text, encoding=None, errors='strict'):
     """Return the unicode representation of text in the given encoding. Unlike
     .encode(encoding) this function can be applied directly to a unicode
     object without the risk of double-decoding problems (which can happen if
@@ -73,13 +73,13 @@ def str_to_unicode(text, encoding=None):
     if encoding is None:
         encoding = 'utf-8'
     if isinstance(text, str):
-        return text.decode(encoding)
+        return text.decode(encoding, errors)
     elif isinstance(text, unicode):
         return text
     else:
         raise TypeError('str_to_unicode must receive a str or unicode object, got %s' % type(text).__name__)
 
-def unicode_to_str(text, encoding=None):
+def unicode_to_str(text, encoding=None, errors='strict'):
     """Return the str representation of text in the given encoding. Unlike
     .encode(encoding) this function can be applied directly to a str
     object without the risk of double-decoding problems (which can happen if
@@ -89,7 +89,7 @@ def unicode_to_str(text, encoding=None):
     if encoding is None:
         encoding = 'utf-8'
     if isinstance(text, unicode):
-        return text.encode(encoding)
+        return text.encode(encoding, errors)
     elif isinstance(text, str):
         return text
     else: