Skip to content

Commit de15fcd

Browse files
Scorpil authored and nyov committed
[LinkExtractors] Ignore bogus links
(rebased the code for scrapy 1.0 and made a few code improvements --nyov)
1 parent 9adb5c3 commit de15fcd

File tree

5 files changed

+80
-9
lines changed

5 files changed

+80
-9
lines changed

scrapy/linkextractors/htmlparser.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,10 @@ def _extract_links(self, response_text, response_url, response_encoding):
4141
for link in links:
4242
if isinstance(link.url, unicode):
4343
link.url = link.url.encode(response_encoding)
44-
link.url = urljoin(base_url, link.url)
44+
try:
45+
link.url = urljoin(base_url, link.url)
46+
except ValueError:
47+
continue
4548
link.url = safe_url_string(link.url, response_encoding)
4649
link.text = link.text.decode(response_encoding)
4750
ret.append(link)

scrapy/linkextractors/lxmlhtml.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,14 @@ def _extract_links(self, selector, response_url, response_encoding, base_url):
4949
# hacky way to get the underlying lxml parsed document
5050
for el, attr, attr_val in self._iter_links(selector.root):
5151
# pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
52-
attr_val = urljoin(base_url, attr_val)
53-
url = self.process_attr(attr_val)
54-
if url is None:
55-
continue
52+
try:
53+
attr_val = urljoin(base_url, attr_val)
54+
except ValueError:
55+
continue # skipping bogus links
56+
else:
57+
url = self.process_attr(attr_val)
58+
if url is None:
59+
continue
5660
if isinstance(url, unicode):
5761
url = url.encode(response_encoding)
5862
# to fix relative links after process_value

scrapy/linkextractors/regex.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,25 @@ def clean_link(link_text):
1414
"""Remove leading and trailing whitespace and punctuation"""
1515
return link_text.strip("\t\r\n '\"")
1616

17+
1718
class RegexLinkExtractor(SgmlLinkExtractor):
1819
"""High performant link extractor"""
1920

2021
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
22+
def clean_text(text):
23+
return replace_escape_chars(remove_tags(text.decode(response_encoding))).strip()
24+
25+
def clean_url(url):
26+
clean_url = ''
27+
try:
28+
clean_url = urljoin(base_url, replace_entities(clean_link(url.decode(response_encoding))))
29+
except ValueError:
30+
pass
31+
return clean_url
32+
2133
if base_url is None:
2234
base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
2335

24-
clean_url = lambda u: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))
25-
clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
26-
2736
links_text = linkre.findall(response_text)
2837
return [Link(clean_url(url).encode(response_encoding),
2938
clean_text(text))

scrapy/linkextractors/sgml.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,10 @@ def _extract_links(self, response_text, response_url, response_encoding, base_ur
4242
for link in self.links:
4343
if isinstance(link.url, unicode):
4444
link.url = link.url.encode(response_encoding)
45-
link.url = urljoin(base_url, link.url)
45+
try:
46+
link.url = urljoin(base_url, link.url)
47+
except ValueError:
48+
continue
4649
link.url = safe_url_string(link.url, response_encoding)
4750
link.text = to_unicode(link.text, response_encoding, errors='replace').strip()
4851
ret.append(link)

tests/test_linkextractors.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -491,10 +491,36 @@ def test_xhtml(self):
491491
Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False)]
492492
)
493493

494+
def test_link_wrong_href(self):
495+
html = """
496+
<a href="http://example.org/item1.html">Item 1</a>
497+
<a href="http://[example.org/item2.html">Item 2</a>
498+
<a href="http://example.org/item3.html">Item 3</a>
499+
"""
500+
response = HtmlResponse("http://example.org/index.html", body=html)
501+
lx = self.extractor_cls()
502+
self.assertEqual([link for link in lx.extract_links(response)], [
503+
Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
504+
Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
505+
])
506+
494507

495508
class LxmlLinkExtractorTestCase(SgmlLinkExtractorTestCase):
496509
extractor_cls = LxmlLinkExtractor
497510

511+
def test_link_wrong_href(self):
512+
html = """
513+
<a href="http://example.org/item1.html">Item 1</a>
514+
<a href="http://[example.org/item2.html">Item 2</a>
515+
<a href="http://example.org/item3.html">Item 3</a>
516+
"""
517+
response = HtmlResponse("http://example.org/index.html", body=html)
518+
lx = self.extractor_cls()
519+
self.assertEqual([link for link in lx.extract_links(response)], [
520+
Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
521+
Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
522+
])
523+
498524

499525
class HtmlParserLinkExtractorTestCase(unittest.TestCase):
500526

@@ -512,6 +538,19 @@ def test_extraction(self):
512538
Link(url='http://www.google.com/something', text=u''),
513539
Link(url='http://example.com/innertag.html', text=u'inner tag'),])
514540

541+
def test_link_wrong_href(self):
542+
html = """
543+
<a href="http://example.org/item1.html">Item 1</a>
544+
<a href="http://[example.org/item2.html">Item 2</a>
545+
<a href="http://example.org/item3.html">Item 3</a>
546+
"""
547+
response = HtmlResponse("http://example.org/index.html", body=html)
548+
lx = HtmlParserLinkExtractor()
549+
self.assertEqual([link for link in lx.extract_links(response)], [
550+
Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
551+
Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
552+
])
553+
515554

516555
class RegexLinkExtractorTestCase(unittest.TestCase):
517556

@@ -528,6 +567,19 @@ def test_extraction(self):
528567
Link(url='http://www.google.com/something', text=u''),
529568
Link(url='http://example.com/innertag.html', text=u'inner tag'),])
530569

570+
def test_link_wrong_href(self):
571+
html = """
572+
<a href="http://example.org/item1.html">Item 1</a>
573+
<a href="http://[example.org/item2.html">Item 2</a>
574+
<a href="http://example.org/item3.html">Item 3</a>
575+
"""
576+
response = HtmlResponse("http://example.org/index.html", body=html)
577+
lx = RegexLinkExtractor()
578+
self.assertEqual([link for link in lx.extract_links(response)], [
579+
Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
580+
Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
581+
])
582+
531583

532584
if __name__ == "__main__":
533585
unittest.main()

0 commit comments

Comments (0)