@@ -491,10 +491,36 @@ def test_xhtml(self):
491
491
Link (url = 'http://example.com/nofollow2.html' , text = u'Choose to follow or not' , fragment = '' , nofollow = False )]
492
492
)
493
493
494
def test_link_wrong_href(self):
    # The second anchor carries a deliberately malformed href (a stray "["
    # right after the scheme). The extractor is expected to drop that one
    # link and still return its neighbours, in document order.
    html = """
    <a href="http://example.org/item1.html">Item 1</a>
    <a href="http://[example.org/item2.html">Item 2</a>
    <a href="http://example.org/item3.html">Item 3</a>
    """
    response = HtmlResponse("http://example.org/index.html", body=html)
    extractor = self.extractor_cls()
    extracted = list(extractor.extract_links(response))
    expected = [
        Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
        Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
    ]
    self.assertEqual(extracted, expected)
506
+
494
507
495
508
class LxmlLinkExtractorTestCase(SgmlLinkExtractorTestCase):
    # Tests that build their extractor through self.extractor_cls will use
    # LxmlLinkExtractor when run from this subclass.
    extractor_cls = LxmlLinkExtractor

    def test_link_wrong_href(self):
        # NOTE(review): this looks identical to the test of the same name
        # defined just above this class; if that one belongs to
        # SgmlLinkExtractorTestCase it is already inherited and this
        # override is redundant -- confirm before removing.
        #
        # The second anchor carries a deliberately malformed href (a stray
        # "[" right after the scheme). Only the first and third links are
        # expected back, in document order.
        html = """
        <a href="http://example.org/item1.html">Item 1</a>
        <a href="http://[example.org/item2.html">Item 2</a>
        <a href="http://example.org/item3.html">Item 3</a>
        """
        response = HtmlResponse("http://example.org/index.html", body=html)
        extractor = self.extractor_cls()
        extracted = list(extractor.extract_links(response))
        expected = [
            Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
            Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
        ]
        self.assertEqual(extracted, expected)
523
+
498
524
499
525
class HtmlParserLinkExtractorTestCase (unittest .TestCase ):
500
526
@@ -512,6 +538,19 @@ def test_extraction(self):
512
538
Link (url = 'http://www.google.com/something' , text = u'' ),
513
539
Link (url = 'http://example.com/innertag.html' , text = u'inner tag' ),])
514
540
541
def test_link_wrong_href(self):
    # The second anchor carries a deliberately malformed href (a stray "["
    # right after the scheme). HtmlParserLinkExtractor is expected to drop
    # that one link and still return its neighbours, in document order.
    html = """
    <a href="http://example.org/item1.html">Item 1</a>
    <a href="http://[example.org/item2.html">Item 2</a>
    <a href="http://example.org/item3.html">Item 3</a>
    """
    response = HtmlResponse("http://example.org/index.html", body=html)
    extractor = HtmlParserLinkExtractor()
    extracted = list(extractor.extract_links(response))
    expected = [
        Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
        Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
    ]
    self.assertEqual(extracted, expected)
553
+
515
554
516
555
class RegexLinkExtractorTestCase (unittest .TestCase ):
517
556
@@ -528,6 +567,19 @@ def test_extraction(self):
528
567
Link (url = 'http://www.google.com/something' , text = u'' ),
529
568
Link (url = 'http://example.com/innertag.html' , text = u'inner tag' ),])
530
569
570
def test_link_wrong_href(self):
    # The second anchor carries a deliberately malformed href (a stray "["
    # right after the scheme). RegexLinkExtractor is expected to drop that
    # one link and still return its neighbours, in document order.
    html = """
    <a href="http://example.org/item1.html">Item 1</a>
    <a href="http://[example.org/item2.html">Item 2</a>
    <a href="http://example.org/item3.html">Item 3</a>
    """
    response = HtmlResponse("http://example.org/index.html", body=html)
    extractor = RegexLinkExtractor()
    extracted = list(extractor.extract_links(response))
    expected = [
        Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
        Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
    ]
    self.assertEqual(extracted, expected)
582
+
531
583
532
584
if __name__ == "__main__":
    # Executed directly as a script: hand control to unittest's runner.
    unittest.main()
0 commit comments