diff --git a/CHANGELOG b/CHANGELOG
index 3d12741..52629da 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,7 @@
+Unreleased
+
+* Ignore `data:` URIs.
+
 2.4.0 (2025-09-28)
 
 * Add index to Link (David Venhoff, #202)
diff --git a/linkcheck/__init__.py b/linkcheck/__init__.py
index ba0949e..9967cdb 100644
--- a/linkcheck/__init__.py
+++ b/linkcheck/__init__.py
@@ -32,7 +32,7 @@ def handle_starttag(self, tag, attrs):
                 self.text += f' [image:{src[0]}] '
 
     def handle_endtag(self, tag):
-        if tag == 'a' and self.in_a:
+        if tag == 'a' and self.in_a and not self.url.startswith("data:"):
             self.urls.append((self.text[:256], self.url))
             self.in_a = False
             self.text = ''
@@ -48,7 +48,7 @@ class ImageLister(Lister):
     def handle_starttag(self, tag, attrs):
         if tag == 'img':
             src = [v for k, v in attrs if k == 'src']
-            if src:
+            if src and not src[0].startswith("data:"):
                 self.urls.append(('', src[0]))
 
 
diff --git a/linkcheck/tests/test_linkcheck.py b/linkcheck/tests/test_linkcheck.py
index d0b005d..3dd3f57 100644
--- a/linkcheck/tests/test_linkcheck.py
+++ b/linkcheck/tests/test_linkcheck.py
@@ -943,6 +943,19 @@ def test_urls_exceeding_max_length(self):
         )
         self.assertEqual(Url.objects.all().count(), 1)
 
+    def test_data_urls_ignored(self):
+        self.assertEqual(Url.objects.all().count(), 0)
+        Book.objects.create(
+            title="My Title",
+            description=(
+                'This is a normal link: Example, '
+                'This is a data link: Example 2, '
+                'This is a data img: '
+            )
+        )
+        # Only the normal link is extracted
+        self.assertEqual(Url.objects.all().count(), 1)
+
     def test_empty_url_field(self):
         """
         Test that URLField empty content is excluded depending on ignore_empty list.