Skip to content

Commit 1986e25

Browse files
committed
Fix issue #89, introduce flag option to keep images in summary.
1 parent 40256f4 commit 1986e25

File tree

4 files changed

+58
-5
lines changed

4 files changed

+58
-5
lines changed

readability/readability.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -210,12 +210,13 @@ def get_clean_html(self):
210210
"""
211211
return clean_attributes(tounicode(self.html, method="html"))
212212

213-
def summary(self, html_partial=False):
213+
def summary(self, html_partial=False, keep_all_images=False):
214214
"""
215215
Given a HTML file, extracts the text of the article.
216216
217217
:param html_partial: return only the div of the document, don't wrap
218218
in html and body tags.
219+
:param keep_all_images: Keep all images in summary.
219220
220221
Warning: It mutates internal DOM representation of the HTML document,
221222
so it is better to call other API methods before this one.
@@ -257,7 +258,7 @@ def summary(self, html_partial=False):
257258
article = self.html.find("body")
258259
if article is None:
259260
article = self.html
260-
cleaned_article = self.sanitize(article, candidates)
261+
cleaned_article = self.sanitize(article, candidates, keep_all_images)
261262

262263
article_length = len(cleaned_article or "")
263264
retry_length = self.retry_length
@@ -502,7 +503,7 @@ def reverse_tags(self, node, *tag_names):
502503
for tag_name in tag_names:
503504
yield from reversed(node.findall(".//%s" % tag_name))
504505

505-
def sanitize(self, node, candidates):
506+
def sanitize(self, node, candidates, keep_all_images=False):
506507
MIN_LEN = self.min_text_length
507508
for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
508509
if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
@@ -563,8 +564,8 @@ def sanitize(self, node, candidates):
563564
to_remove = False
564565
reason = ""
565566

566-
# if el.tag == 'div' and counts["img"] >= 1:
567-
# continue
567+
if keep_all_images and el.tag == 'div' and counts["img"] >= 1:
568+
continue
568569
if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3:
569570
reason = "too many images (%s)" % counts["img"]
570571
to_remove = True

requirements-dev.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
lxml
2+
lxml_html_clean
3+
pytest
24
chardet
35
nose
46
pep8
tests/samples/summary-keep-all-images.sample.html

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head></head>
4+
<body>
5+
<h2>
6+
<span>
7+
H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline
8+
</span>
9+
</h2>
10+
<p>
11+
<spa>
12+
Text Text Text Text Text Text Text Text Text Text
13+
</spa>
14+
</p>
15+
<div>
16+
<span>
17+
<a>
18+
<img src=""
19+
/>
20+
</a>
21+
</span>
22+
</div>
23+
<p>
24+
<spa>
25+
Text Text Text Text Text Text Text Text Text Text
26+
</spa>
27+
</p>
28+
</body>
29+
</html>

tests/test_article_only.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,3 +133,24 @@ def test_author_absent(self):
133133
sample = load_sample("si-game.sample.html")
134134
doc = Document(sample)
135135
assert '[no-author]' == doc.author()
136+
137+
def test_keep_images_present(self):
138+
sample = load_sample("summary-keep-all-images.sample.html")
139+
140+
doc = Document(sample)
141+
142+
assert "<img" in doc.summary(keep_all_images=True)
143+
144+
def test_keep_images_absent(self):
145+
sample = load_sample("summary-keep-all-images.sample.html")
146+
147+
doc = Document(sample)
148+
149+
assert "<img" not in doc.summary(keep_all_images=False)
150+
151+
def test_keep_images_absent_by_default(self):
152+
sample = load_sample("summary-keep-all-images.sample.html")
153+
154+
doc = Document(sample)
155+
156+
assert "<img" not in doc.summary()

0 commit comments

Comments
 (0)