Skip to content

Commit f73b5f0

Browse files
author
gfxmonk
committed
split out into content and summary methods
1 parent c952f42 commit f73b5f0

File tree

1 file changed

+32
-15
lines changed

1 file changed

+32
-15
lines changed

readability/readability.py

Lines changed: 32 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env python
22
from BeautifulSoup import NavigableString
3-
from page_parser import parse
3+
from page_parser import parse, get_title, get_body
44
import logging
55
import re
66

@@ -33,24 +33,38 @@ def __init__(self, input, **options):
3333
self.options = defaultdict(lambda: None)
3434
for k, v in options.items():
3535
self.options[k] = v
36+
self.html = None
3637

37-
def make_html(self):
38-
self.html = parse(self.input, self.options['url'])
39-
38+
def _html(self, force=False):
39+
if force or self.html is None:
40+
self.html = parse(self.input, self.options['url'])
41+
return self.html
42+
4043
def content(self):
44+
return get_body(self._html())
45+
46+
def title(self):
47+
return get_title(self._html())
48+
49+
def summary(self):
4150
ruthless = True
4251
while True:
43-
self.make_html()
52+
self._html(True)
4453
[i.extract() for i in self.tags(self.html, 'script', 'style')]
4554

4655
if ruthless: self.remove_unlikely_candidates()
4756
self.transform_misused_divs_into_paragraphs()
4857
candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
4958
best_candidate = self.select_best_candidate(candidates)
50-
if ruthless and best_candidate is None:
51-
ruthless = False
52-
continue
53-
article = self.get_article(candidates, best_candidate)
59+
if best_candidate:
60+
article = self.get_article(candidates, best_candidate)
61+
else:
62+
if ruthless:
63+
ruthless = False
64+
# try again
65+
continue
66+
else:
67+
article = self.html.find('body') or self.html
5468

5569
cleaned_article = self.sanitize(article, candidates)
5670
of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
@@ -88,16 +102,19 @@ def get_article(self, candidates, best_candidate):
88102
if append:
89103
output.append(sibling)
90104

105+
if not output: output.append(best_candidate)
91106
return output
92107

93108
def select_best_candidate(self, candidates):
94109
sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
95-
self.debug("Top 5 canidates:")
110+
self.debug("Top 5 candidates:")
96111
for candidate in sorted_candidates[:5]:
97112
elem = candidate['elem']
98113
self.debug("Candidate %s with score %s" % (describe(elem), candidate['content_score']))
99114

100-
best_candidate = sorted_candidates[0] if len(sorted_candidates) > 1 else { 'elem': self.html.find("body"), 'content_score': 0 }
115+
if len(sorted_candidates) == 0:
116+
return None
117+
best_candidate = sorted_candidates[0]
101118
self.debug("Best candidate %s with score %s" % (describe(best_candidate['elem']), best_candidate['content_score']))
102119
return best_candidate
103120

@@ -108,7 +125,7 @@ def get_link_density(self, elem):
108125

109126
def score_paragraphs(self, min_text_length):
110127
candidates = {}
111-
elems = self.html.findAll("p") + self.html.findAll("td")
128+
elems = self.tags(self.html, "p","td")
112129

113130
for elem in elems:
114131
parent_node = elem.parent
@@ -201,7 +218,7 @@ def sanitize(self, node, candidates):
201218
for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
202219
if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: header.extract()
203220

204-
for elem in self.tags(node, "form", "object", "iframe", "embed"):
221+
for elem in self.tags(node, "form", "iframe"):
205222
elem.extract()
206223

207224
# remove empty <p> tags
@@ -265,7 +282,7 @@ def sanitize(self, node, candidates):
265282
if not (self.options['attributes']):
266283
el.attrMap = {}
267284

268-
return str(node)
285+
return unicode(node)
269286

270287
class HashableElement():
271288
def __init__(self, node):
@@ -312,7 +329,7 @@ def main():
312329
else:
313330
file = open(args[0])
314331
try:
315-
print Document(file.read(), debug=options.verbose).content()
332+
print Document(file.read(), debug=options.verbose).summary().encode('ascii','ignore')
316333
finally:
317334
file.close()
318335

0 commit comments

Comments
 (0)