Skip to content

Commit c952f42

Browse files
author
gfxmonk
committed
clean up content method and debug
1 parent c0ca60e commit c952f42

File tree

1 file changed

+26
-26
lines changed

1 file changed

+26
-26
lines changed

readability/readability.py

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env python
22
from BeautifulSoup import NavigableString
33
from page_parser import parse
4+
import logging
45
import re
56

67
REGEXES = { 'unlikelyCandidatesRe': re.compile('combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor',re.I),
@@ -32,28 +33,32 @@ def __init__(self, input, **options):
3233
self.options = defaultdict(lambda: None)
3334
for k, v in options.items():
3435
self.options[k] = v
35-
self.make_html()
3636

3737
def make_html(self):
3838
self.html = parse(self.input, self.options['url'])
3939

40-
def content(self, remove_unlikely_candidates = True):
41-
def remove(tag): [i.extract() for i in self.html.findAll(tag)]
42-
remove('script')
43-
remove('style')
44-
45-
if remove_unlikely_candidates: self.remove_unlikely_candidates()
46-
self.transform_misused_divs_into_paragraphs()
47-
candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
48-
best_candidate = self.select_best_candidate(candidates)
49-
article = self.get_article(candidates, best_candidate)
50-
51-
cleaned_article = self.sanitize(article, candidates)
52-
if remove_unlikely_candidates and len(cleaned_article or '') < (self.options['retry_length'] or self.RETRY_LENGTH):
40+
def content(self):
41+
ruthless = True
42+
while True:
5343
self.make_html()
54-
return self.content(False)
55-
else:
56-
return cleaned_article
44+
[i.extract() for i in self.tags(self.html, 'script', 'style')]
45+
46+
if ruthless: self.remove_unlikely_candidates()
47+
self.transform_misused_divs_into_paragraphs()
48+
candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
49+
best_candidate = self.select_best_candidate(candidates)
50+
if ruthless and best_candidate is None:
51+
ruthless = False
52+
continue
53+
article = self.get_article(candidates, best_candidate)
54+
55+
cleaned_article = self.sanitize(article, candidates)
56+
of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
57+
if ruthless and not of_acceptable_length:
58+
ruthless = False
59+
continue # try again
60+
else:
61+
return cleaned_article
5762

5863
def get_article(self, candidates, best_candidate):
5964
# Now that we have the top candidate, look through its siblings for content that might also be related.
@@ -87,18 +92,13 @@ def get_article(self, candidates, best_candidate):
8792

8893
def select_best_candidate(self, candidates):
8994
sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
90-
9195
self.debug("Top 5 canidates:")
9296
for candidate in sorted_candidates[:5]:
9397
elem = candidate['elem']
94-
self.debug("Candidate %s with score %s" % (
95-
describe(elem), candidate['content_score']))
98+
self.debug("Candidate %s with score %s" % (describe(elem), candidate['content_score']))
9699

97100
best_candidate = sorted_candidates[0] if len(sorted_candidates) > 1 else { 'elem': self.html.find("body"), 'content_score': 0 }
98-
elem = best_candidate['elem']
99-
self.debug("Best candidate %s#%s.%s with score %s" % (
100-
elem.name, elem.get('id',''), elem.get('class',''), best_candidate['content_score']))
101-
101+
self.debug("Best candidate %s with score %s" % (describe(best_candidate['elem']), best_candidate['content_score']))
102102
return best_candidate
103103

104104
def get_link_density(self, elem):
@@ -173,9 +173,9 @@ def score_node(self, elem):
173173
content_score -= 5
174174
return { 'content_score': content_score, 'elem': elem }
175175

176-
def debug(self, str):
176+
def debug(self, *a):
177177
if self.options['debug']:
178-
print(str)
178+
logging.debug(*a)
179179

180180
def remove_unlikely_candidates(self):
181181
for elem in self.html.findAll():

0 commit comments

Comments
 (0)