|
1 | 1 | #!/usr/bin/env python
|
2 | 2 | from BeautifulSoup import NavigableString
|
3 | 3 | from page_parser import parse
|
| 4 | +import logging |
4 | 5 | import re
|
5 | 6 |
|
6 | 7 | REGEXES = { 'unlikelyCandidatesRe': re.compile('combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor',re.I),
|
@@ -32,28 +33,32 @@ def __init__(self, input, **options):
|
32 | 33 | self.options = defaultdict(lambda: None)
|
33 | 34 | for k, v in options.items():
|
34 | 35 | self.options[k] = v
|
35 |
| - self.make_html() |
36 | 36 |
|
def make_html(self):
    """Parse ``self.input`` and store the resulting tree on ``self.html``.

    The ``url`` option is handed to ``parse`` as its second argument.
    """
    raw_input = self.input
    page_url = self.options['url']
    self.html = parse(raw_input, page_url)
|
39 | 39 |
|
40 |
def content(self):
    """Extract and return the sanitized article for the current input.

    Up to two passes are made over a freshly parsed document:

    1. a "ruthless" pass that first calls ``remove_unlikely_candidates``;
    2. a lenient pass that skips that step.

    The ruthless result is discarded (and the lenient pass is run) when no
    best candidate is selected or when the sanitized article is shorter
    than the ``retry_length`` option (falling back to ``RETRY_LENGTH``).
    The lenient pass always returns whatever ``sanitize`` produced.
    """
    for ruthless in (True, False):
        # Re-parse on every pass: the steps below mutate self.html in place.
        self.make_html()
        for unwanted in self.tags(self.html, 'script', 'style'):
            unwanted.extract()

        if ruthless:
            self.remove_unlikely_candidates()
        self.transform_misused_divs_into_paragraphs()

        min_text_length = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
        candidates = self.score_paragraphs(min_text_length)
        best_candidate = self.select_best_candidate(candidates)
        if ruthless and best_candidate is None:
            continue  # nothing selected; retry without the ruthless step

        article = self.get_article(candidates, best_candidate)
        cleaned_article = self.sanitize(article, candidates)

        article_length = len(cleaned_article or '')
        retry_length = self.options['retry_length'] or self.RETRY_LENGTH
        if ruthless and article_length < retry_length:
            continue  # try again

        return cleaned_article
57 | 62 |
|
58 | 63 | def get_article(self, candidates, best_candidate):
|
59 | 64 | # Now that we have the top candidate, look through its siblings for content that might also be related.
|
@@ -87,18 +92,13 @@ def get_article(self, candidates, best_candidate):
|
87 | 92 |
|
def select_best_candidate(self, candidates):
    """Return the highest-scoring candidate, or a ``<body>`` fallback.

    ``candidates`` is a mapping whose values are dicts carrying at least
    the keys ``'elem'`` and ``'content_score'``.  The values are ranked by
    ``'content_score'`` in descending order and the top five are reported
    through ``self.debug``.

    Returns the top-ranked dict.  Only when there are no candidates at all
    is a synthetic ``{'elem': <body>, 'content_score': 0}`` returned; a
    single scored candidate is used as-is rather than being discarded.
    """
    sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)

    self.debug("Top 5 canidates:")
    for candidate in sorted_candidates[:5]:
        self.debug("Candidate %s with score %s" % (describe(candidate['elem']), candidate['content_score']))

    if sorted_candidates:
        best_candidate = sorted_candidates[0]
    else:
        # Last resort: nothing was scored, so treat the whole body as the article.
        best_candidate = {'elem': self.html.find("body"), 'content_score': 0}

    self.debug("Best candidate %s with score %s" % (describe(best_candidate['elem']), best_candidate['content_score']))
    return best_candidate
|
103 | 103 |
|
104 | 104 | def get_link_density(self, elem):
|
@@ -173,9 +173,9 @@ def score_node(self, elem):
|
173 | 173 | content_score -= 5
|
174 | 174 | return { 'content_score': content_score, 'elem': elem }
|
175 | 175 |
|
176 |
def debug(self, *args):
    """Forward ``args`` to ``logging.debug`` when the ``debug`` option is truthy.

    The arguments are passed through unchanged, so the first is the message
    and any others are its %-style format arguments.  Does nothing when the
    option is falsy.
    """
    if not self.options['debug']:
        return
    logging.debug(*args)
179 | 179 |
|
180 | 180 | def remove_unlikely_candidates(self):
|
181 | 181 | for elem in self.html.findAll():
|
|
0 commit comments