Skip to content

Commit 0eacd95

Browse files
author
gfxmonk
committed
failsafe parsing and more logging
1 parent 87ad057 commit 0eacd95

File tree

1 file changed

+33
-28
lines changed

1 file changed

+33
-28
lines changed

readability/readability.py

Lines changed: 33 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -28,17 +28,17 @@ class Document:
2828
TEXT_LENGTH_THRESHOLD = 25
2929
RETRY_LENGTH = 250
3030

31-
def __init__(self, input, **options):
32-
self.input = inpuunicodear
31+
def __init__(self, input, notify=None, **options):
32+
self.input = input
3333
self.options = defaultdict(lambda: None)
3434
for k, v in options.items():
3535
self.options[k] = v
36+
self.notify = notify or logging.info
3637
self.html = None
3738

3839
def _html(self, force=False):
3940
if force or self.html is None:
40-
notify = self.options['notify'] or (lambda x: None)
41-
self.html = parse(self.input, self.options['url'], notify=notify)
41+
self.html = parse(self.input, self.options['url'], notify=self.notify)
4242
return self.html
4343

4444
def content(self):
@@ -48,32 +48,36 @@ def title(self):
4848
return get_title(self._html())
4949

5050
def summary(self):
51-
ruthless = True
52-
while True:
53-
self._html(True)
54-
[i.extract() for i in self.tags(self.html, 'script', 'style')]
55-
56-
if ruthless: self.remove_unlikely_candidates()
57-
self.transform_misused_divs_into_paragraphs()
58-
candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
59-
best_candidate = self.select_best_candidate(candidates)
60-
if best_candidate:
61-
article = self.get_article(candidates, best_candidate)
62-
else:
63-
if ruthless:
51+
try:
52+
ruthless = True
53+
while True:
54+
self._html(True)
55+
[i.extract() for i in self.tags(self.html, 'script', 'style')]
56+
57+
if ruthless: self.remove_unlikely_candidates()
58+
self.transform_misused_divs_into_paragraphs()
59+
candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
60+
best_candidate = self.select_best_candidate(candidates)
61+
if best_candidate:
62+
article = self.get_article(candidates, best_candidate)
63+
else:
64+
if ruthless:
65+
ruthless = False
66+
# try again
67+
continue
68+
else:
69+
article = self.html.find('body') or self.html
70+
71+
cleaned_article = self.sanitize(article, candidates)
72+
of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
73+
if ruthless and not of_acceptable_length:
6474
ruthless = False
65-
# try again
66-
continue
75+
continue # try again
6776
else:
68-
article = self.html.find('body') or self.html
69-
70-
cleaned_article = self.sanitize(article, candidates)
71-
of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
72-
if ruthless and not of_acceptable_length:
73-
ruthless = False
74-
continue # try again
75-
else:
76-
return cleaned_article
77+
return cleaned_article
78+
except StandardError, e:
79+
logging.exception('error getting summary:')
80+
raise Unparseable(str(e))
7781

7882
def get_article(self, candidates, best_candidate):
7983
# Now that we have the top candidate, look through its siblings for content that might also be related.
@@ -322,6 +326,7 @@ def main():
322326
if not (len(args) == 1 or options.url):
323327
parser.print_help()
324328
sys.exit(1)
329+
logging.basicConfig(level=logging.DEBUG)
325330

326331
file = None
327332
if options.url:

0 commit comments

Comments
 (0)