@@ -28,17 +28,17 @@ class Document:
28
28
TEXT_LENGTH_THRESHOLD = 25
29
29
RETRY_LENGTH = 250
30
30
31
def __init__(self, input, notify=None, **options):
    """Store the raw input and the extraction options.

    :param input: the document source; handed unchanged to ``parse`` the
        first time the tree is built (see ``_html``).
    :param notify: optional callable forwarded to ``parse`` as its
        ``notify`` argument.  Falls back to ``logging.info`` when omitted
        or falsy.
    :param options: arbitrary keyword options.  The code visible here
        reads ``url``, ``min_text_length`` and ``retry_length``.
    """
    self.input = input
    # Looking up an option that was never supplied yields None instead of
    # raising KeyError; seeding the defaultdict directly replaces the
    # manual copy loop.
    self.options = defaultdict(lambda: None, options)
    self.notify = notify or logging.info
    # Parsed tree; built lazily by _html().
    self.html = None
37
38
38
39
def _html(self, force=False):
    """Return the parsed document, parsing ``self.input`` when needed.

    :param force: when true, re-parse even if a tree is already cached.
    :returns: the value produced by ``parse``, also cached on
        ``self.html``.
    """
    if force or self.html is None:
        # options['url'] is None when no url option was supplied.
        self.html = parse(self.input, self.options['url'], notify=self.notify)
    return self.html
43
43
44
44
def content (self ):
@@ -48,32 +48,36 @@ def title(self):
48
48
return get_title (self ._html ())
49
49
50
50
def summary(self):
    """Extract the main article from the document and return it cleaned.

    The document is re-parsed and processed up to twice.  The first pass
    is "ruthless": it also calls ``remove_unlikely_candidates``.  If that
    pass finds no best candidate, or the sanitized result is shorter than
    the ``retry_length`` option (default ``RETRY_LENGTH``), a second pass
    runs without that step.  On the second pass, when no candidate is
    found, the ``body`` element (or the whole tree) is used instead.

    :returns: whatever ``self.sanitize`` returns for the chosen article.
    :raises Unparseable: if any step fails; the original error is logged
        with its traceback first.
    """
    try:
        ruthless = True
        while True:
            # Always start from a fresh tree: the steps below mutate it.
            self._html(True)
            for node in self.tags(self.html, 'script', 'style'):
                node.extract()

            if ruthless:
                self.remove_unlikely_candidates()
            self.transform_misused_divs_into_paragraphs()
            candidates = self.score_paragraphs(
                self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
            best_candidate = self.select_best_candidate(candidates)
            if best_candidate:
                article = self.get_article(candidates, best_candidate)
            elif ruthless:
                # Nothing survived the ruthless pass; try again leniently.
                ruthless = False
                continue
            else:
                article = self.html.find('body') or self.html

            cleaned_article = self.sanitize(article, candidates)
            retry_length = self.options['retry_length'] or self.RETRY_LENGTH
            of_acceptable_length = len(cleaned_article or '') >= retry_length
            if ruthless and not of_acceptable_length:
                ruthless = False
                continue  # try again
            return cleaned_article
    except Exception as e:
        # "except ... as" is valid on Python 2.6+ and on Python 3, where
        # StandardError no longer exists.
        logging.exception('error getting summary:')
        raise Unparseable(str(e))
77
81
78
82
def get_article (self , candidates , best_candidate ):
79
83
# Now that we have the top candidate, look through its siblings for content that might also be related.
@@ -322,6 +326,7 @@ def main():
322
326
if not (len (args ) == 1 or options .url ):
323
327
parser .print_help ()
324
328
sys .exit (1 )
329
+ logging .basicConfig (level = logging .DEBUG )
325
330
326
331
file = None
327
332
if options .url :
0 commit comments