1
1
#!/usr/bin/env python
2
2
from BeautifulSoup import NavigableString
3
- from page_parser import parse
3
+ from page_parser import parse , get_title , get_body
4
4
import logging
5
5
import re
6
6
@@ -33,24 +33,38 @@ def __init__(self, input, **options):
33
33
self .options = defaultdict (lambda : None )
34
34
for k , v in options .items ():
35
35
self .options [k ] = v
36
+ self .html = None
36
37
37
- def make_html (self ):
38
- self .html = parse (self .input , self .options ['url' ])
39
-
38
+ def _html (self , force = False ):
39
+ if force or self .html is None :
40
+ self .html = parse (self .input , self .options ['url' ])
41
+ return self .html
42
+
40
43
def content (self ):
44
+ return get_body (self ._html ())
45
+
46
+ def title (self ):
47
+ return get_title (self ._html ())
48
+
49
+ def summary (self ):
41
50
ruthless = True
42
51
while True :
43
- self .make_html ( )
52
+ self ._html ( True )
44
53
[i .extract () for i in self .tags (self .html , 'script' , 'style' )]
45
54
46
55
if ruthless : self .remove_unlikely_candidates ()
47
56
self .transform_misused_divs_into_paragraphs ()
48
57
candidates = self .score_paragraphs (self .options .get ('min_text_length' , self .TEXT_LENGTH_THRESHOLD ))
49
58
best_candidate = self .select_best_candidate (candidates )
50
- if ruthless and best_candidate is None :
51
- ruthless = False
52
- continue
53
- article = self .get_article (candidates , best_candidate )
59
+ if best_candidate :
60
+ article = self .get_article (candidates , best_candidate )
61
+ else :
62
+ if ruthless :
63
+ ruthless = False
64
+ # try again
65
+ continue
66
+ else :
67
+ article = self .html .find ('body' ) or self .html
54
68
55
69
cleaned_article = self .sanitize (article , candidates )
56
70
of_acceptable_length = len (cleaned_article or '' ) >= (self .options ['retry_length' ] or self .RETRY_LENGTH )
@@ -88,16 +102,19 @@ def get_article(self, candidates, best_candidate):
88
102
if append :
89
103
output .append (sibling )
90
104
105
+ if not output : output .append (best_candidate )
91
106
return output
92
107
93
108
def select_best_candidate (self , candidates ):
94
109
sorted_candidates = sorted (candidates .values (), key = lambda x : x ['content_score' ], reverse = True )
95
- self .debug ("Top 5 canidates :" )
110
+ self .debug ("Top 5 candidates :" )
96
111
for candidate in sorted_candidates [:5 ]:
97
112
elem = candidate ['elem' ]
98
113
self .debug ("Candidate %s with score %s" % (describe (elem ), candidate ['content_score' ]))
99
114
100
- best_candidate = sorted_candidates [0 ] if len (sorted_candidates ) > 1 else { 'elem' : self .html .find ("body" ), 'content_score' : 0 }
115
+ if len (sorted_candidates ) == 0 :
116
+ return None
117
+ best_candidate = sorted_candidates [0 ]
101
118
self .debug ("Best candidate %s with score %s" % (describe (best_candidate ['elem' ]), best_candidate ['content_score' ]))
102
119
return best_candidate
103
120
@@ -108,7 +125,7 @@ def get_link_density(self, elem):
108
125
109
126
def score_paragraphs (self , min_text_length ):
110
127
candidates = {}
111
- elems = self .html . findAll ( "p" ) + self .html . findAll ( "td" )
128
+ elems = self .tags ( self .html , "p" , "td" )
112
129
113
130
for elem in elems :
114
131
parent_node = elem .parent
@@ -201,7 +218,7 @@ def sanitize(self, node, candidates):
201
218
for header in self .tags (node , "h1" , "h2" , "h3" , "h4" , "h5" , "h6" ):
202
219
if self .class_weight (header ) < 0 or self .get_link_density (header ) > 0.33 : header .extract ()
203
220
204
- for elem in self .tags (node , "form" , "object" , " iframe" , "embed " ):
221
+ for elem in self .tags (node , "form" , "iframe" ):
205
222
elem .extract ()
206
223
207
224
# remove empty <p> tags
@@ -265,7 +282,7 @@ def sanitize(self, node, candidates):
265
282
if not (self .options ['attributes' ]):
266
283
el .attrMap = {}
267
284
268
- return str (node )
285
+ return unicode (node )
269
286
270
287
class HashableElement ():
271
288
def __init__ (self , node ):
@@ -312,7 +329,7 @@ def main():
312
329
else :
313
330
file = open (args [0 ])
314
331
try :
315
- print Document (file .read (), debug = options .verbose ).content ( )
332
+ print Document (file .read (), debug = options .verbose ).summary (). encode ( 'ascii' , 'ignore' )
316
333
finally :
317
334
file .close ()
318
335
0 commit comments