
Commit ad3d52a (root commit, 0 parents)

Author: gfxmonk
Commit message: initial

File tree: 4 files changed, +332 −0 lines

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
*.pyc

README

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0

This is a Python port of a Ruby port of arc90's readability project.

http://lab.arc90.com/experiments/readability/

Given an HTML document, it pulls out the main body text and cleans it up.

Ruby port by starrhorne and iterationlabs
Python port by gfxmonk
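
A minimal usage sketch of the API introduced in this commit (the file name
page.html is hypothetical; Document comes from readability/readability.py
below):

    from readability import Document

    html = open('page.html').read()  # hypothetical input document
    print Document(html).content()   # extracted, cleaned-up body text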

readability/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
from readability import Document, main

readability/readability.py

Lines changed: 320 additions & 0 deletions
@@ -0,0 +1,320 @@
#!/usr/bin/env python
from BeautifulSoup import BeautifulSoup, NavigableString
import re

REGEXES = {
    'unlikelyCandidatesRe': re.compile('combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor', re.I),
    'okMaybeItsACandidateRe': re.compile('and|article|body|column|main', re.I),
    'positiveRe': re.compile('article|body|content|entry|hentry|page|pagination|post|text', re.I),
    'negativeRe': re.compile('combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget', re.I),
    'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
    'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}', re.I),
    'replaceFontsRe': re.compile('<(\/?)font[^>]*>', re.I),
    'trimRe': re.compile('^\s+|\s+$'),
    'normalizeRe': re.compile('\s{2,}'),
    'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}'),
    'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
}

from collections import defaultdict

def describe(node):
    if not hasattr(node, 'name'):
        return "[text]"
    return "%s#%s.%s" % (
        node.name, node.get('id', ''), node.get('class', ''))

class Document:
    TEXT_LENGTH_THRESHOLD = 25
    RETRY_LENGTH = 250

    def __init__(self, input, **options):
        self.input = input
        self.options = defaultdict(lambda: None)
        for k, v in options.items():
            self.options[k] = v
        self.make_html()

    def make_html(self):
        self.html = BeautifulSoup(self.input)

    def content(self, remove_unlikely_candidates=True):
        def remove(tag):
            for i in self.html.findAll(tag):
                i.extract()
        remove('script')
        remove('style')

        if remove_unlikely_candidates: self.remove_unlikely_candidates()
        self.transform_misused_divs_into_paragraphs()
        candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
        best_candidate = self.select_best_candidate(candidates)
        article = self.get_article(candidates, best_candidate)

        cleaned_article = self.sanitize(article, candidates)
        if remove_unlikely_candidates and len(cleaned_article or '') < (self.options['retry_length'] or self.RETRY_LENGTH):
            # the result is suspiciously short; re-parse and retry without
            # removing unlikely candidates
            self.make_html()
            return self.content(False)
        else:
            return cleaned_article

    def get_article(self, candidates, best_candidate):
        # Now that we have the top candidate, look through its siblings for content that might also be related.
        # Things like preambles, content split by ads that we removed, etc.

        sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
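        # e.g. (hypothetical numbers) a best candidate scoring 40 gives
        # max(10, 40 * 0.2) = 10, so any sibling scoring at least 10 is kept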
        output = BeautifulSoup("<div/>")
        # iterate over a copy: appending a sibling to `output` re-parents it,
        # which would otherwise mutate the list while we walk it
        for sibling in best_candidate['elem'].parent.contents[:]:
            if isinstance(sibling, NavigableString): continue
            append = False
            if sibling is best_candidate['elem']:
                append = True
            sibling_key = HashableElement(sibling)
            if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
                append = True

            if sibling.name == "p":
                link_density = self.get_link_density(sibling)
                node_content = sibling.string or ""
                node_length = len(node_content)

                if node_length > 80 and link_density < 0.25:
                    append = True
                elif node_length < 80 and link_density == 0 and re.search('\.( |$)', node_content):
                    append = True

            if append:
                output.append(sibling)

        return output

    def select_best_candidate(self, candidates):
        sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)

        self.debug("Top 5 candidates:")
        for candidate in sorted_candidates[:5]:
            elem = candidate['elem']
            self.debug("Candidate %s with score %s" % (
                describe(elem), candidate['content_score']))

        best_candidate = sorted_candidates[0] if sorted_candidates else { 'elem': self.html.find("body"), 'content_score': 0 }
        elem = best_candidate['elem']
        self.debug("Best candidate %s#%s.%s with score %s" % (
            elem.name, elem.get('id', ''), elem.get('class', ''), best_candidate['content_score']))

        return best_candidate

    def get_link_density(self, elem):
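        # e.g. (hypothetical numbers) 200 characters of text with 50 of them
        # inside <a> tags gives a link density of 50 / 200 = 0.25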
        link_length = len("".join([i.text or "" for i in elem.findAll("a")]))
        text_length = len(elem.text or "")
        return float(link_length) / max(text_length, 1)

    def score_paragraphs(self, min_text_length):
        candidates = {}
        elems = self.html.findAll("p") + self.html.findAll("td")

        for elem in elems:
            parent_node = elem.parent
            grand_parent_node = parent_node.parent
            parent_key = HashableElement(parent_node)
            grand_parent_key = HashableElement(grand_parent_node)

            inner_text = elem.string

            # If this paragraph is less than 25 characters, don't even count it.
            if (not inner_text) or len(inner_text) < min_text_length:
                continue

            if parent_key not in candidates:
                candidates[parent_key] = self.score_node(parent_node)
            if grand_parent_node and grand_parent_key not in candidates:
                candidates[grand_parent_key] = self.score_node(grand_parent_node)

            content_score = 1
            content_score += len(inner_text.split(','))
            content_score += min([(len(inner_text) / 100), 3])
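            # e.g. (hypothetical) a 250-character paragraph containing two
            # commas scores 1 + 3 (comma parts) + min(250 / 100, 3) = 6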

            candidates[parent_key]['content_score'] += content_score
            if grand_parent_node:
                candidates[grand_parent_key]['content_score'] += content_score / 2.0

        # Scale the final candidate scores based on link density. Good content should have a
        # relatively small link density (5% or less) and be mostly unaffected by this operation.
        for elem, candidate in candidates.items():
            candidate['content_score'] = candidate['content_score'] * (1 - self.get_link_density(elem))

        return candidates

    def class_weight(self, e):
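        # e.g. (hypothetical) class="article-body" matches positiveRe (+25),
        # while id="footer" matches negativeRe (-25); the weights sum additively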
        weight = 0
        if e.get('class', None):
            if REGEXES['negativeRe'].search(e['class']):
                weight -= 25

            if REGEXES['positiveRe'].search(e['class']):
                weight += 25

        if e.get('id', None):
            if REGEXES['negativeRe'].search(e['id']):
                weight -= 25

            if REGEXES['positiveRe'].search(e['id']):
                weight += 25

        return weight

    def score_node(self, elem):
        content_score = self.class_weight(elem)
        name = elem.name.lower()
        if name == "div":
            content_score += 5
        elif name == "blockquote":
            content_score += 3
        elif name == "form":
            content_score -= 3
        elif name == "th":
            content_score -= 5
        return { 'content_score': content_score, 'elem': elem }

    def debug(self, msg):
        if self.options['debug']:
            print(msg)

    def remove_unlikely_candidates(self):
        for elem in self.html.findAll():
            s = "%s%s" % (elem.get('class', ''), elem.get('id', ''))
            if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.name != 'body':
                self.debug("Removing unlikely candidate - %s" % (s,))
                elem.extract()

    def transform_misused_divs_into_paragraphs(self):
        for elem in self.html.findAll():
            if elem.name.lower() == "div":
                # transform <div>s that do not contain other block elements into <p>s
                if not REGEXES['divToPElementsRe'].search(''.join(map(str, elem.contents))):
                    self.debug("Altering div(#%s.%s) to p" % (elem.get('id', ''), elem.get('class', '')))
                    elem.name = "p"

    def tags(self, node, *tag_names):
        for tag_name in tag_names:
            for e in node.findAll(tag_name):
                yield e

    def sanitize(self, node, candidates):
        for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
            if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: header.extract()

        for elem in self.tags(node, "form", "object", "iframe", "embed"):
            elem.extract()

        # remove empty <p> tags
        for elem in node.findAll("p"):
            if not (elem.string or elem.contents):
                elem.extract()

        # Conditionally clean <table>s, <ul>s, and <div>s
        for el in self.tags(node, "table", "ul", "div"):
            weight = self.class_weight(el)
            el_key = HashableElement(el)
            if el_key in candidates:
                content_score = candidates[el_key]['content_score']
            else:
                content_score = 0
            name = el.name

            if weight + content_score < 0:
                el.extract()
                self.debug("Conditionally cleaned %s with weight %s and content score %s because score + content score was less than zero." %
                    (describe(el), weight, content_score))
            elif len((el.text or "").split(",")) < 10:
                counts = {}
                for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                    counts[kind] = len(el.findAll(kind))
                counts["li"] -= 100

                content_length = len(el.text or "") # Count the text length excluding any surrounding whitespace
                link_density = self.get_link_density(el)
                to_remove = False
                reason = ""

                if counts["img"] > counts["p"]:
                    reason = "too many images"
                    to_remove = True
                elif counts["li"] > counts["p"] and name != "ul" and name != "ol":
                    reason = "more <li>s than <p>s"
                    to_remove = True
                elif counts["input"] > (counts["p"] / 3):
                    reason = "more <input>s than a third of the <p> count"
                    to_remove = True
                elif content_length < (self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)) and (counts["img"] == 0 or counts["img"] > 2):
                    reason = "too short a content length without a single image"
                    to_remove = True
                elif weight < 25 and link_density > 0.2:
                    reason = "too many links for its weight (%s)" % weight
                    to_remove = True
                elif weight >= 25 and link_density > 0.5:
                    reason = "too many links for its weight (%s)" % weight
                    to_remove = True
                elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
                    reason = "<embed>s with too short a content length, or too many <embed>s"
                    to_remove = True

                if to_remove:
                    self.debug("Conditionally cleaned %s#%s.%s with weight %s and content score %s because it has %s." %
                        (el.name, el.get('id', ''), el.get('class', ''), weight, content_score, reason))
                    el.extract()

        for el in ([node] + node.findAll()):
            if not (self.options['attributes']):
                el.attrs = []  # strip all attributes from the output markup

        return str(node)

class HashableElement():
    def __init__(self, node):
        self.node = node
        self._path = None

    def _get_path(self):
        if self._path is None:
            reverse_path = []
            node = self.node
            while node:
                node_id = (node.name, tuple(node.attrs), node.string)
                reverse_path.append(node_id)
                node = node.parent
            self._path = tuple(reverse_path)
        return self._path
    path = property(_get_path)

    def __hash__(self):
        return hash(self.path)

    def __eq__(self, other):
        return self.path == other.path

    def __getattr__(self, name):
        return getattr(self.node, name)

def main():
    import sys
    from optparse import OptionParser
    parser = OptionParser(usage="%prog: [options] [file]")
    parser.add_option('-v', '--verbose', action='store_true')
    parser.add_option('-u', '--url', help="use URL instead of a local file")
    (options, args) = parser.parse_args()

    if not (len(args) == 1 or options.url):
        parser.print_help()
        sys.exit(1)

    file = None
    if options.url:
        import urllib
        file = urllib.urlopen(options.url)
    else:
        file = open(args[0])
    try:
        print Document(file.read(), debug=options.verbose).content()
    finally:
        file.close()

if __name__ == '__main__':
    main()
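
Usage note: the options this file actually reads from self.options are
min_text_length, retry_length, debug, and attributes. A minimal sketch of
passing them explicitly (the input string here is hypothetical):

    from readability import Document

    doc = Document(
        "<html><body><p>Some article text, long enough to score.</p></body></html>",
        min_text_length=25,  # paragraphs shorter than this are not scored
        retry_length=250,    # below this output length, retry without unlikely-candidate removal
        debug=True,          # print scoring decisions
        attributes=True,     # keep tag attributes in the sanitized output
    )
    print doc.content()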
