#!/usr/bin/env python
from BeautifulSoup import BeautifulSoup, NavigableString
from collections import defaultdict
import re

REGEXES = {
    'unlikelyCandidatesRe': re.compile('combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor', re.I),
    'okMaybeItsACandidateRe': re.compile('and|article|body|column|main', re.I),
    'positiveRe': re.compile('article|body|content|entry|hentry|page|pagination|post|text', re.I),
    'negativeRe': re.compile('combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget', re.I),
    'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
    'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}', re.I),
    'replaceFontsRe': re.compile('<(\/?)font[^>]*>', re.I),
    'trimRe': re.compile('^\s+|\s+$'),
    'normalizeRe': re.compile('\s{2,}'),
    'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}'),
    'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
}
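# Illustrative example: remove_unlikely_candidates() below matches each
# element's class and id against these patterns, so <div class="sidebar"> is
# stripped (it matches unlikelyCandidatesRe), while
# <div class="article sidebar"> survives because "article" also matches
# okMaybeItsACandidateRe.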

def describe(node):
    if not hasattr(node, 'name'):
        return "[text]"
    return "%s#%s.%s" % (
        node.name, node.get('id', ''), node.get('class', ''))
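# For example, describe() on <div id="main" class="content"> returns
# "div#main.content"; text nodes come back as "[text]".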

class Document:
    """Extract the main readable article content from an HTML document."""
    TEXT_LENGTH_THRESHOLD = 25
    RETRY_LENGTH = 250

    def __init__(self, input, **options):
        self.input = input
        self.options = defaultdict(lambda: None)
        for k, v in options.items():
            self.options[k] = v
        self.make_html()
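    # Options used elsewhere in this class: 'min_text_length' (paragraph score
    # threshold), 'retry_length' (minimum article length before retrying without
    # pruning unlikely candidates), 'debug' (print diagnostics) and 'attributes'
    # (keep tag attributes in the output). Unset options default to None.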

    def make_html(self):
        self.html = BeautifulSoup(self.input)

    def content(self, remove_unlikely_candidates=True):
        def remove(tag):
            for i in self.html.findAll(tag):
                i.extract()
        remove('script')
        remove('style')

        if remove_unlikely_candidates: self.remove_unlikely_candidates()
        self.transform_misused_divs_into_paragraphs()
        candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
        best_candidate = self.select_best_candidate(candidates)
        article = self.get_article(candidates, best_candidate)

        cleaned_article = self.sanitize(article, candidates)
        if remove_unlikely_candidates and len(cleaned_article or '') < (self.options['retry_length'] or self.RETRY_LENGTH):
            # The result is suspiciously short: rebuild the tree and retry
            # without pruning unlikely candidates.
            self.make_html()
            return self.content(False)
        else:
            return cleaned_article
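    # Typical use (illustrative): Document(html_string, debug=True).content()
    # returns the cleaned article markup as a string.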

    def get_article(self, candidates, best_candidate):
        # Now that we have the top candidate, look through its siblings for content that might also be related.
        # Things like preambles, content split by ads that we removed, etc.

        sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
        output = BeautifulSoup("<div/>")
        # Iterate over a copy: appending a sibling to `output` removes it from
        # its parent, which would otherwise mutate the list we are looping over.
        for sibling in best_candidate['elem'].parent.contents[:]:
            if isinstance(sibling, NavigableString): continue
            append = False
            if sibling is best_candidate['elem']:
                append = True
            sibling_key = HashableElement(sibling)
            if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
                append = True

            if sibling.name == "p":
                link_density = self.get_link_density(sibling)
                node_content = sibling.string or ""
                node_length = len(node_content)

                if node_length > 80 and link_density < 0.25:
                    append = True
                elif node_length < 80 and link_density == 0 and re.search('\.( |$)', node_content):
                    append = True

            if append:
                output.append(sibling)

        return output
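    # Illustrative numbers: with a best candidate scoring 60, a sibling needs a
    # content score of at least max(10, 60 * 0.2) = 12 to be pulled in as well.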

    def select_best_candidate(self, candidates):
        sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)

        self.debug("Top 5 candidates:")
        for candidate in sorted_candidates[:5]:
            elem = candidate['elem']
            self.debug("Candidate %s with score %s" % (
                describe(elem), candidate['content_score']))

        if sorted_candidates:
            best_candidate = sorted_candidates[0]
        else:
            best_candidate = { 'elem': self.html.find("body"), 'content_score': 0 }
        elem = best_candidate['elem']
        self.debug("Best candidate %s with score %s" % (
            describe(elem), best_candidate['content_score']))

        return best_candidate

    def get_link_density(self, elem):
        link_length = len("".join([i.text or "" for i in elem.findAll("a")]))
        text_length = len(elem.text or "")
        return float(link_length) / max(text_length, 1)
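    # For example (illustrative), an element containing 200 characters of text,
    # 50 of which sit inside <a> tags, has a link density of 50 / 200 = 0.25.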

    def score_paragraphs(self, min_text_length):
        candidates = {}
        elems = self.html.findAll("p") + self.html.findAll("td")

        for elem in elems:
            parent_node = elem.parent
            grand_parent_node = parent_node.parent
            parent_key = HashableElement(parent_node)
            grand_parent_key = HashableElement(grand_parent_node)

            inner_text = elem.string

            # If this paragraph is less than 25 characters, don't even count it.
            if (not inner_text) or len(inner_text) < min_text_length:
                continue

            if parent_key not in candidates:
                candidates[parent_key] = self.score_node(parent_node)
            if grand_parent_node and grand_parent_key not in candidates:
                candidates[grand_parent_key] = self.score_node(grand_parent_node)

            content_score = 1
            content_score += len(inner_text.split(','))
            content_score += min([(len(inner_text) / 100), 3])

            candidates[parent_key]['content_score'] += content_score
            if grand_parent_node:
                candidates[grand_parent_key]['content_score'] += content_score / 2.0

        # Scale the final candidates score based on link density. Good content should have a
        # relatively small link density (5% or less) and be mostly unaffected by this operation.
        for elem, candidate in candidates.items():
            candidate['content_score'] = candidate['content_score'] * (1 - self.get_link_density(elem))

        return candidates
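    # Worked example (illustrative): a 250-character <p> containing two commas
    # adds 1 + 3 (comma-separated chunks) + 2 (length // 100, capped at 3) = 6
    # points to its parent and half that (3.0) to its grandparent, before the
    # link-density scaling above.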

    def class_weight(self, e):
        weight = 0
        if e.get('class', None):
            if REGEXES['negativeRe'].search(e['class']):
                weight -= 25

            if REGEXES['positiveRe'].search(e['class']):
                weight += 25

        if e.get('id', None):
            if REGEXES['negativeRe'].search(e['id']):
                weight -= 25

            if REGEXES['positiveRe'].search(e['id']):
                weight += 25

        return weight
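    # For instance (illustrative), class="comment" scores -25, id="content"
    # scores +25, and an element carrying both nets a weight of 0.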

    def score_node(self, elem):
        content_score = self.class_weight(elem)
        name = elem.name.lower()
        if name == "div":
            content_score += 5
        elif name == "blockquote":
            content_score += 3
        elif name == "form":
            content_score -= 3
        elif name == "th":
            content_score -= 5
        return { 'content_score': content_score, 'elem': elem }
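    # E.g. (illustrative) <div class="post"> starts at +25 + 5 = 30, while a
    # <form> with no class or id starts at -3.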

    def debug(self, msg):
        if self.options['debug']:
            print(msg)

    def remove_unlikely_candidates(self):
        for elem in self.html.findAll():
            s = "%s%s" % (elem.get('class', ''), elem.get('id', ''))
            if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.name != 'body':
                self.debug("Removing unlikely candidate - %s" % (s,))
                elem.extract()

    def transform_misused_divs_into_paragraphs(self):
        for elem in self.html.findAll():
            if elem.name.lower() == "div":
                # transform <div>s that do not contain other block elements into <p>s
                if not REGEXES['divToPElementsRe'].search(''.join(map(str, elem.contents))):
                    self.debug("Altering div(#%s.%s) to p" % (elem.get('id', ''), elem.get('class', '')))
                    elem.name = "p"
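    # Illustrative effect of the transform above: <div>just some text</div>
    # becomes <p>just some text</p>, while a <div> wrapping a <table>, <p> or
    # another <div> is left alone because divToPElementsRe matches its contents.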

    def tags(self, node, *tag_names):
        for tag_name in tag_names:
            for e in node.findAll(tag_name):
                yield e

    def sanitize(self, node, candidates):
        for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
            if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: header.extract()

        for elem in self.tags(node, "form", "object", "iframe", "embed"):
            elem.extract()

        # remove empty <p> tags
        for elem in node.findAll("p"):
            if not (elem.string or elem.contents):
                elem.extract()

        # Conditionally clean <table>s, <ul>s, and <div>s
        for el in self.tags(node, "table", "ul", "div"):
            weight = self.class_weight(el)
            el_key = HashableElement(el)
            if el_key in candidates:
                content_score = candidates[el_key]['content_score']
            else:
                content_score = 0
            name = el.name

            if weight + content_score < 0:
                el.extract()
                self.debug("Conditionally cleaned %s with weight %s and content score %s because score + content score was less than zero." %
                    (describe(el), weight, content_score))
            elif len((el.text or "").split(",")) < 10:
                counts = {}
                for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                    counts[kind] = len(el.findAll(kind))
                counts["li"] -= 100

                content_length = len(el.text or "") # Count the text length excluding any surrounding whitespace
                link_density = self.get_link_density(el)
                to_remove = False
                reason = ""

                if counts["img"] > counts["p"]:
                    reason = "too many images"
                    to_remove = True
                elif counts["li"] > counts["p"] and name != "ul" and name != "ol":
                    reason = "more <li>s than <p>s"
                    to_remove = True
                elif counts["input"] > (counts["p"] / 3):
                    reason = "fewer than 3x <p>s as <input>s"
                    to_remove = True
                elif content_length < (self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)) and (counts["img"] == 0 or counts["img"] > 2):
                    reason = "too short a content length without a single image"
                    to_remove = True
                elif weight < 25 and link_density > 0.2:
                    reason = "too many links for its weight (%s)" % weight
                    to_remove = True
                elif weight >= 25 and link_density > 0.5:
                    reason = "too many links for its weight (%s)" % weight
                    to_remove = True
                elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
                    reason = "<embed>s with too short a content length, or too many <embed>s"
                    to_remove = True

                if to_remove:
                    self.debug("Conditionally cleaned %s#%s.%s with weight %s and content score %s because it has %s." %
                        (el.name, el.get('id',''), el.get('class', ''), weight, content_score, reason))
                    el.extract()

        # Strip attributes from the output unless the 'attributes' option is set.
        # BeautifulSoup serializes a tag from .attrs, so clear it as well as attrMap.
        for el in ([node] + node.findAll()):
            if not (self.options['attributes']):
                el.attrs = []
                el.attrMap = {}

        return str(node)

class HashableElement(object):
    def __init__(self, node):
        self.node = node
        self._path = None

    def _get_path(self):
        if self._path is None:
            reverse_path = []
            node = self.node
            while node:
                node_id = (node.name, tuple(node.attrs), node.string)
                reverse_path.append(node_id)
                node = node.parent
            self._path = tuple(reverse_path)
        return self._path
    path = property(_get_path)

    def __hash__(self):
        return hash(self.path)

    def __eq__(self, other):
        return self.path == other.path

    def __getattr__(self, name):
        return getattr(self.node, name)
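# HashableElement lets parse-tree nodes be used as dictionary keys in
# score_paragraphs() and sanitize(): identity is the node's path to the root
# (name, attributes and string at each level), and every other attribute
# access is delegated to the wrapped BeautifulSoup node.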

def main():
    import sys
    from optparse import OptionParser
    parser = OptionParser(usage="%prog: [options] [file]")
    parser.add_option('-v', '--verbose', action='store_true')
    parser.add_option('-u', '--url', help="use URL instead of a local file")
    (options, args) = parser.parse_args()

    if not (len(args) == 1 or options.url):
        parser.print_help()
        sys.exit(1)

    if options.url:
        import urllib
        file = urllib.urlopen(options.url)
    else:
        file = open(args[0])
    try:
        print(Document(file.read(), debug=options.verbose).content())
    finally:
        file.close()

if __name__ == '__main__':
    main()
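# Command-line usage (script, file and URL names are illustrative):
#   python readability.py saved_page.html
#   python readability.py -v -u http://example.com/some-article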