@@ -1,9 +1,12 @@
 #!/usr/bin/env python
-from __future__ import print_function
 import logging
 import re
 import sys
+import urllib.request
+import urllib.parse
+import urllib.error
 
+from lxml.etree import tostring
 from lxml.etree import tounicode
 from lxml.etree import _ElementTree
 from lxml.html import document_fromstring
@@ -17,7 +20,6 @@
 from .htmls import get_title
 from .htmls import get_author
 from .htmls import shorten_title
-from .compat import str_, bytes_, tostring_, pattern_type
 from .debug import describe, text_content
 
 
@@ -80,14 +82,14 @@ def text_length(i):
 def compile_pattern(elements):
     if not elements:
         return None
-    elif isinstance(elements, pattern_type):
+    elif isinstance(elements, re.Pattern):
         return elements
-    elif isinstance(elements, (str_, bytes_)):
-        if isinstance(elements, bytes_):
-            elements = str_(elements, "utf-8")
-        elements = elements.split(u",")
+    elif isinstance(elements, (str, bytes)):
+        if isinstance(elements, bytes):
+            elements = str(elements, "utf-8")
+        elements = elements.split(",")
     if isinstance(elements, (list, tuple)):
-        return re.compile(u"|".join([re.escape(x.strip()) for x in elements]), re.U)
+        return re.compile("|".join([re.escape(x.strip()) for x in elements]), re.U)
     else:
         raise Exception("Unknown type for the pattern: {}".format(type(elements)))
     # assume string or string like object
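
Note: re.Pattern has been the public name of the compiled-pattern type since Python 3.8, which is what lets this branch drop the pattern_type shim from .compat. A minimal sketch of the equivalence (the sample pattern string is made up):

    import re

    pat = re.compile("comment|footer")
    assert isinstance(pat, re.Pattern)          # public alias, Python 3.8+
    assert type(re.compile("")) is re.Pattern   # what the old shim computed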
@@ -242,19 +244,15 @@ def summary(self, html_partial=False):
                         log.info("ruthless removal did not work. ")
                         ruthless = False
                         log.debug(
-                            (
                             "ended up stripping too much - "
                             "going for a safer _parse"
-                            )
                         )
                         # try again
                         continue
                     else:
                         log.debug(
-                            (
                             "Ruthless and lenient parsing did not work. "
                             "Returning raw html"
-                            )
                         )
                         article = self.html.find("body")
                         if article is None:
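
Note: the dropped parentheses were redundant because adjacent string literals are concatenated at compile time and the call's own parentheses already group them. A quick check:

    msg = ("ended up stripping too much - "
           "going for a safer _parse")
    assert msg == "ended up stripping too much - going for a safer _parse"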
@@ -272,11 +270,7 @@ def summary(self, html_partial=False):
                 return cleaned_article
         except Exception as e:
             log.exception("error getting summary: ")
-            if sys.version_info[0] == 2:
-                from .compat.two import raise_with_traceback
-            else:
-                from .compat.three import raise_with_traceback
-            raise_with_traceback(Unparseable, sys.exc_info()[2], str_(e))
+            raise Unparseable(str(e)).with_traceback(sys.exc_info()[2])
 
     def get_article(self, candidates, best_candidate, html_partial=False):
         # Now that we have the top candidate, look through its siblings for
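
Note: raise Exc(...).with_traceback(tb) is the built-in Python 3 replacement for the removed two/three compat helpers: it raises a new exception while keeping the original traceback attached. A minimal sketch, with a stand-in Unparseable class:

    import sys

    class Unparseable(ValueError):  # stand-in for readability's exception
        pass

    try:
        try:
            raise RuntimeError("boom")
        except Exception as e:
            raise Unparseable(str(e)).with_traceback(sys.exc_info()[2])
    except Unparseable as exc:
        assert exc.__traceback__ is not None  # original frames preserved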
@@ -474,7 +468,8 @@ def transform_misused_divs_into_paragraphs(self):
             # This results in incorrect results in case there is an <img>
             # buried within an <a> for example
             if not REGEXES["divToPElementsRe"].search(
-                str_(b"".join(map(tostring_, list(elem))))
+                str(b"".join(tostring(s, encoding='utf-8') for s in elem))
+                # str(b"".join(map(tostring_, list(elem))))
             ):
                 # log.debug("Altering %s to p" % (describe(elem)))
                 elem.tag = "p"
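
Note: one wrinkle in the new line: str() on a bytes object without an encoding argument returns its repr ("b'...'"), not decoded text, so the regex now searches that repr rather than the markup itself. Tag names still match, but raw.decode("utf-8") would reproduce the old str_ behavior exactly. Illustration:

    from lxml.etree import tostring
    from lxml.html import fragment_fromstring

    elem = fragment_fromstring("<div><p>text</p></div>")
    raw = b"".join(tostring(child, encoding="utf-8") for child in elem)
    assert str(raw) == "b'<p>text</p>'"          # repr, b-prefix included
    assert raw.decode("utf-8") == "<p>text</p>"  # what str_(...) returned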
@@ -501,13 +496,11 @@ def transform_misused_divs_into_paragraphs(self):
 
     def tags(self, node, *tag_names):
         for tag_name in tag_names:
-            for e in node.findall(".//%s" % tag_name):
-                yield e
+            yield from node.findall(".//%s" % tag_name)
 
     def reverse_tags(self, node, *tag_names):
         for tag_name in tag_names:
-            for e in reversed(node.findall(".//%s" % tag_name)):
-                yield e
+            yield from reversed(node.findall(".//%s" % tag_name))
 
     def sanitize(self, node, candidates):
         MIN_LEN = self.min_text_length
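
Note: yield from delegates to the inner iterable and produces exactly the items the removed loops did. A quick equivalence check on a throwaway fragment:

    from lxml.html import fragment_fromstring

    def tags_old(node, *tag_names):
        for tag_name in tag_names:
            for e in node.findall(".//%s" % tag_name):
                yield e

    def tags_new(node, *tag_names):
        for tag_name in tag_names:
            yield from node.findall(".//%s" % tag_name)

    node = fragment_fromstring("<div><p>a</p><span>b</span><p>c</p></div>")
    assert list(tags_old(node, "p", "span")) == list(tags_new(node, "p", "span"))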
@@ -594,13 +587,13 @@ def sanitize(self, node, candidates):
                     )
                     to_remove = True
                 elif weight < 25 and link_density > 0.2:
-                    reason = "too many links %.3f for its weight %s" % (
+                    reason = "too many links {:.3f} for its weight {}".format(
                         link_density,
                         weight,
                     )
                     to_remove = True
                 elif weight >= 25 and link_density > 0.5:
-                    reason = "too many links %.3f for its weight %s" % (
+                    reason = "too many links {:.3f} for its weight {}".format(
                         link_density,
                         weight,
                     )
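
Note: {:.3f} renders identically to the old %.3f, and {} to %s, so the log messages are unchanged; str.format is simply the preferred spelling once Python 2 support is gone. For example:

    link_density, weight = 0.31415, 30
    old = "too many links %.3f for its weight %s" % (link_density, weight)
    new = "too many links {:.3f} for its weight {}".format(link_density, weight)
    assert old == new == "too many links 0.314 for its weight 30"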
@@ -726,18 +719,10 @@ def main():
     file = None
     if options.url:
         headers = {"User-Agent": "Mozilla/5.0"}
-        if sys.version_info[0] == 3:
-            import urllib.request, urllib.parse, urllib.error
-
-            request = urllib.request.Request(options.url, None, headers)
-            file = urllib.request.urlopen(request)
-        else:
-            import urllib2
-
-            request = urllib2.Request(options.url, None, headers)
-            file = urllib2.urlopen(request)
+        request = urllib.request.Request(options.url, None, headers)
+        file = urllib.request.urlopen(request)
     else:
-        file = open(args[0], "rt")
+        file = open(args[0])
     try:
         doc = Document(
             file.read(),
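
Note: urllib2's Request and urlopen moved to urllib.request in Python 3, so the version check collapses to the two lines above. A standalone sketch of the same fetch, with example.com standing in for options.url:

    import urllib.request

    headers = {"User-Agent": "Mozilla/5.0"}
    request = urllib.request.Request("https://example.com/", None, headers)
    with urllib.request.urlopen(request) as response:
        raw_html = response.read()  # bytes, as file.read() returns in main()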
@@ -751,14 +736,8 @@ def main():
         result = "<h2>" + doc.short_title() + "</h2><br/>" + doc.summary()
         open_in_browser(result)
     else:
-        enc = (
-            sys.__stdout__.encoding or "utf-8"
-        )  # XXX: this hack could not always work, better to set PYTHONIOENCODING
         result = "Title:" + doc.short_title() + "\n" + doc.summary()
-        if sys.version_info[0] == 3:
-            print(result)
-        else:
-            print(result.encode(enc, "replace"))
+        print(result)
     finally:
         file.close()
 