3
3
import re
4
4
from six .moves .urllib .request import urlopen
5
5
from six .moves .urllib .error import HTTPError
6
+ import urllib2
6
7
import shutil
7
8
import argparse
8
9
import mistune
9
10
import bs4 as BeautifulSoup
10
11
import socket
11
12
import time
13
+ import requests
14
+
15
import sys

# Force UTF-8 as the interpreter-wide default string encoding.
# This is a Python 2-only workaround: `reload` is not a builtin on Python 3
# and `sys.setdefaultencoding` does not exist there, so running these two
# calls unconditionally raises NameError on Python 3. Python 3 already
# defaults to UTF-8, so the workaround is simply skipped.
if sys.version_info[0] == 2:
    # site.py deletes sys.setdefaultencoding at startup; reloading the
    # module makes the attribute available again.
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding('utf8')
12
20
13
21
def download_pdf(link, location, name):
    """Download ``link`` and store it as ``name`` inside ``location``.

    Args:
        link: URL of the document to fetch.
        location: Directory the file is written into.
        name: File name to use inside ``location``.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status. A message is printed before the error is re-raised.
        requests.exceptions.RequestException: Any other failure reported by
            ``requests`` (connection error, timeout, ...) propagates as is.
    """
    try:
        # Same 500 s limit the previous urlopen() call used; without a
        # timeout requests may block forever on an unresponsive server.
        response = requests.get(link, timeout=500)
        # requests does not raise on error statuses by itself; without this
        # check the body of an error page would be saved as the document.
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        print('>>> Error 404: cannot be downloaded!\n ')
        raise

    # Only reached for successful responses, so failed downloads leave no
    # file behind. Binary mode keeps the payload byte-exact; the context
    # manager closes the file.
    with open(os.path.join(location, name), 'wb') as f:
        f.write(response.content)
@@ -28,9 +36,11 @@ def clean_pdf_link(link):
28
36
link = link .replace ('abs' , 'pdf' )
29
37
if not (link .endswith ('.pdf' )):
30
38
link = '.' .join ((link , 'pdf' ))
39
+
40
+ print (link )
31
41
return link
32
42
33
- def clean_text (text , replacements = {' ' : '_' , '/' : '_' , '.' : '' , '"' : '' }):
43
def clean_text(text, replacements={':': '_', ' ': '_', '/': '_', '.': '', '"': ''}):
    """Return ``text`` with every key of ``replacements`` substituted.

    Each key found in ``text`` is replaced by its mapped value. With the
    default mapping, colons, spaces and slashes become underscores while
    dots and double quotes are removed.
    """
    cleaned = text
    for old in replacements:
        new = replacements[old]
        cleaned = cleaned.replace(old, new)
    return cleaned
@@ -95,13 +105,14 @@ def shorten_title(title):
95
105
if link is not None :
96
106
link = clean_pdf_link (link .attrs ['href' ])
97
107
ext = get_extension (link )
108
+ print (ext )
98
109
if not ext in forbidden_extensions :
99
110
print (shorten_title (point .text ) + ' (' + link + ')' )
100
111
try :
101
112
name = clean_text (point .text .split ('[' + ext + ']' )[0 ])
102
113
fullname = '.' .join ((name , ext ))
103
114
if not os .path .exists ('/' .join ((current_directory , fullname )) ):
104
- download_pdf (link , current_directory , '.' .join ((name , ext )))
115
+ download_pdf (link , current_directory , '.' .join ((name , ext )))
105
116
except KeyboardInterrupt :
106
117
try :
107
118
print ("Press Ctrl-C in 1 second to quit" )
0 commit comments