8
8
9
9
def download_pdf(link, location, name):
    """Download the PDF at *link* and save it as *name* under *location*.

    Parameters:
        link     -- URL of the PDF to fetch.
        location -- directory the file is written into (must exist).
        name     -- filename (including extension) for the saved file.

    Raises:
        urllib2.HTTPError -- re-raised after printing, so the caller can
                             record the failure.
    A socket timeout is reported but swallowed (best-effort download).
    """
    try:
        # Generous timeout: some hosts serving these papers respond slowly.
        response = urllib2.urlopen(link, timeout=500)
        # 'wb', not 'w': PDF is binary data — text mode would corrupt it
        # on platforms that translate newlines.
        # 'with' guarantees the handle is closed even if write() raises.
        with open(os.path.join(location, name), 'wb') as out:
            out.write(response.read())
    except urllib2.HTTPError:
        print('>>> Error 404: cannot be downloaded!\n ')
        raise
    except socket.timeout:
        print(" ".join(("can't download", link, "due to connection timeout!")))
18
20
19
21
def clean_pdf_link (link ):
20
22
if 'arxiv' in link :
@@ -81,7 +83,8 @@ def shorten_title(title):
81
83
current_directory = h1_directory
82
84
elif point .name == 'h2' :
83
85
current_directory = os .path .join (h1_directory , clean_text (point .text ))
84
- os .makedirs (current_directory )
86
+ if not os .path .exists (current_directory ):
87
+ os .makedirs (current_directory )
85
88
print_title (point .text )
86
89
87
90
if point .name == 'p' :
@@ -93,7 +96,9 @@ def shorten_title(title):
93
96
print (shorten_title (point .text ) + ' (' + link + ')' )
94
97
try :
95
98
name = clean_text (point .text .split ('[' + ext + ']' )[0 ])
96
- download_pdf (link , current_directory , '.' .join ((name , ext )))
99
+ fullname = '.' .join ((name , ext ))
100
+ if not os .path .exists ('/' .join ((current_directory , fullname )) ):
101
+ download_pdf (link , current_directory , '.' .join ((name , ext )))
97
102
except :
98
103
failures .append (point .text )
99
104
0 commit comments