Skip to content

Commit a44ce12

Browse files
committed
Fixed download.py (Empty file downloads, encoding errors)
Fixed the following errors - 1. Python defaulting to ASCII encoding instead of Unicode. Set the global default encoding to 'utf-8'. 2. Script downloading 0 KB empty file objects. Some generated filenames contained ":", which is a restricted character in filenames on Windows and macOS and was truncating filenames before the extension; added ":" to the replacements list in clean_text. 3. Download failures. Used the requests library to improve success rates of downloads, which seemed to work a bit better.
1 parent 063b7f4 commit a44ce12

File tree

1 file changed

+17
-6
lines changed

1 file changed

+17
-6
lines changed

download.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,27 @@
33
import re
44
from six.moves.urllib.request import urlopen
55
from six.moves.urllib.error import HTTPError
6+
import urllib2
67
import shutil
78
import argparse
89
import mistune
910
import bs4 as BeautifulSoup
1011
import socket
1112
import time
13+
import requests
14+
15+
# encoding=utf8
16+
import sys
17+
18+
reload(sys)
19+
sys.setdefaultencoding('utf8')
1220

1321
def download_pdf(link, location, name):
1422
try:
15-
response = urlopen(link, timeout=500)
16-
file = open(os.path.join(location, name), 'w')
17-
file.write(response.read())
18-
file.close()
23+
response = requests.get(link)
24+
with open(os.path.join(location, name), 'wb') as f:
25+
f.write(response.content)
26+
f.close()
1927
except HTTPError:
2028
print('>>> Error 404: cannot be downloaded!\n')
2129
raise
@@ -28,9 +36,11 @@ def clean_pdf_link(link):
2836
link = link.replace('abs', 'pdf')
2937
if not(link.endswith('.pdf')):
3038
link = '.'.join((link, 'pdf'))
39+
40+
print(link)
3141
return link
3242

33-
def clean_text(text, replacements = {' ': '_', '/': '_', '.': '', '"': ''}):
43+
def clean_text(text, replacements = {':': '_', ' ': '_', '/': '_', '.': '', '"': ''}):
3444
for key, rep in replacements.items():
3545
text = text.replace(key, rep)
3646
return text
@@ -95,13 +105,14 @@ def shorten_title(title):
95105
if link is not None:
96106
link = clean_pdf_link(link.attrs['href'])
97107
ext = get_extension(link)
108+
print(ext)
98109
if not ext in forbidden_extensions:
99110
print(shorten_title(point.text) + ' (' + link + ')')
100111
try:
101112
name = clean_text(point.text.split('[' + ext + ']')[0])
102113
fullname = '.'.join((name, ext))
103114
if not os.path.exists('/'.join((current_directory, fullname)) ):
104-
download_pdf(link, current_directory, '.'.join((name, ext)))
115+
download_pdf(link, current_directory, '.'.join((name, ext)))
105116
except KeyboardInterrupt:
106117
try:
107118
print("Press Ctrl-C in 1 second to quit")

0 commit comments

Comments
 (0)