Fixed download.py (Empty file downloads, encoding errors)

shreyaspadhy · shreyaspadhy · commit a44ce12889cd · 2017-07-04T14:11:10.000+05:30
Fixed the following errors -
1. Python defaulting to ASCII encoding instead of Unicode.
Set global encoding to 'utf-8'
2. Script downloading 0kb empty FILE objects.
Some generated filenames contained ":", which is a restricted
character in filenames on Windows and macOS and was truncating filenames
before the extension. Added ":" to the replacements list in clean_text.
3. Download failures.
Used the requests library to improve success rates of downloads, which
seemed to work a bit better.
diff --git a/download.py b/download.py
@@ -3,19 +3,27 @@
 import re
 from six.moves.urllib.request import urlopen
 from six.moves.urllib.error import HTTPError
+import urllib2
 import shutil
 import argparse
 import mistune
 import bs4 as BeautifulSoup
 import socket
 import time
+import requests
+
+# encoding=utf8  
+import sys  
+
+reload(sys)  
+sys.setdefaultencoding('utf8')
 
 def download_pdf(link, location, name):
     try:
-        response = urlopen(link, timeout=500)
-        file = open(os.path.join(location, name), 'w')
-        file.write(response.read())
-        file.close()
+        response = requests.get(link)
+        with open(os.path.join(location, name), 'wb') as f:
+        	f.write(response.content)
+        	f.close()
     except HTTPError:
         print('>>> Error 404: cannot be downloaded!\n') 
         raise   
@@ -28,9 +36,11 @@ def clean_pdf_link(link):
         link = link.replace('abs', 'pdf')   
         if not(link.endswith('.pdf')):
             link = '.'.join((link, 'pdf'))
+
+    print(link)
     return link
 
-def clean_text(text, replacements = {' ': '_', '/': '_', '.': '', '"': ''}):
+def clean_text(text, replacements = {':': '_', ' ': '_', '/': '_', '.': '', '"': ''}):
     for key, rep in replacements.items():
         text = text.replace(key, rep)
     return text    
@@ -95,13 +105,14 @@ def shorten_title(title):
                 if link is not None:
                     link = clean_pdf_link(link.attrs['href'])
                     ext = get_extension(link)
+                    print(ext)
                     if not ext in forbidden_extensions:
                         print(shorten_title(point.text) + ' (' + link + ')')
                         try:
                             name = clean_text(point.text.split('[' + ext + ']')[0])
                             fullname = '.'.join((name, ext))
                             if not os.path.exists('/'.join((current_directory, fullname)) ):
-                               download_pdf(link, current_directory, '.'.join((name, ext)))
+                                download_pdf(link, current_directory, '.'.join((name, ext)))
                         except KeyboardInterrupt:
                             try:
                                 print("Press Ctrl-C in 1 second to quit")