
Commit 73d2748 (1 parent: 1472a35)

Completed support of binary formats

3 files changed (+17, -4 lines)


README.txt

Lines changed: 8 additions & 0 deletions
@@ -8,6 +8,10 @@
 - Pages that return only Javascript with a text/html mimetype will be requested again with Selenium using the PhantomJS browser.
 - Additional functionality is available to handle an input file containing a list of files to download.

+=== Requirements ===
+
+Curl for downloading binary files
+
 === RUN.PY Usage ===

 Edit config.py (Explanation below)
@@ -21,6 +25,8 @@ python download.py -i <input_file> -o <output_dir>

 mimetypes_list is an array of mimetypes that determines which files will be downloaded, provided they pass the regular expression filters.

+binary_mimetypes_list is an array of mimetypes that determines which files will be downloaded as binary files using Curl, provided they pass the regular expression filters.
+
 file_extensions_list is an array of file extensions that determines which files will be downloaded, provided they pass the regular expression filters.

 *Note: It will take less time to process each URL if one or the other of the above are used rather than both.
@@ -41,6 +47,8 @@ ignore_query_strings is a boolean. Setting this to True means that when new URL

 mimetypes_list = [ 'text/html' ]

+binary_mimetypes_list = [ 'pdf', 'video', 'audio', 'image' ]
+
 file_extensions_list = [ '.txt' ]

 request_delay = 0
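
To make the mimetypes_list / binary_mimetypes_list distinction concrete, here is a minimal sketch of how a response's Content-Type header could be checked against binary_mimetypes_list before handing the download to Curl. The helper is_binary_mimetype, the substring matching, and the placeholder URL and path are assumptions for illustration; the actual filtering in run.py may differ.

import os
import pipes
import requests

binary_mimetypes_list = [ 'pdf', 'video', 'audio', 'image' ]

def is_binary_mimetype(content_type):
    # Assumed behaviour: substring match, so 'application/pdf' matches 'pdf'
    # and 'image/jpeg' matches 'image'
    return any(m in content_type for m in binary_mimetypes_list)

final_url = "http://example.com/files/report.pdf"   # placeholder URL
filepath = "output/report.pdf"                       # placeholder output path

response = requests.head(final_url, allow_redirects=True)
if is_binary_mimetype(response.headers.get('Content-Type', '')):
    # Hand the download off to Curl, as run.py does for binary files
    os.system("curl -o %s %s" % (pipes.quote(filepath), pipes.quote(final_url)))
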

config.py

Lines changed: 8 additions & 3 deletions
@@ -1,12 +1,17 @@
-mimetypes_list = [ ]
+mimetypes_list = [ 'html' ]

-binary_mimetypes_list = [ 'pdf', 'video', 'audio' ]
+binary_mimetypes_list = [ 'pdf', 'video', 'audio', 'image' ]

-file_extensions_list = [ '.html' ]
+file_extensions_list = [ ]

 request_delay = 0

 urls_to_crawl = [
+    {
+        "url": "http://www.dalailama.com/webcasts/post/360-meeting-with-the-shia-and-sunni-communities-in-leh",
+        "follow_links_containing": "dalailama.com",
+        "ignore_query_strings": True,
+    },
     {
         "url": "http://www.cuyoo.com/article-22417-1.html",
         "follow_links_containing": "http://www.cuyoo.com/article-22417-1.html",

run.py

Lines changed: 1 addition & 1 deletion
@@ -104,7 +104,7 @@ def crawl_url():
         print "Writing binary file: ", final_url
         encoding_used = 'binary'
         filepath = get_filepath(final_url, encoding_used, output_dir)
-        os.system( "wget -o %s %s" % (filepath , final_url) )
+        os.system( "curl -o %s %s" % (filepath , final_url) )
     else:
         if not page_source:
             print "Requesting URL with Python Requests: ", final_url
