1+ from requests_html import HTMLSession
2+ import requests
3+ from tqdm import tqdm
4+ from bs4 import BeautifulSoup as bs
5+ from urllib .parse import urljoin , urlparse
6+
7+ import os
8+
9+
def is_valid(url):
    """Return True if `url` has both a scheme and a network location."""
    parts = urlparse(url)
    return all((parts.scheme, parts.netloc))
16+
17+
def get_all_images(url):
    """
    Return a list of absolute, validated image URLs found on the page at `url`.

    The page's JavaScript is rendered first so dynamically inserted <img>
    tags are included. Query strings are stripped from every image URL.
    """
    # FIX: the session owns a headless-browser/connection pool — the original
    # never closed it, leaking the browser process on every call.
    session = HTMLSession()
    try:
        response = session.get(url)
        # execute JavaScript so lazily/dynamically loaded images appear in the DOM
        response.html.render()
        soup = bs(response.html.html, "html.parser")
    finally:
        session.close()

    urls = []
    for img in tqdm(soup.find_all("img"), "Extracting images"):
        # some pages lazy-load images via data-src instead of src
        img_url = img.attrs.get("src") or img.attrs.get("data-src")
        if not img_url:
            # no usable source attribute — skip this tag
            continue
        # resolve relative paths against the page URL
        img_url = urljoin(url, img_url)
        # drop query strings such as '/hsts-pixel.gif?c=3.2.5'
        # (partition returns the whole string unchanged when '?' is absent)
        img_url = img_url.partition("?")[0]
        if is_valid(img_url):
            urls.append(img_url)
    return urls
48+
49+
def download(url, pathname):
    """
    Download the file at `url` into the directory `pathname` (created if missing).

    Streams the response in 1 KiB chunks and shows a byte-based progress bar.
    """
    # create the target directory if it does not exist yet
    os.makedirs(pathname, exist_ok=True)

    # FIX: close the streamed response when done (the original leaked the
    # connection by never closing it).
    with requests.get(url, stream=True) as response:
        # total size in bytes; 0 when the server omits Content-Length
        file_size = int(response.headers.get("Content-Length", 0))

        # last path segment; FIX: fall back when the URL ends with '/'
        # (the original produced an empty filename and crashed on open)
        basename = url.split("/")[-1] or "index"
        filename = os.path.join(pathname, basename)

        # BUG FIX: the original passed iter_content() as tqdm's iterable AND
        # called progress.update(len(data)), double-counting every chunk
        # (each iteration advanced the bar by 1, then again by the chunk size).
        # Here tqdm only tracks bytes via explicit update() calls.
        progress = tqdm(
            desc=f"Downloading {filename}",
            total=file_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
        )
        with progress, open(filename, "wb") as f:
            for data in response.iter_content(1024):
                f.write(data)
                progress.update(len(data))
74+
75+
def main(url, path):
    """Scrape every image URL from `url` and download each into `path`."""
    for img_url in get_all_images(url):
        download(img_url, path)
82+
83+
84+
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="This script downloads all images from a web page")
    parser.add_argument("url", help="The URL of the web page you want to download images")
    parser.add_argument("-p", "--path", help="The Directory you want to store your images, default is the domain of URL passed")
    args = parser.parse_args()

    # when no output directory is given, default to the URL's domain name
    destination = args.path or urlparse(args.url).netloc

    main(args.url, destination)
# (stray "0 commit comments" artifact from the page this file was copied from)