Skip to content

Commit d4c7ab6

Browse files
committed
ch12
1 parent 3cebc2f commit d4c7ab6

File tree

11 files changed

+320
-0
lines changed

11 files changed

+320
-0
lines changed

ch12/guiscrape.py

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
# guiscrape.py
2+
from tkinter import *
3+
from tkinter import ttk, filedialog, messagebox
4+
import base64
5+
import json
6+
from pathlib import Path
7+
8+
from bs4 import BeautifulSoup
9+
import requests
10+
11+
12+
# Shared application state. The 'images' key holds the list of image
# dicts produced by the most recent fetch and is read by the save functions.
config = {}
13+
14+
15+
def fetch_url():
    """Fetch the page at the URL typed by the user and list its images.

    Reads the URL from ``_url``, clears any previous results, downloads
    the page and stores the images found both in ``config['images']`` and
    in the ``_images`` listbox variable. The outcome, including request
    errors, is reported in the status bar through ``sb``.
    """
    url = _url.get()
    # Reset first so a failed fetch never leaves stale results around.
    config['images'] = []
    _images.set(())  # initialised as an empty tuple
    try:
        # Without a timeout a dead host would block this GUI callback
        # forever; raise_for_status() reports 4xx/5xx responses instead
        # of scraping an error page.
        page = requests.get(url, timeout=10)
        page.raise_for_status()
    except requests.RequestException as err:
        sb(str(err))
    else:
        soup = BeautifulSoup(page.content, 'html.parser')
        images = fetch_images(soup, url)
        if images:
            _images.set(tuple(img['name'] for img in images))
            sb(f'Images found: {len(images)}')
        else:
            sb('No images found')
        config['images'] = images
32+
33+
34+
def fetch_images(soup, base_url):
    """Collect the images referenced by ``<img>`` tags in a parsed page.

    Args:
        soup: Parsed document exposing ``find_all`` (a ``BeautifulSoup``
            object).
        base_url: URL the page was fetched from; each ``src`` value is
            resolved against it.

    Returns:
        A list of ``{'name': ..., 'url': ...}`` dicts, one per ``<img>``
        tag that has a non-empty ``src``, in document order.
    """
    from urllib.parse import urljoin, urlparse

    images = []
    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            # An <img> without src has nothing to download.
            continue
        # urljoin handles relative, root-relative and absolute src values.
        img_url = urljoin(base_url, src)
        # Take the name from the path only, so query strings or fragments
        # never end up in the file name.
        name = urlparse(img_url).path.split('/')[-1]
        images.append(dict(name=name, url=img_url))
    return images
42+
43+
44+
def save():
    """Ask the user where to save and dispatch to the chosen save routine.

    Shows an alert and returns when ``config`` holds no images. Otherwise
    the ``_save_method`` radio value selects between a directory chooser
    (``'img'``) followed by ``save_images`` and a save-file dialog
    followed by ``save_json``.
    """
    if not config.get('images'):
        alert('No images to save')
        return

    if _save_method.get() != 'img':
        target_file = filedialog.asksaveasfilename(
            initialfile='images.json',
            filetypes=[('JSON', '.json')])
        save_json(target_file)
        return

    target_dir = filedialog.askdirectory(mustexist=True)
    save_images(target_dir)
57+
58+
59+
def save_images(dirname):
    """Download every image in ``config['images']`` into *dirname*.

    Does nothing when *dirname* is empty or there are no images.
    Each file is named after the image's ``'name'`` entry. The HTTP
    status of the downloads is not checked. Shows a 'Done' alert once
    all files are written.
    """
    if not (dirname and config.get('images')):
        return

    target_dir = Path(dirname)
    for entry in config['images']:
        response = requests.get(entry['url'])
        with open(target_dir.joinpath(entry['name']), 'wb') as out:
            out.write(response.content)
    alert('Done')
67+
68+
69+
def save_json(filename):
    """Download every image in ``config['images']`` into one JSON file.

    Does nothing when *filename* is empty or there are no images.
    The file maps each image name to its content, base64-encoded and
    decoded to a UTF-8 string. The HTTP status of the downloads is not
    checked. Shows a 'Done' alert once the file is written.
    """
    if not filename or not config.get('images'):
        return

    encoded = {}
    for entry in config['images']:
        raw = requests.get(entry['url']).content
        encoded[entry['name']] = base64.b64encode(raw).decode('utf-8')

    with open(filename, 'w') as out:
        out.write(json.dumps(encoded))
    alert('Done')
81+
82+
83+
def sb(msg):
    """Show *msg* in the status bar by updating ``_status_msg``."""
    _status_msg.set(msg)
85+
86+
87+
def alert(msg):
    """Pop up an informational message box displaying *msg*."""
    messagebox.showinfo(message=msg)
89+
90+
91+
if __name__ == "__main__":

    # Root window and the outer frame that hosts every other widget.
    _root = Tk()
    _root.title('Scrape app')

    _mainframe = ttk.Frame(_root, padding='5 5 5 5')
    _mainframe.grid(row=0, column=0, sticky=(E, W, N, S))

    # URL row: a text entry plus the button that triggers fetch_url.
    _url_frame = ttk.LabelFrame(_mainframe, text='URL', padding='5 5 5 5')
    _url_frame.grid(row=0, column=0, sticky=(E, W))
    _url_frame.columnconfigure(0, weight=1)
    _url_frame.rowconfigure(0, weight=1)

    _url = StringVar(value='http://localhost:8000')
    _url_entry = ttk.Entry(_url_frame, width=40, textvariable=_url)
    _url_entry.grid(row=0, column=0, sticky=(E, W, S, N), padx=5)

    _fetch_btn = ttk.Button(_url_frame, text='Fetch info', command=fetch_url)
    _fetch_btn.grid(row=0, column=1, sticky=W, padx=5)

    # Content row: listbox of image names with its vertical scrollbar.
    _img_frame = ttk.LabelFrame(_mainframe, text='Content', padding='9 0 0 0')
    _img_frame.grid(row=1, column=0, sticky=(N, S, E, W))

    _images = StringVar()
    _img_listbox = Listbox(
        _img_frame, listvariable=_images, height=6, width=25)
    _img_listbox.grid(row=0, column=0, sticky=(E, W), pady=5)
    _scrollbar = ttk.Scrollbar(
        _img_frame, orient=VERTICAL, command=_img_listbox.yview)
    _scrollbar.grid(row=0, column=1, sticky=(S, N), pady=6)
    _img_listbox.configure(yscrollcommand=_scrollbar.set)

    # Radio buttons choosing the save format, placed beside the listbox.
    _radio_frame = ttk.Frame(_img_frame)
    _radio_frame.grid(row=0, column=2, sticky=(N, S, W, E))

    _choice_lbl = ttk.Label(_radio_frame, text="Choose how to save images")
    _choice_lbl.grid(row=0, column=0, padx=5, pady=5)

    _save_method = StringVar(value='img')
    _img_only_radio = ttk.Radiobutton(
        _radio_frame, text='As Images', variable=_save_method, value='img')
    _img_only_radio.grid(row=1, column=0, padx=5, pady=2, sticky=W)
    _img_only_radio.configure(state='normal')
    _json_radio = ttk.Radiobutton(
        _radio_frame, text='As JSON', variable=_save_method, value='json')
    _json_radio.grid(row=2, column=0, padx=5, pady=2, sticky=W)

    # Button that triggers the save() callback.
    _scrape_btn = ttk.Button(_mainframe, text='Scrape!', command=save)
    _scrape_btn.grid(row=2, column=0, sticky=E, pady=5)

    # Status bar along the bottom of the root window; sb() writes to it.
    _status_frame = ttk.Frame(_root, relief='sunken', padding='2 2 2 2')
    _status_frame.grid(row=1, column=0, sticky=(E, W, S))

    _status_msg = StringVar(value='Type a URL to start scraping...')
    _status = ttk.Label(_status_frame, textvariable=_status_msg, anchor=W)
    _status.grid(row=0, column=0, sticky=(E, W))

    _root.mainloop()
163+
164+
165+
"""
166+
Example on reading a JSON file:
167+
168+
with open('images.json', 'r') as f:
169+
data = json.loads(f.read())
170+
171+
for (name, b64val) in data.items():
172+
with open(name, 'wb') as f:
173+
f.write(base64.b64decode(b64val))
174+
"""

ch12/requirements/requirements.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
beautifulsoup4
2+
requests

ch12/requirements/requirements.txt

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#
2+
# This file is autogenerated by pip-compile with python 3.9
3+
# To update, run:
4+
#
5+
# pip-compile requirements.in
6+
#
7+
beautifulsoup4==4.9.3
8+
# via -r requirements.in
9+
certifi==2021.5.30
10+
# via requests
11+
charset-normalizer==2.0.3
12+
# via requests
13+
idna==3.2
14+
# via requests
15+
requests==2.26.0
16+
# via -r requirements.in
17+
soupsieve==2.2.1
18+
# via beautifulsoup4
19+
urllib3==1.26.6
20+
# via requests

ch12/scrape.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
# scrape.py
2+
import argparse
3+
import base64
4+
import json
5+
from pathlib import Path
6+
7+
from bs4 import BeautifulSoup
8+
import requests
9+
10+
11+
def scrape(url, format_, type_):
    """Download *url*, pick out its images and save them.

    Args:
        url: Address of the page to scrape.
        format_: ``'img'`` to save individual files; any other value
            saves a single JSON file.
        type_: ``'all'``, ``'png'`` or ``'jpg'``; passed to
            ``filter_images``.

    Request failures are printed rather than raised.
    """
    try:
        # Without a timeout an unresponsive host would hang the script;
        # raise_for_status() reports 4xx/5xx responses instead of
        # scraping an error page.
        page = requests.get(url, timeout=10)
        page.raise_for_status()
    except requests.RequestException as err:
        print(str(err))
    else:
        soup = BeautifulSoup(page.content, 'html.parser')
        images = fetch_images(soup, url)
        images = filter_images(images, type_)
        save(images, format_)
21+
22+
23+
def fetch_images(soup, base_url):
    """Collect the images referenced by ``<img>`` tags in a parsed page.

    Args:
        soup: Parsed document exposing ``find_all`` (a ``BeautifulSoup``
            object).
        base_url: URL the page was fetched from; each ``src`` value is
            resolved against it.

    Returns:
        A list of ``{'name': ..., 'url': ...}`` dicts, one per ``<img>``
        tag that has a non-empty ``src``, in document order.
    """
    from urllib.parse import urljoin, urlparse

    images = []
    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            # An <img> without src has nothing to download.
            continue
        # urljoin handles relative, root-relative and absolute src values.
        img_url = urljoin(base_url, src)
        # Take the name from the path only, so query strings or fragments
        # never end up in the file name.
        name = urlparse(img_url).path.split('/')[-1]
        images.append(dict(name=name, url=img_url))
    return images
32+
33+
34+
def filter_images(images, type_):
    """Keep only the images whose file extension matches *type_*.

    Args:
        images: List of image dicts, each with a ``'name'`` key.
        type_: ``'all'`` returns *images* unchanged; ``'png'`` and
            ``'jpg'`` select by extension (``'jpg'`` also accepts
            ``.jpeg``).

    Returns:
        The original list for ``'all'``, otherwise a new filtered list.
    """
    if type_ == 'all':
        return images

    extensions_by_type = {
        'png': ['.png'],
        'jpg': ['.jpg', '.jpeg'],
    }
    selected = []
    for image in images:
        # The lookup stays inside the loop, so an empty input never
        # touches the mapping.
        if matches_extension(image['name'], extensions_by_type[type_]):
            selected.append(image)
    return selected
45+
46+
47+
def matches_extension(filename, extension_list):
    """Return True when *filename*'s suffix, lowercased, is in *extension_list*.

    Entries in *extension_list* are expected in lowercase with the
    leading dot, for example ``'.png'``.
    """
    suffix = Path(filename).suffix.lower()
    return suffix in extension_list
50+
51+
52+
def save(images, format_):
    """Save *images* in the requested format and print the outcome.

    Prints 'No images to save.' when *images* is empty. Otherwise
    ``'img'`` saves individual files through ``save_images``, any other
    value saves through ``save_json``, and 'Done' is printed afterwards.
    """
    if not images:
        print('No images to save.')
        return

    writer = save_images if format_ == 'img' else save_json
    writer(images)
    print('Done')
61+
62+
63+
def save_images(images):
    """Download each image and write it to the current directory.

    Every file is named after the image's ``'name'`` entry. The HTTP
    status of the downloads is not checked.
    """
    for entry in images:
        response = requests.get(entry['url'])
        with open(entry['name'], 'wb') as out:
            out.write(response.content)
68+
69+
70+
def save_json(images):
    """Download each image and store them all in ``images.json``.

    The file, written to the current directory, maps each image name to
    its content, base64-encoded and decoded to a UTF-8 string. The HTTP
    status of the downloads is not checked.
    """
    encoded = {}
    for entry in images:
        raw = requests.get(entry['url']).content
        encoded[entry['name']] = base64.b64encode(raw).decode('utf-8')

    with open('images.json', 'w') as out:
        out.write(json.dumps(encoded))
80+
81+
82+
if __name__ == "__main__":

    # Command-line interface: two optional switches plus the target URL.
    parser = argparse.ArgumentParser(description='Scrape a webpage.')

    parser.add_argument(
        '-t', '--type',
        choices=['all', 'png', 'jpg'],
        default='all',
        help='The image type we want to scrape.',
    )
    parser.add_argument(
        '-f', '--format',
        choices=['img', 'json'],
        default='img',
        help='The format images are saved to.',
    )
    parser.add_argument(
        'url',
        help='The URL we want to scrape for images.',
    )

    args = parser.parse_args()
    scrape(args.url, args.format, args.type)
32.3 KB
Loading
24.6 KB
Loading
36.7 KB
Loading
58.3 KB
Loading
71.1 KB
Loading

ch12/simple_server/index.html

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head><title>Cool Owls!</title></head>
4+
<body>
5+
<h1>Welcome to our owl gallery</h1>
6+
<div>
7+
<img src="img/owl-alcohol.png" height="128" />
8+
<img src="img/owl-book.png" height="128" />
9+
<img src="img/owl-books.png" height="128" />
10+
<img src="img/owl-ebook.jpg" height="128" />
11+
<img src="img/owl-rose.jpeg" height="128" />
12+
</div>
13+
<p>Do you like these owls?</p>
14+
</body>
15+
</html>

0 commit comments

Comments
 (0)