| 
 | 1 | +# guiscrape.py  | 
 | 2 | +from tkinter import *  | 
 | 3 | +from tkinter import ttk, filedialog, messagebox  | 
 | 4 | +import base64  | 
 | 5 | +import json  | 
 | 6 | +from pathlib import Path  | 
 | 7 | + | 
 | 8 | +from bs4 import BeautifulSoup  | 
 | 9 | +import requests  | 
 | 10 | + | 
 | 11 | + | 
# Shared state for the GUI callbacks. fetch_url() stores the scraped images
# under the 'images' key as a list of {'name': ..., 'url': ...} dicts, and
# save(), save_images() and save_json() read that list back.
config = {}
 | 13 | + | 
 | 14 | + | 
 | 15 | +def fetch_url():  | 
 | 16 | +    url = _url.get()  | 
 | 17 | +    config['images'] = []  | 
 | 18 | +    _images.set(())  # initialised as an empty tuple  | 
 | 19 | +    try:  | 
 | 20 | +        page = requests.get(url)  | 
 | 21 | +    except requests.RequestException as err:  | 
 | 22 | +        sb(str(err))  | 
 | 23 | +    else:  | 
 | 24 | +        soup = BeautifulSoup(page.content, 'html.parser')  | 
 | 25 | +        images = fetch_images(soup, url)  | 
 | 26 | +        if images:  | 
 | 27 | +            _images.set(tuple(img['name'] for img in images))  | 
 | 28 | +            sb('Images found: {}'.format(len(images)))  | 
 | 29 | +        else:  | 
 | 30 | +            sb('No images found')  | 
 | 31 | +        config['images'] = images  | 
 | 32 | + | 
 | 33 | + | 
 | 34 | +def fetch_images(soup, base_url):  | 
 | 35 | +    images = []  | 
 | 36 | +    for img in soup.findAll('img'):  | 
 | 37 | +        src = img.get('src')  | 
 | 38 | +        img_url = f'{base_url}/{src}'  | 
 | 39 | +        name = img_url.split('/')[-1]  | 
 | 40 | +        images.append(dict(name=name, url=img_url))  | 
 | 41 | +    return images  | 
 | 42 | + | 
 | 43 | + | 
 | 44 | +def save():  | 
 | 45 | +    if not config.get('images'):  | 
 | 46 | +        alert('No images to save')  | 
 | 47 | +        return  | 
 | 48 | + | 
 | 49 | +    if _save_method.get() == 'img':  | 
 | 50 | +        dirname = filedialog.askdirectory(mustexist=True)  | 
 | 51 | +        save_images(dirname)  | 
 | 52 | +    else:  | 
 | 53 | +        filename = filedialog.asksaveasfilename(  | 
 | 54 | +            initialfile='images.json',  | 
 | 55 | +            filetypes=[('JSON', '.json')])  | 
 | 56 | +        save_json(filename)  | 
 | 57 | + | 
 | 58 | + | 
 | 59 | +def save_images(dirname):  | 
 | 60 | +    if dirname and config.get('images'):  | 
 | 61 | +        for img in config['images']:  | 
 | 62 | +            img_data = requests.get(img['url']).content  | 
 | 63 | +            filename = Path(dirname).joinpath(img['name'])  | 
 | 64 | +            with open(filename, 'wb') as f:  | 
 | 65 | +                f.write(img_data)  | 
 | 66 | +        alert('Done')  | 
 | 67 | + | 
 | 68 | + | 
 | 69 | +def save_json(filename):  | 
 | 70 | +    if filename and config.get('images'):  | 
 | 71 | +        data = {}  | 
 | 72 | +        for img in config['images']:  | 
 | 73 | +            img_data = requests.get(img['url']).content  | 
 | 74 | +            b64_img_data = base64.b64encode(img_data)  | 
 | 75 | +            str_img_data = b64_img_data.decode('utf-8')  | 
 | 76 | +            data[img['name']] = str_img_data  | 
 | 77 | + | 
 | 78 | +        with open(filename, 'w') as ijson:  | 
 | 79 | +            ijson.write(json.dumps(data))  | 
 | 80 | +        alert('Done')  | 
 | 81 | + | 
 | 82 | + | 
 | 83 | +def sb(msg):  | 
 | 84 | +    _status_msg.set(msg)  | 
 | 85 | + | 
 | 86 | + | 
 | 87 | +def alert(msg):  | 
 | 88 | +    messagebox.showinfo(message=msg)  | 
 | 89 | + | 
 | 90 | + | 
# Build the GUI and start the event loop. The variables created here
# (_url, _images, _save_method, _status_msg) are module-level names that the
# callback functions above read and write.
if __name__ == "__main__":

    # Top-level window and the frame that holds everything but the status bar.
    _root = Tk()
    _root.title('Scrape app')

    _mainframe = ttk.Frame(_root, padding='5 5 5 5')
    _mainframe.grid(row=0, column=0, sticky=(E, W, N, S))

    # --- URL row: entry box plus the button that triggers fetch_url() ---
    _url_frame = ttk.LabelFrame(
        _mainframe, text='URL', padding='5 5 5 5')
    _url_frame.grid(row=0, column=0, sticky=(E, W))
    _url_frame.columnconfigure(0, weight=1)
    _url_frame.rowconfigure(0, weight=1)

    # Text of the entry box; read by fetch_url().
    _url = StringVar()
    _url.set('http://localhost:8000')
    _url_entry = ttk.Entry(
        _url_frame, width=40, textvariable=_url)
    _url_entry.grid(row=0, column=0, sticky=(E, W, S, N), padx=5)

    _fetch_btn = ttk.Button(
        _url_frame, text='Fetch info', command=fetch_url)
    _fetch_btn.grid(row=0, column=1, sticky=W, padx=5)

    # --- Content row: image listbox, its scrollbar and the save options ---
    _img_frame = ttk.LabelFrame(
        _mainframe, text='Content', padding='9 0 0 0')
    _img_frame.grid(row=1, column=0, sticky=(N, S, E, W))

    # Listbox contents; fetch_url() sets this to a tuple of image names.
    _images = StringVar()
    _img_listbox = Listbox(
        _img_frame, listvariable=_images, height=6, width=25)
    _img_listbox.grid(row=0, column=0, sticky=(E, W), pady=5)
    # Wire the scrollbar and the listbox to each other in both directions.
    _scrollbar = ttk.Scrollbar(
        _img_frame, orient=VERTICAL, command=_img_listbox.yview)
    _scrollbar.grid(row=0, column=1, sticky=(S, N), pady=6)
    _img_listbox.configure(yscrollcommand=_scrollbar.set)

    _radio_frame = ttk.Frame(_img_frame)
    _radio_frame.grid(row=0, column=2, sticky=(N, S, W, E))

    _choice_lbl = ttk.Label(
        _radio_frame, text="Choose how to save images")
    _choice_lbl.grid(row=0, column=0, padx=5, pady=5)

    # Selected save mode, 'img' or 'json'; read by save(). Defaults to 'img'.
    _save_method = StringVar()
    _save_method.set('img')
    _img_only_radio = ttk.Radiobutton(
        _radio_frame, text='As Images', variable=_save_method,
        value='img')
    _img_only_radio.grid(
        row=1, column=0, padx=5, pady=2, sticky=W)
    _img_only_radio.configure(state='normal')
    _json_radio = ttk.Radiobutton(
        _radio_frame, text='As JSON', variable=_save_method,
        value='json')
    _json_radio.grid(row=2, column=0, padx=5, pady=2, sticky=W)

    # --- Button that triggers save() ---
    _scrape_btn = ttk.Button(
        _mainframe, text='Scrape!', command=save)
    _scrape_btn.grid(row=2, column=0, sticky=E, pady=5)

    # --- Status bar: placed in the root window's second grid row ---
    _status_frame = ttk.Frame(
        _root, relief='sunken', padding='2 2 2 2')
    _status_frame.grid(row=1, column=0, sticky=(E, W, S))

    # Status bar text; updated through sb().
    _status_msg = StringVar()
    _status_msg.set('Type a URL to start scraping...')
    _status = ttk.Label(
        _status_frame, textvariable=_status_msg, anchor=W)
    _status.grid(row=0, column=0, sticky=(E, W))

    # Hand control to Tk; returns only when the window is closed.
    _root.mainloop()
 | 163 | + | 
 | 164 | + | 
 | 165 | +"""  | 
 | 166 | +Example of reading a JSON file:  | 
 | 167 | +
  | 
 | 168 | +with open('images.json', 'r') as f:  | 
 | 169 | +    data = json.loads(f.read())  | 
 | 170 | +
  | 
 | 171 | +for (name, b64val) in data.items():  | 
 | 172 | +    with open(name, 'wb') as f:  | 
 | 173 | +        f.write(base64.b64decode(b64val))  | 
 | 174 | +"""  | 
0 commit comments