Skip to content

Commit ec82582

Browse files
author
Karan Goel
committed
Web scraper done
1 parent 0b7f692 commit ec82582

File tree

1 file changed

+33
-0
lines changed

1 file changed

+33
-0
lines changed

Web/page_scraper.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# -*- coding: cp1252 -*-
2+
"""
3+
Page Scraper - Create an application which connects to a
4+
site and pulls out all links, or images, and saves them to
5+
a list. Optional: Organize the indexed content and don’t
6+
allow duplicates. Have it put the results into an easily
7+
searchable index file.
8+
"""
9+
10+
import urllib2
11+
from bs4 import BeautifulSoup
12+
13+
14+
def print_list(stuff):
    """Print every entry of ``stuff`` on its own line, then a separator.

    ``stuff`` is an iterable of strings. Entries that are ``None`` are
    skipped: the script builds its link list with ``link.get('href')``,
    which gives ``None`` for an anchor without an ``href`` attribute, and
    joining a ``None`` would raise ``TypeError``.

    The single-argument ``print(...)`` form is used on purpose: it emits
    the same text under the Python 2 print statement and the Python 3
    print function.
    """
    print('\n'.join(item for item in stuff if item is not None))
    print('\n====================\n')
17+
18+
if __name__ == '__main__':
    # Interactive entry point: ask for a page URL and what to extract,
    # fetch the page, then print the collected links and/or image sources.

    url = raw_input('Enter a URL: ')

    # Read the menu choice as text and convert it explicitly. Python 2's
    # input() evaluates whatever is typed as an expression, which must not
    # be done with user-supplied text.
    try:
        choice = int(raw_input('What to scrape?\n1. Links\n2. Images\n3. Both\n'))
    except ValueError:
        raise SystemExit('Please enter 1, 2 or 3.')

    # NOTE(review): no parser is named here, so the result may depend on
    # which parsers are installed -- confirm before pinning one.
    soup = BeautifulSoup(urllib2.urlopen(url).read())

    if choice in (1, 3):
        # href=True keeps only anchors that actually carry an href, so the
        # list never contains None (which print_list could not join).
        urls = [link['href'] for link in soup.findAll('a', href=True)]
        print('URLs:')
        print_list(urls)
    if choice in (2, 3):
        # src=True avoids a KeyError on <img> tags that have no src.
        images = [image['src'] for image in soup.findAll('img', src=True)]
        print('Images:')
        print_list(images)

0 commit comments

Comments
 (0)