Skip to content

Commit 2488d0e

Browse files
committed
Finished Week 4 (Parsing HTML with BeautifulSoup)
1 parent 74f7af3 commit 2488d0e

File tree

3 files changed

+61
-0
lines changed

3 files changed

+61
-0
lines changed
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import urllib.request as ur
2+
from bs4 import *
3+
4+
# Fetch the page named by the user and add up the numbers held in its
# <span> tags.
url = input('Enter the url to scrape - ')

# The context manager closes the HTTP response once the body is read.
with ur.urlopen(url) as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

# Calling the soup object with a tag name extracts every matching tag.
spans = soup('span')

# The first child of each span is converted with int(); an empty or
# non-numeric span raises rather than being silently skipped.
values = [int(span.contents[0]) for span in spans]

count_of_spans = len(values)
# Named `total` so the built-in sum() is not shadowed.
total = sum(values)

print('Count ', count_of_spans)
print('Sum ', total)
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import urllib.request as ur
2+
from bs4 import *
3+
4+
# Collect the run parameters up front. The two numeric answers go through
# int(), so non-numeric input raises ValueError at the prompt.
url = input('Enter URL: ')
repeat_count = int(input('Enter count: '))
# 1-based: the script compares it against the link index plus one.
position = int(input('Enter position: '))

# Number of links followed so far.
current_repeat_count = 0
8+
9+
10+
def parse_html(url):
    """Download *url* and return the anchor tags found in it.

    Args:
        url: Address handed to ``ur.urlopen``.

    Returns:
        The result of calling the parsed soup with ``'a'``, i.e. the
        page's ``<a>`` tags.
    """
    # The context manager closes the connection once the body has been
    # read, so it is not left open for the rest of the run.
    with ur.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup('a')
15+
16+
# Hop from page to page: each round follows the link at the requested
# 1-based position and makes it the next page to fetch.
while current_repeat_count < repeat_count:
    print('Retrieving: ', url)
    tags = parse_html(url)
    # Index directly instead of scanning. The explicit range check keeps a
    # zero or negative position from wrapping around to the end of the
    # list, and stops the run instead of re-fetching the same page when
    # the page has fewer links than requested.
    if not 1 <= position <= len(tags):
        print('No link at position', position, 'on this page; stopping.')
        break
    next_url = tags[position - 1].get('href')
    if not next_url:
        # An anchor without an href gives nothing to follow.
        print('Link at position', position, 'has no href; stopping.')
        break
    url = next_url
    current_repeat_count += 1
print('Last Url: ', url)
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import urllib.request as ur
2+
from bs4 import *
3+
4+
url = input('Enter the url to scrape - ')
# Example input: http://www.dr-chuck.com

# Read the whole body; the context manager then closes the connection.
with ur.urlopen(url) as response:
    html = response.read()
# Wrap the whole HTML document in a single soup object.
soup = BeautifulSoup(html, 'html.parser')

# Calling the soup with 'a' extracts every <a> tag from the document.
tags = soup('a')

for tag in tags:
    # Attributes are read dictionary-style with get(); None is the fallback
    # for anchors lacking an href. It belongs inside get() -- passed to
    # print() instead, it would add a stray " None" to every output line.
    print(tag.get('href', None))

0 commit comments

Comments
 (0)