Skip to content

Commit d90d1ca

Browse files
authored
Update 08_basic_email_web_crawler.py
1 parent c9d1f05 commit d90d1ca

File tree

1 file changed

+13
-14
lines changed

1 file changed

+13
-14
lines changed

scripts/08_basic_email_web_crawler.py

+13-14
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,20 @@
1+
from bs4 import BeautifulSoup
12
import requests
2-
import re
3+
34

45
# get url
56
url = input('Enter a URL (include `http://`): ')
7+
response = requests.get(url)
8+
html = response.text
9+
soup = BeautifulSoup(html, "html.parser")
10+
print(html)
611

7-
# connect to the url
8-
website = requests.get(url)
9-
10-
# read html
11-
html = website.text
12-
13-
# use re.findall to grab all the links
14-
links = re.findall('"((http|ftp)s?://.*?)"', html)
15-
emails = re.findall('([\w\.,]+@[\w\.,]+\.\w+)', html)
16-
12+
links = []
13+
for i in soup.find_all("a",href= True):
14+
links.append(i)
15+
print("leitud link: ", i)
1716

1817
# print the number of links in the list
19-
print("\nFound {} links".format(len(links)))
20-
for email in emails:
21-
print(email)
18+
# print("\nFound {} links".format(len(links)))
19+
# for email in emails:
20+
# print(email)

0 commit comments

Comments (0)