Commit 780cad2

Merge pull request realpython#5 from RajuKoushik/patch-1
Update 08_basic_email_web_crawler.py
2 parents 7591683 + 761e0ec

File tree

1 file changed: +14 -33 lines changed

08_basic_email_web_crawler.py (+14 -33)
@@ -1,45 +1,26 @@
 import requests
 import re
-try:
-    from urllib.parse import urljoin
-except ImportError:
-    from urlparse import urljoin
 
-# regex
-email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
-link_re = re.compile(r'href="http://github.com/(.*?)"')
+#get url
+#url=input('Enter a URL (include 'http://'):')--this is wrong
+url = input('Enter a URL (include `http://`): ')
 
 
-def crawl(url):
+#connect to the url
+website=requests.get(url)
 
-    result = set()
+#read html
+html=website.text
 
-    req = requests.get(url)
 
-    # Check if successful
-    if(req.status_code != 200):
-        return []
+#use re.findall to grab all the links
+links = re.findall('"((http|ftp)s?://.*?)"', html)
 
-    # Find links
-    links = link_re.findall(req.text)
+emails=re.findall('([\w\.,]+@[\w\.,]+\.\w+)',html)
 
-    print("\nFound {} links".format(len(links)))
 
-    # Search links for emails
-    for link in links:
+#prints the number of links in the list
+print("\nFound {} links".format(len(links)))
 
-        # Get an absolute URL for a link
-        link = urljoin(url, link)
-
-        # Find all emails on current page
-        result.update(email_re.findall(req.text))
-
-    return result
-
-if __name__ == '__main__':
-    emails = crawl('http://www.realpython.com')
-
-    print("\nScrapped e-mail addresses:")
-    for email in emails:
-        print(email)
-    print("\n")
+for email in emails:
+    print(email)
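
The merged version drops the crawl() function, the Python 2/3 urljoin fallback, and the link-following logic in favor of a linear script that scans a single user-supplied page. For reference, a minimal runnable sketch of the merged result is below; the raise_for_status() call and the raw-string regex literals are editorial hardening, not part of this commit:

import re

import requests

url = input('Enter a URL (include `http://`): ')

# Connect to the URL; raise_for_status() aborts on 4xx/5xx responses
# (an editorial addition; the committed script has no error check).
website = requests.get(url)
website.raise_for_status()

# Read the raw HTML of the page.
html = website.text

# Grab every quoted absolute http/https/ftp/ftps link.
links = re.findall(r'"((http|ftp)s?://.*?)"', html)

# Grab anything shaped like an email address.
emails = re.findall(r'([\w\.,]+@[\w\.,]+\.\w+)', html)

# Print the number of links found on the page.
print("\nFound {} links".format(len(links)))

for email in emails:
    print(email)

Note that because the link pattern contains two capturing groups, re.findall returns (url, scheme) tuples here, e.g. ('http://example.com/', 'http'); len(links) still counts matches, but each element must be indexed with link[0] to get the URL itself.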
