 import requests
 import re
-try:
-    from urllib.parse import urljoin
-except ImportError:
-    from urlparse import urljoin
 
-# regex
-email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
-link_re = re.compile(r'href="http://github.com/(.*?)"')
+# get url
+# url = input('Enter a URL (include 'http://'):')  # wrong: the nested quotes end the string early
+url = input('Enter a URL (include `http://`): ')
 
 
-def crawl(url):
+# connect to the url
+website = requests.get(url)
 
-    result = set()
+# read html
+html = website.text
 
-    req = requests.get(url)
 
-    # Check if successful
-    if(req.status_code != 200):
-        return []
+# use re.findall to grab all the links
+links = re.findall('"((http|ftp)s?://.*?)"', html)
 
-    # Find links
-    links = link_re.findall(req.text)
+emails = re.findall(r'([\w\.,]+@[\w\.,]+\.\w+)', html)
 
-    print("\nFound {} links".format(len(links)))
 
-    # Search links for emails
-    for link in links:
+# print the number of links in the list
+print("\nFound {} links".format(len(links)))
 
-        # Get an absolute URL for a link
-        link = urljoin(url, link)
-
-        # Find all emails on current page
-        result.update(email_re.findall(req.text))
-
-    return result
-
-if __name__ == '__main__':
-    emails = crawl('http://www.realpython.com')
-
-    print("\nScrapped e-mail addresses:")
-    for email in emails:
-        print(email)
-    print("\n")
+for email in emails:
+    print(email)
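
Side note on the new link pattern: because '"((http|ftp)s?://.*?)"' contains two capture groups, re.findall returns a list of tuples rather than plain strings, so each entry in links is (full_url, scheme_prefix) and indexing is needed to get the URL itself. A minimal sketch, using a made-up html string rather than anything from the commit:

import re

html = '<a href="https://example.com/a">a</a> <img src="/service/ftp://example.org/f.png">'
links = re.findall('"((http|ftp)s?://.*?)"', html)
print(links)
# [('https://example.com/a', 'http'), ('ftp://example.org/f.png', 'ftp')]

# index [0] of each tuple holds the full URL
for link, _scheme in links:
    print(link)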
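
The commit also drops the old status-code guard (if(req.status_code != 200): return []). If failing fast on bad responses is still wanted, one option is requests' built-in raise_for_status(); a minimal sketch, with a placeholder URL standing in for the user input:

import requests

url = 'http://example.com'  # placeholder URL for the sketch
website = requests.get(url)
website.raise_for_status()  # raises requests.HTTPError for 4xx/5xx responses
html = website.text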