from requests_html import HTMLSession
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

# init the colorama module
colorama.init()

GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET

# initialize the sets of links (unique links)
internal_urls = set()
external_urls = set()

total_urls_visited = 0


def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
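# Illustrative examples (standard urlparse behavior):
#   is_valid("https://example.com/page") -> True   (has a scheme and a netloc)
#   is_valid("/relative/path")           -> False  (no scheme or netloc)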


def get_all_website_links(url):
    """
    Returns all URLs found on `url` that belong to the same website.
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    # initialize an HTTP session
    session = HTMLSession()
    # make HTTP request & retrieve response
    response = session.get(url)
    # execute JavaScript; rendering can fail (e.g. if headless Chromium is
    # unavailable), in which case we fall back to the raw HTML
    try:
        response.html.render()
    except Exception:
        pass
    soup = BeautifulSoup(response.html.html, "html.parser")
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # href empty tag
            continue
        # join the URL if it's relative (not an absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls
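# Note: get_all_website_links() also updates the global internal_urls and
# external_urls sets as a side effect of collecting the page's links.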


def crawl(url, max_urls=30):
    """
    Crawls a web page and extracts all links.
    You'll find all links in the `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): maximum number of URLs to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)
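# Note: total_urls_visited is a module-level counter, so max_urls caps the total
# number of pages fetched across the entire recursive crawl, not per page.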


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    parser.add_argument("url", help="The URL to extract links from.")
    parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)

    args = parser.parse_args()
    url = args.url
    max_urls = args.max_urls

    crawl(url, max_urls=max_urls)

    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))

    domain_name = urlparse(url).netloc

    # save the internal links to a file
    with open(f"{domain_name}_internal_links.txt", "w") as f:
        for internal_link in internal_urls:
            print(internal_link.strip(), file=f)

    # save the external links to a file
    with open(f"{domain_name}_external_links.txt", "w") as f:
        for external_link in external_urls:
            print(external_link.strip(), file=f)