from requests_html import HTMLSession
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

# init the colorama module
colorama.init()

GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET

# initialize the sets of links (unique links)
internal_urls = set()
external_urls = set()

total_urls_visited = 0


def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
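# Illustrative examples (standard urlparse behavior):
#   is_valid("https://example.com/page") -> True   (has a scheme and a netloc)
#   is_valid("/relative/path")           -> False  (no scheme or netloc)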


def get_all_website_links(url):
    """
    Returns all URLs found on `url` that belong to the same website.
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    # initialize an HTTP session
    session = HTMLSession()
    # make HTTP request & retrieve response
    response = session.get(url)
    # execute JavaScript; rendering can fail (e.g. if headless Chromium is
    # unavailable), in which case we fall back to the raw HTML
    try:
        response.html.render()
    except Exception:
        pass
    soup = BeautifulSoup(response.html.html, "html.parser")
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # href empty tag
            continue
        # join the URL if it's relative (not an absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls
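# Note: get_all_website_links() also updates the global internal_urls and
# external_urls sets as a side effect of collecting the page's links.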


def crawl(url, max_urls=30):
    """
    Crawls a web page and extracts all links.
    You'll find all links in the `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): maximum number of URLs to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)
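# Note: total_urls_visited is a module-level counter, so max_urls caps the total
# number of pages fetched across the entire recursive crawl, not per page.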


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    parser.add_argument("url", help="The URL to extract links from.")
    parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)

    args = parser.parse_args()
    url = args.url
    max_urls = args.max_urls

    crawl(url, max_urls=max_urls)

    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))

    domain_name = urlparse(url).netloc

    # save the internal links to a file
    with open(f"{domain_name}_internal_links.txt", "w") as f:
        for internal_link in internal_urls:
            print(internal_link.strip(), file=f)

    # save the external links to a file
    with open(f"{domain_name}_external_links.txt", "w") as f:
        for external_link in external_urls:
            print(external_link.strip(), file=f)