Skip to content

Commit d90d1ca

Browse files
authored
Update 08_basic_email_web_crawler.py
1 parent c9d1f05 commit d90d1ca

File tree

1 file changed

+13
-14
lines changed

1 file changed

+13
-14
lines changed

scripts/08_basic_email_web_crawler.py

+13-14
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,20 @@
1+
from bs4 import BeautifulSoup
12
import requests
2-
import re
3+
34

45
# get url
56
url = input('Enter a URL (include `http://`): ')
7+
response = requests.get(url)
8+
html = response.text
9+
soup = BeautifulSoup(html, "html.parser")
10+
print(html)
611

7-
# connect to the url
8-
website = requests.get(url)
9-
10-
# read html
11-
html = website.text
12-
13-
# use re.findall to grab all the links
14-
links = re.findall('"((http|ftp)s?://.*?)"', html)
15-
emails = re.findall('([\w\.,]+@[\w\.,]+\.\w+)', html)
16-
12+
links = []
13+
for i in soup.find_all("a",href= True):
14+
links.append(i)
15+
print("leitud link: ", i)
1716

1817
# print the number of links in the list
19-
print("\nFound {} links".format(len(links)))
20-
for email in emails:
21-
print(email)
18+
# print("\nFound {} links".format(len(links)))
19+
# for email in emails:
20+
# print(email)

0 commit comments

Comments (0)