From 1216cc17de8bebfb1f5acbc37675d3ee9c675e68 Mon Sep 17 00:00:00 2001 From: David de Hilster Date: Tue, 24 Jan 2023 15:00:29 -0500 Subject: [PATCH] NLP-TUTORIALS-012 fixed python certificate problem Signed-off-by: David de Hilster --- tutorial-13/tutorial-13-a/README.md | 6 +++++- tutorial-13/tutorial-13-a/input/urlfetch.py | 5 ++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tutorial-13/tutorial-13-a/README.md b/tutorial-13/tutorial-13-a/README.md index 0fabafa..3cfe76e 100644 --- a/tutorial-13/tutorial-13-a/README.md +++ b/tutorial-13/tutorial-13-a/README.md @@ -1,3 +1,7 @@ # Tutorial 13-a -This analyzer parses the URLs from this link: https://state.1keydata.com/ into a URL list. It then has a python script to fetch the webpages and save them in a folder. This folder then can easily be moved into the second analyzer where the pages will be processed. \ No newline at end of file +This analyzer parses the URLs from this link: https://state.1keydata.com/ into a URL list. It then has a python script to fetch the webpages and save them in a folder. This folder then can easily be moved into the second analyzer where the pages will be processed. + +## NOTE + +You will have to install the BeautifulSoup (`beautifulsoup4`) and `certifi` packages before using the Python script, e.g. `pip install beautifulsoup4 certifi`. 
\ No newline at end of file diff --git a/tutorial-13/tutorial-13-a/input/urlfetch.py b/tutorial-13/tutorial-13-a/input/urlfetch.py index e74d752..cff9b03 100644 --- a/tutorial-13/tutorial-13-a/input/urlfetch.py +++ b/tutorial-13/tutorial-13-a/input/urlfetch.py @@ -5,12 +5,11 @@ from bs4 import BeautifulSoup from pathlib import Path import re +import certifi wordsfile = os.path.join(os.path.dirname(__file__), "urls.txt") file1 = codecs.open(wordsfile, "r", "utf-8") lines = file1.readlines() - -urlbase = "https://state.1keydata.com/" count = 0 for url in lines: @@ -31,7 +30,7 @@ found = False try: - page = urllib.request.urlopen(url) + page = urllib.request.urlopen(url, cafile=certifi.where()) except HTTPError as e: print(' Error code: ', e.code) file1 = open(os.path.join(os.path.dirname(__file__), "urlorphans.txt"), "a")