This repository was archived by the owner on May 25, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 5.2k
/
Copy pathscraping_medium.py
71 lines (62 loc) · 1.96 KB
/
scraping_medium.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import sys
import requests
import re
from bs4 import BeautifulSoup
# Switch the working directory to the folder containing this script so the
# relative output path ./scraped_articles resolves next to the file.
# NOTE: the original mixed a '/' split with a '\\' join, which breaks on
# POSIX systems (and on Windows paths that use backslashes);
# os.path.dirname/abspath handle the separator portably.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
# function to get the html of the page
def get_page():
    """Prompt for a Medium article URL, fetch it, and return the parsed soup.

    Side effects: sets the module-level ``url`` global (read later by
    collect_text) and exits the process with status 1 on a non-Medium URL.
    Raises requests.HTTPError for a non-2xx response.
    """
    global url
    url = input('Enter url of a medium article: ')
    # handling possible error; '\.' escapes the dot — the original pattern
    # treated '.' as a wildcard, so e.g. "https://mediumXcom/" was accepted
    if not re.match(r'https?://medium\.com/', url):
        print('Please enter a valid website, or make sure it is a medium article')
        sys.exit(1)
    # timeout keeps the script from hanging forever on an unresponsive host
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    return soup
# function to remove all the html tags and replace some with specific strings
def purify(text):
    """Strip HTML tags from *text*, turning <br>, <br/> and <li> into newlines.

    Line-breaking tags are substituted first so their structure survives;
    every remaining tag is then deleted outright.
    """
    rep = {"<br>": "\n", "<br/>": "\n", "<li>": "\n"}
    rep = dict((re.escape(k), v) for k, v in rep.items())
    pattern = re.compile("|".join(rep.keys()))
    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
    # Raw string with no escapes: the original '\<(.*?)\>' used invalid
    # escape sequences (SyntaxWarning on modern Python); '<' and '>' are
    # not regex metacharacters and need no escaping.
    text = re.sub(r'<(.*?)>', '', text)
    return text
# function to compile all of the scraped text in one string
def collect_text(soup):
    """Assemble the article's text from *soup* into one string.

    Reads the module-level ``url`` global (set by get_page) and sets the
    ``title`` global (read later by save_file). Sections are delimited by
    <h1> tags; everything between them is passed through purify().
    """
    fin = f'url: {url}\n\n'
    # Medium page titles look like "Article Title | Author | Medium"
    main = (soup.head.title.text).split('|')
    global title
    title = main[0].strip()
    fin += f'Title: {title.upper()}'
    # Guard: not every page title contains a '|' separator — the original
    # raised IndexError here on such pages.
    if len(main) > 1:
        fin += f'\n{main[1].strip()}'
    header = soup.find_all('h1')
    j = 1
    # Everything before the second <h1> is treated as the introduction.
    # Narrow except (was a bare except): IndexError when there are fewer
    # than two <h1> tags, AttributeError on unexpected markup. The intro is
    # buffered so a failure no longer leaves a dangling INTRODUCTION header.
    try:
        intro = ''
        for elem in list(header[j].previous_siblings)[::-1]:
            intro += f'\n{purify(str(elem))}'
        fin += '\n\nINTRODUCTION\n' + intro
    except (IndexError, AttributeError):
        pass
    # Walk the remaining siblings; each subsequent <h1> starts a new section.
    # Guard avoids the uncaught IndexError the original hit on pages with
    # fewer than two <h1> tags.
    if len(header) > j:
        fin += f'\n\n{header[j].text.upper()}'
        for elem in header[j].next_siblings:
            if elem.name == 'h1':
                j += 1
                fin += f'\n\n{header[j].text.upper()}'
                continue
            fin += f'\n{purify(str(elem))}'
    return fin
# function to save file in the current directory
def save_file(fin):
    """Write *fin* to ./scraped_articles/<title>.txt.

    Reads the module-level ``title`` global (set by collect_text); spaces
    in the title become underscores in the file name.
    """
    # exist_ok=True avoids the check-then-create race of exists() + mkdir()
    os.makedirs('./scraped_articles', exist_ok=True)
    fname = './scraped_articles/' + '_'.join(title.split()) + '.txt'
    # utf8: article text routinely contains non-ASCII characters
    with open(fname, 'w', encoding='utf8') as outfile:
        outfile.write(fin)
    print(f'File saved in directory {fname}')
# driver code
if __name__ == '__main__':
    # Fetch the page, extract the article text, and persist it — one pipeline.
    save_file(collect_text(get_page()))