Skip to content

Commit d42efd6

Browse files
authored
Update webscraping_challenge.md
1 parent ceadac2 commit d42efd6

File tree

1 file changed

+67
-0
lines changed

1 file changed

+67
-0
lines changed

webscraping_challenge.md

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,70 @@ Choose between one of the two websites below:
1111
- [The Epic List of 250 Legendary Swords](https://hobbylark.com/fandoms/The-Epic-List-of-250-Legendary-Swords)
1212

1313
On both sites, you're presented with a long list of data (names of flowers or swords) as well as a description of each item. Using Beautiful Soup, return ALL the flowers/swords AND THEIR DESCRIPTIONS from the page and place them in a dictionary. The format of the dictionary is up to you — what do you think will be the best design?
14+
15+
<!--
16+
#!/usr/bin/python3
17+
"""
18+
Learning to scrape webdata with BeautifulSoup
19+
"""
20+
21+
from requests import get
22+
from requests.exceptions import RequestException
23+
from contextlib import closing
24+
from bs4 import BeautifulSoup
25+
from bs4 import element
26+
import json
27+
28+
def simple_get(url, timeout=30):
    """
    Attempt to get the content at `url` by making an HTTP GET request.

    Parameters:
        url: address to request.
        timeout: seconds to wait on the server before giving up; forwarded
            to the `get` call. A timeout surfaces as a RequestException and
            is handled like any other request failure.

    Returns:
        The response body (`resp.content`) when `is_good_response` accepts
        the response, otherwise None. None is also returned when the request
        itself fails; the failure is logged instead of raised.
    """
    # Local import keeps this function self-contained; no logging helper
    # (such as `log_error`) is defined anywhere in this script.
    import logging

    try:
        # stream=True means Requests cannot release the connection until the
        # response is closed; closing() closes "resp" at the end of the block.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                # .content reads the full body of the response
                return resp.content
            return None
    except RequestException as e:
        logging.getLogger(__name__).error(
            'Error during requests to %s : %s', url, e)
        return None
47+
48+
49+
def is_good_response(resp):
    """
    Return True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header contains the substring 'html' (case-insensitive).

    Parameters:
        resp: object exposing `status_code` and a mapping-like `headers`.

    Returns:
        bool. A response without a Content-Type header is reported as not
        HTML instead of raising KeyError.
    """
    # .get() with an empty default covers responses that omit the header.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type
57+
58+
def readFlowers(url):
    """
    Return a dictionary of flowers with key: name and value: description.

    The page at `url` is fetched with `simple_get` and parsed with
    BeautifulSoup's 'html.parser'. Every <h3> element that carries an `id`
    attribute is used as a name, and the first <p> sibling that follows it
    as the description.

    Parameters:
        url: address of the page to scrape.

    Returns:
        dict mapping each heading's text to its description text. Empty when
        the page could not be fetched. Headings with no following <p>
        sibling are skipped.
    """
    raw_html = simple_get(url)
    if raw_html is None:
        # simple_get returns None on a failed or non-HTML response; there is
        # nothing to parse in that case.
        return {}

    html = BeautifulSoup(raw_html, 'html.parser')
    flowers = {}
    for h3 in html.select("h3"):
        # Only headings with an id attribute are treated as flower entries.
        if h3.is_empty_element or h3.get('id') is None:
            continue
        desc = h3.find_next_sibling('p')
        if desc is None:
            # No paragraph follows this heading, so there is no description.
            continue
        flowers[h3.text] = desc.text
    return flowers
72+
73+
def main(url='https://florgeous.com/types-of-flowers/',
         out_path="/home/student/static/flowersuccess.json"):
    """
    Scrape the flower list and save it as a JSON file.

    Parameters:
        url: page passed to `readFlowers`; defaults to the florgeous.com
            flower list.
        out_path: file the resulting dictionary is written to as JSON. The
            file is overwritten if it already exists.
    """
    flowers = readFlowers(url)
    # json.dump escapes non-ASCII characters by default; the explicit
    # encoding just keeps the write independent of the platform locale.
    with open(out_path, "w", encoding="utf-8") as flowerfile:
        json.dump(flowers, flowerfile)


if __name__ == "__main__":
    main()
80+
-->

0 commit comments

Comments
 (0)