Skip to content

Commit 7bae547

Browse files
committed
updating session 7 slides
1 parent c2f5cfa commit 7bae547

File tree

7 files changed

+1585
-754
lines changed

7 files changed

+1585
-754
lines changed

resources/session07/mashup_1.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
"""Scrape King County restaurant inspection results — step 1: fetch and parse."""
from bs4 import BeautifulSoup
import requests


INSPECTION_DOMAIN = 'http://info.kingcounty.gov'
INSPECTION_PATH = '/health/ehs/foodsafety/inspections/Results.aspx'
# Default query parameters for the inspection search form; any of these
# may be overridden per-call via get_inspection_page(**kwargs).
INSPECTION_PARAMS = {
    'Output': 'W',
    'Business_Name': '',
    'Business_Address': '',
    'Longitude': '',
    'Latitude': '',
    'City': '',
    'Zip_Code': '',
    'Inspection_Type': 'All',
    'Inspection_Start': '',
    'Inspection_End': '',
    'Inspection_Closed_Business': 'A',
    'Violation_Points': '',
    'Violation_Red_Points': '',
    'Violation_Descr': '',
    'Fuzzy_Search': 'N',
    'Sort': 'H'
}


def get_inspection_page(**kwargs):
    """Fetch the inspection results page from the King County site.

    Keyword arguments whose names match keys in INSPECTION_PARAMS
    override the defaults; unknown keys are silently ignored.

    Returns a (body bytes, server-reported encoding) tuple.
    Raises requests.HTTPError on a non-success status code.
    """
    url = INSPECTION_DOMAIN + INSPECTION_PATH
    params = INSPECTION_PARAMS.copy()
    for key, val in kwargs.items():
        if key in INSPECTION_PARAMS:
            params[key] = val
    resp = requests.get(url, params=params)
    resp.raise_for_status()  # fail loudly on HTTP errors
    return resp.content, resp.encoding


def parse_source(html, encoding='utf-8'):
    """Parse raw HTML bytes into a BeautifulSoup tree using *encoding*.

    The parser is named explicitly so bs4 does not warn and results do
    not vary with whichever third-party parser happens to be installed.
    """
    parsed = BeautifulSoup(html, 'html.parser', from_encoding=encoding)
    return parsed


if __name__ == '__main__':
    use_params = {
        'Inspection_Start': '2/1/2013',
        'Inspection_End': '2/1/2015',
        'Zip_Code': '98101'
    }
    html, encoding = get_inspection_page(**use_params)
    parsed = parse_source(html, encoding)
    # Parenthesized print works under both Python 2 and Python 3; the
    # original bare print statement is a SyntaxError on Python 3.
    print(parsed.prettify(encoding=encoding))

resources/session07/mashup_2.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
"""Scrape King County restaurant inspection results — step 2: locate listings."""
from bs4 import BeautifulSoup
import re
import requests


INSPECTION_DOMAIN = 'http://info.kingcounty.gov'
INSPECTION_PATH = '/health/ehs/foodsafety/inspections/Results.aspx'
# Default query parameters for the inspection search form; any of these
# may be overridden per-call via get_inspection_page(**kwargs).
INSPECTION_PARAMS = {
    'Output': 'W',
    'Business_Name': '',
    'Business_Address': '',
    'Longitude': '',
    'Latitude': '',
    'City': '',
    'Zip_Code': '',
    'Inspection_Type': 'All',
    'Inspection_Start': '',
    'Inspection_End': '',
    'Inspection_Closed_Business': 'A',
    'Violation_Points': '',
    'Violation_Red_Points': '',
    'Violation_Descr': '',
    'Fuzzy_Search': 'N',
    'Sort': 'H'
}


def get_inspection_page(**kwargs):
    """Fetch the inspection results page from the King County site.

    Keyword arguments whose names match keys in INSPECTION_PARAMS
    override the defaults; unknown keys are silently ignored.

    Returns a (body bytes, server-reported encoding) tuple.
    Raises requests.HTTPError on a non-success status code.
    """
    url = INSPECTION_DOMAIN + INSPECTION_PATH
    params = INSPECTION_PARAMS.copy()
    for key, val in kwargs.items():
        if key in INSPECTION_PARAMS:
            params[key] = val
    resp = requests.get(url, params=params)
    resp.raise_for_status()  # fail loudly on HTTP errors
    return resp.content, resp.encoding


def parse_source(html, encoding='utf-8'):
    """Parse raw HTML bytes into a BeautifulSoup tree using *encoding*.

    The parser is named explicitly so bs4 does not warn and results do
    not vary with whichever third-party parser happens to be installed.
    """
    parsed = BeautifulSoup(html, 'html.parser', from_encoding=encoding)
    return parsed


def load_inspection_page(name):
    """Read a previously saved results page from disk.

    Returns (raw bytes, encoding) with the same shape as
    get_inspection_page so the two calls are interchangeable.
    """
    # Binary mode: decoding is left to BeautifulSoup via the returned
    # encoding.  Text mode would decode with the platform default and
    # hand back a str where callers expect undecoded bytes.
    with open(name, 'rb') as fh:
        content = fh.read()
    return content, 'utf-8'


def restaurant_data_generator(html):
    """Return the <div> elements wrapping individual restaurant listings.

    Listing divs carry ids of the form 'PR<digits>~'; match on that.
    """
    id_finder = re.compile(r'PR[\d]+~')
    return html.find_all('div', id=id_finder)


if __name__ == '__main__':
    use_params = {
        'Inspection_Start': '2/1/2013',
        'Inspection_End': '2/1/2015',
        'Zip_Code': '98101'
    }
    # html, encoding = get_inspection_page(**use_params)
    html, encoding = load_inspection_page('inspection_page.html')
    parsed = parse_source(html, encoding)
    content_col = parsed.find("td", id="contentcol")
    data_list = restaurant_data_generator(content_col)
    # Parenthesized print works under both Python 2 and Python 3.
    print(data_list[0].prettify())

resources/session07/mashup_3.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
"""Scrape King County restaurant inspection results — step 3: extract metadata."""
from bs4 import BeautifulSoup
import re
import requests


INSPECTION_DOMAIN = 'http://info.kingcounty.gov'
INSPECTION_PATH = '/health/ehs/foodsafety/inspections/Results.aspx'
# Default query parameters for the inspection search form; any of these
# may be overridden per-call via get_inspection_page(**kwargs).
INSPECTION_PARAMS = {
    'Output': 'W',
    'Business_Name': '',
    'Business_Address': '',
    'Longitude': '',
    'Latitude': '',
    'City': '',
    'Zip_Code': '',
    'Inspection_Type': 'All',
    'Inspection_Start': '',
    'Inspection_End': '',
    'Inspection_Closed_Business': 'A',
    'Violation_Points': '',
    'Violation_Red_Points': '',
    'Violation_Descr': '',
    'Fuzzy_Search': 'N',
    'Sort': 'H'
}


def get_inspection_page(**kwargs):
    """Fetch the inspection results page from the King County site.

    Keyword arguments whose names match keys in INSPECTION_PARAMS
    override the defaults; unknown keys are silently ignored.

    Returns a (body bytes, server-reported encoding) tuple.
    Raises requests.HTTPError on a non-success status code.
    """
    url = INSPECTION_DOMAIN + INSPECTION_PATH
    params = INSPECTION_PARAMS.copy()
    for key, val in kwargs.items():
        if key in INSPECTION_PARAMS:
            params[key] = val
    resp = requests.get(url, params=params)
    resp.raise_for_status()  # fail loudly on HTTP errors
    return resp.content, resp.encoding


def parse_source(html, encoding='utf-8'):
    """Parse raw HTML bytes into a BeautifulSoup tree using *encoding*.

    The parser is named explicitly so bs4 does not warn and results do
    not vary with whichever third-party parser happens to be installed.
    """
    parsed = BeautifulSoup(html, 'html.parser', from_encoding=encoding)
    return parsed


def load_inspection_page(name):
    """Read a previously saved results page from disk.

    Returns (raw bytes, encoding) with the same shape as
    get_inspection_page so the two calls are interchangeable.
    """
    # Binary mode: decoding is left to BeautifulSoup via the returned
    # encoding, instead of the platform-default text-mode decode.
    with open(name, 'rb') as fh:
        content = fh.read()
    return content, 'utf-8'


def restaurant_data_generator(html):
    """Return the <div> elements wrapping individual restaurant listings.

    Listing divs carry ids of the form 'PR<digits>~'; match on that.
    """
    id_finder = re.compile(r'PR[\d]+~')
    return html.find_all('div', id=id_finder)


def has_two_tds(elem):
    """True for <tr> elements with exactly two direct <td> children."""
    is_tr = elem.name == 'tr'
    td_children = elem.find_all('td', recursive=False)
    has_two = len(td_children) == 2
    return is_tr and has_two


def clean_data(td):
    """Return the cell's text stripped of whitespace and ':'/'-' padding."""
    # bs4's .text already yields a text (unicode) string on both Python 2
    # and 3; the original wrapped it in the Py2-only unicode() builtin,
    # which is a NameError on Python 3.
    return td.text.strip(" \n:-")


def extract_restaurant_metadata(elem):
    """Collect label -> [values] metadata from a restaurant listing div.

    Rows are two-cell <tr>s; a row with an empty label cell continues the
    previous label, so values accumulate in a list per label.
    """
    restaurant_data_rows = elem.find('tbody').find_all(
        has_two_tds, recursive=False
    )
    rdata = {}
    current_label = ''
    for data_row in restaurant_data_rows:
        key_cell, val_cell = data_row.find_all('td', recursive=False)
        new_label = clean_data(key_cell)
        # Empty label means "same label as the previous row".
        current_label = new_label if new_label else current_label
        rdata.setdefault(current_label, []).append(clean_data(val_cell))
    return rdata


if __name__ == '__main__':
    use_params = {
        'Inspection_Start': '2/1/2013',
        'Inspection_End': '2/1/2015',
        'Zip_Code': '98101'
    }
    # html, encoding = get_inspection_page(**use_params)
    html, encoding = load_inspection_page('inspection_page.html')
    parsed = parse_source(html, encoding)
    content_col = parsed.find("td", id="contentcol")
    data_list = restaurant_data_generator(content_col)
    for data_div in data_list:
        metadata = extract_restaurant_metadata(data_div)
        # Parenthesized print works under both Python 2 and Python 3.
        print(metadata)

resources/session07/mashup_4.py

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
"""Scrape King County restaurant inspection results — step 4: score summaries."""
from bs4 import BeautifulSoup
import re
import requests


INSPECTION_DOMAIN = 'http://info.kingcounty.gov'
INSPECTION_PATH = '/health/ehs/foodsafety/inspections/Results.aspx'
# Default query parameters for the inspection search form; any of these
# may be overridden per-call via get_inspection_page(**kwargs).
INSPECTION_PARAMS = {
    'Output': 'W',
    'Business_Name': '',
    'Business_Address': '',
    'Longitude': '',
    'Latitude': '',
    'City': '',
    'Zip_Code': '',
    'Inspection_Type': 'All',
    'Inspection_Start': '',
    'Inspection_End': '',
    'Inspection_Closed_Business': 'A',
    'Violation_Points': '',
    'Violation_Red_Points': '',
    'Violation_Descr': '',
    'Fuzzy_Search': 'N',
    'Sort': 'H'
}


def get_inspection_page(**kwargs):
    """Fetch the inspection results page from the King County site.

    Keyword arguments whose names match keys in INSPECTION_PARAMS
    override the defaults; unknown keys are silently ignored.

    Returns a (body bytes, server-reported encoding) tuple.
    Raises requests.HTTPError on a non-success status code.
    """
    url = INSPECTION_DOMAIN + INSPECTION_PATH
    params = INSPECTION_PARAMS.copy()
    for key, val in kwargs.items():
        if key in INSPECTION_PARAMS:
            params[key] = val
    resp = requests.get(url, params=params)
    resp.raise_for_status()  # fail loudly on HTTP errors
    return resp.content, resp.encoding


def parse_source(html, encoding='utf-8'):
    """Parse raw HTML bytes into a BeautifulSoup tree using *encoding*.

    The parser is named explicitly so bs4 does not warn and results do
    not vary with whichever third-party parser happens to be installed.
    """
    parsed = BeautifulSoup(html, 'html.parser', from_encoding=encoding)
    return parsed


def load_inspection_page(name):
    """Read a previously saved results page from disk.

    Returns (raw bytes, encoding) with the same shape as
    get_inspection_page so the two calls are interchangeable.
    """
    # Binary mode: decoding is left to BeautifulSoup via the returned
    # encoding, instead of the platform-default text-mode decode.
    with open(name, 'rb') as fh:
        content = fh.read()
    return content, 'utf-8'


def restaurant_data_generator(html):
    """Return the <div> elements wrapping individual restaurant listings.

    Listing divs carry ids of the form 'PR<digits>~'; match on that.
    """
    id_finder = re.compile(r'PR[\d]+~')
    return html.find_all('div', id=id_finder)


def has_two_tds(elem):
    """True for <tr> elements with exactly two direct <td> children."""
    is_tr = elem.name == 'tr'
    td_children = elem.find_all('td', recursive=False)
    has_two = len(td_children) == 2
    return is_tr and has_two


def clean_data(td):
    """Return the cell's text stripped of whitespace and ':'/'-' padding."""
    # bs4's .text already yields a text (unicode) string on both Python 2
    # and 3; the original wrapped it in the Py2-only unicode() builtin,
    # which is a NameError on Python 3.
    return td.text.strip(" \n:-")


def extract_restaurant_metadata(elem):
    """Collect label -> [values] metadata from a restaurant listing div.

    Rows are two-cell <tr>s; a row with an empty label cell continues the
    previous label, so values accumulate in a list per label.
    """
    restaurant_data_rows = elem.find('tbody').find_all(
        has_two_tds, recursive=False
    )
    rdata = {}
    current_label = ''
    for data_row in restaurant_data_rows:
        key_cell, val_cell = data_row.find_all('td', recursive=False)
        new_label = clean_data(key_cell)
        # Empty label means "same label as the previous row".
        current_label = new_label if new_label else current_label
        rdata.setdefault(current_label, []).append(clean_data(val_cell))
    return rdata


def is_inspection_data_row(elem):
    """True for <tr> rows that hold a single inspection record.

    A record row has exactly four direct <td> children and its first cell
    mentions 'inspection' without *starting* with it (rows starting with
    the word are section headers, not data).
    """
    if elem.name != 'tr':
        return False
    td_children = elem.find_all('td', recursive=False)
    # Guard before indexing: the original read td_children[0] before
    # checking the length, raising IndexError on <tr>s with no <td>s.
    if len(td_children) != 4:
        return False
    this_text = clean_data(td_children[0]).lower()
    contains_word = 'inspection' in this_text
    does_not_start = not this_text.startswith('inspection')
    return contains_word and does_not_start


def get_score_data(elem):
    """Summarize inspection scores found beneath a restaurant listing.

    Returns a dict with 'Average Score', 'High Score' and
    'Total Inspections'.  Rows whose score cell does not parse as an
    integer are excluded from all three figures.
    """
    inspection_rows = elem.find_all(is_inspection_data_row)
    samples = len(inspection_rows)
    total = 0
    high_score = 0
    average = 0
    for row in inspection_rows:
        strval = clean_data(row.find_all('td')[2])
        try:
            intval = int(strval)
        except (ValueError, TypeError):
            samples -= 1  # non-numeric score: drop row from the sample
        else:
            total += intval
            high_score = intval if intval > high_score else high_score

    if samples:
        # float() keeps true division under Python 2 as well.
        average = total / float(samples)
    data = {
        u'Average Score': average,
        u'High Score': high_score,
        u'Total Inspections': samples
    }
    return data


if __name__ == '__main__':
    use_params = {
        'Inspection_Start': '2/1/2013',
        'Inspection_End': '2/1/2015',
        'Zip_Code': '98101'
    }
    # html, encoding = get_inspection_page(**use_params)
    html, encoding = load_inspection_page('inspection_page.html')
    parsed = parse_source(html, encoding)
    content_col = parsed.find("td", id="contentcol")
    data_list = restaurant_data_generator(content_col)
    for data_div in data_list:
        metadata = extract_restaurant_metadata(data_div)
        inspection_data = get_score_data(data_div)
        metadata.update(inspection_data)
        # Parenthesized print works under both Python 2 and Python 3.
        print(metadata)

0 commit comments

Comments
 (0)