Skip to content

Commit 03bb4d4

Browse files
committed
ch13
1 parent d4c7ab6 commit 03bb4d4

File tree

7 files changed

+2865
-0
lines changed

7 files changed

+2865
-0
lines changed

ch13/ch13-dataprep.ipynb

Lines changed: 372 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,372 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Chapter 13 - Data Science\n",
8+
"## Data Preparation"
9+
]
10+
},
11+
{
12+
"cell_type": "markdown",
13+
"metadata": {},
14+
"source": [
15+
"## 0 - Setting up the notebook"
16+
]
17+
},
18+
{
19+
"cell_type": "code",
20+
"execution_count": 1,
21+
"metadata": {},
22+
"outputs": [],
23+
"source": [
24+
"import json\n",
25+
"import random\n",
26+
"from datetime import date, timedelta\n",
27+
"\n",
28+
"import faker"
29+
]
30+
},
31+
{
32+
"cell_type": "markdown",
33+
"metadata": {},
34+
"source": [
35+
"## 1 - Preparing the Data"
36+
]
37+
},
38+
{
39+
"cell_type": "code",
40+
"execution_count": 2,
41+
"metadata": {},
42+
"outputs": [],
43+
"source": [
44+
"# create the faker to populate the data\n",
45+
"fake = faker.Faker()"
46+
]
47+
},
48+
{
49+
"cell_type": "code",
50+
"execution_count": 3,
51+
"metadata": {},
52+
"outputs": [],
53+
"source": [
54+
"usernames = set()\n",
55+
"usernames_no = 1000\n",
56+
"\n",
57+
"# populate the set with 1000 unique usernames\n",
58+
"while len(usernames) < usernames_no:\n",
59+
" usernames.add(fake.user_name())"
60+
]
61+
},
62+
{
63+
"cell_type": "code",
64+
"execution_count": 4,
65+
"metadata": {},
66+
"outputs": [
67+
{
68+
"data": {
69+
"text/plain": [
70+
"['{\"username\": \"susan42\", \"name\": \"Emily Smith\", \"gender\": \"F\", \"email\": \"[email protected]\", \"age\": 53, \"address\": \"66537 Riley Mission Apt. 337\\\\nNorth Jennifer, NH 95781\"}',\n",
71+
" '{\"username\": \"sarahcarpenter\", \"name\": \"Michael Kane\", \"gender\": \"M\", \"email\": \"[email protected]\", \"age\": 58, \"address\": \"7129 Patrick Walks Suite 215\\\\nLaurenside, LA 97179\"}',\n",
72+
" '{\"username\": \"kevin37\", \"name\": \"Nathaniel Miller\", \"gender\": \"M\", \"email\": \"[email protected]\", \"age\": 36, \"address\": \"8247 Manning Burgs Suite 806\\\\nLopezshire, MS 06606\"}']"
73+
]
74+
},
75+
"execution_count": 4,
76+
"metadata": {},
77+
"output_type": "execute_result"
78+
}
79+
],
80+
"source": [
81+
"def get_random_name_and_gender():\n",
82+
" skew = .6 # 60% of users will be female\n",
83+
" male = random.random() > skew\n",
84+
" if male:\n",
85+
" return fake.name_male(), 'M'\n",
86+
" else:\n",
87+
" return fake.name_female(), 'F'\n",
88+
"\n",
89+
"# For each username, create a complete user profile.\n",
90+
"# This simulates user data coming from an API: the result\n",
91+
"# is a list of JSON strings (one per user).\n",
92+
"def get_users(usernames):\n",
93+
" users = []\n",
94+
" for username in usernames:\n",
95+
" name, gender = get_random_name_and_gender()\n",
96+
" user = {\n",
97+
" 'username': username,\n",
98+
" 'name': name,\n",
99+
" 'gender': gender,\n",
100+
" 'email': fake.email(),\n",
101+
" 'age': fake.random_int(min=18, max=90),\n",
102+
" 'address': fake.address(),\n",
103+
" }\n",
104+
" users.append(json.dumps(user))\n",
105+
" return users\n",
106+
"\n",
107+
"users = get_users(usernames)\n",
108+
"users[:3]"
109+
]
110+
},
111+
{
112+
"cell_type": "code",
113+
"execution_count": 5,
114+
"metadata": {},
115+
"outputs": [],
116+
"source": [
117+
"# campaign name format:\n",
118+
"# InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency\n",
119+
"def get_type():\n",
120+
" # just some gibberish internal codes\n",
121+
" types = ['AKX', 'BYU', 'GRZ', 'KTR']\n",
122+
" return random.choice(types)\n",
123+
"\n",
124+
"def get_start_end_dates():\n",
125+
" duration = random.randint(1, 2 * 365)\n",
126+
" offset = random.randint(-365, 365)\n",
127+
" start = date.today() - timedelta(days=offset)\n",
128+
" end = start + timedelta(days=duration)\n",
129+
" \n",
130+
" def _format_date(date_):\n",
131+
" return date_.strftime(\"%Y%m%d\")\n",
132+
" \n",
133+
" return _format_date(start), _format_date(end)\n",
134+
"\n",
135+
"def get_age():\n",
136+
" age = random.randrange(20, 46, 5)\n",
137+
" diff = random.randrange(5, 26, 5)\n",
138+
" return '{}-{}'.format(age, age + diff)\n",
139+
"\n",
140+
"def get_gender():\n",
141+
" return random.choice(('M', 'F', 'B'))\n",
142+
"\n",
143+
"def get_currency():\n",
144+
" return random.choice(('GBP', 'EUR', 'USD'))\n",
145+
"\n",
146+
"def get_campaign_name():\n",
147+
" separator = '_'\n",
148+
" type_ = get_type()\n",
149+
" start, end = get_start_end_dates()\n",
150+
" age = get_age()\n",
151+
" gender = get_gender()\n",
152+
" currency = get_currency()\n",
153+
" return separator.join(\n",
154+
" (type_, start, end, age, gender, currency))"
155+
]
156+
},
157+
{
158+
"cell_type": "code",
159+
"execution_count": 6,
160+
"metadata": {},
161+
"outputs": [],
162+
"source": [
163+
"# campaign data:\n",
164+
"# name, budget, spent, clicks, impressions\n",
165+
"def get_campaign_data():\n",
166+
" name = get_campaign_name()\n",
167+
" budget = random.randint(10**3, 10**6)\n",
168+
" spent = random.randint(10**2, budget) \n",
169+
" clicks = int(random.triangular(10**2, 10**5, 0.2 * 10**5)) \n",
170+
" impressions = int(random.gauss(0.5 * 10**6, 2))\n",
171+
" return {\n",
172+
" 'cmp_name': name,\n",
173+
" 'cmp_bgt': budget,\n",
174+
" 'cmp_spent': spent,\n",
175+
" 'cmp_clicks': clicks,\n",
176+
" 'cmp_impr': impressions\n",
177+
" }"
178+
]
179+
},
180+
{
181+
"cell_type": "code",
182+
"execution_count": 7,
183+
"metadata": {},
184+
"outputs": [],
185+
"source": [
186+
"# assemble the logic to get the final version of the rough data\n",
187+
"# data will be a list of dictionaries. Each dictionary will follow\n",
188+
"# this structure:\n",
189+
"# {'user': user_json, 'campaigns': [c1, c2, ...]}\n",
190+
"# where user_json is the JSON string version of a user data dict\n",
191+
"# and c1, c2, ... are campaign dicts as returned by\n",
192+
"# get_campaign_data\n",
193+
"\n",
194+
"def get_data(users):\n",
195+
" data = []\n",
196+
" for user in users:\n",
197+
" campaigns = [get_campaign_data()\n",
198+
" for _ in range(random.randint(2, 8))]\n",
199+
" data.append({'user': user, 'campaigns': campaigns})\n",
200+
" return data"
201+
]
202+
},
203+
{
204+
"cell_type": "markdown",
205+
"metadata": {},
206+
"source": [
207+
"## 2 - Cleaning the data"
208+
]
209+
},
210+
{
211+
"cell_type": "code",
212+
"execution_count": 8,
213+
"metadata": {},
214+
"outputs": [
215+
{
216+
"data": {
217+
"text/plain": [
218+
"[{'user': '{\"username\": \"susan42\", \"name\": \"Emily Smith\", \"gender\": \"F\", \"email\": \"[email protected]\", \"age\": 53, \"address\": \"66537 Riley Mission Apt. 337\\\\nNorth Jennifer, NH 95781\"}',\n",
219+
" 'campaigns': [{'cmp_name': 'GRZ_20210131_20210411_30-40_F_GBP',\n",
220+
" 'cmp_bgt': 253951,\n",
221+
" 'cmp_spent': 17953,\n",
222+
" 'cmp_clicks': 52573,\n",
223+
" 'cmp_impr': 500001},\n",
224+
" {'cmp_name': 'BYU_20210109_20221204_30-35_M_GBP',\n",
225+
" 'cmp_bgt': 150314,\n",
226+
" 'cmp_spent': 125884,\n",
227+
" 'cmp_clicks': 24575,\n",
228+
" 'cmp_impr': 499999},\n",
229+
" {'cmp_name': 'GRZ_20211124_20220921_20-35_B_EUR',\n",
230+
" 'cmp_bgt': 791397,\n",
231+
" 'cmp_spent': 480963,\n",
232+
" 'cmp_clicks': 39668,\n",
233+
" 'cmp_impr': 499999},\n",
234+
" {'cmp_name': 'GRZ_20210727_20220211_35-45_B_EUR',\n",
235+
" 'cmp_bgt': 910204,\n",
236+
" 'cmp_spent': 339997,\n",
237+
" 'cmp_clicks': 16698,\n",
238+
" 'cmp_impr': 500000},\n",
239+
" {'cmp_name': 'BYU_20220216_20220407_20-25_F_EUR',\n",
240+
" 'cmp_bgt': 393134,\n",
241+
" 'cmp_spent': 158930,\n",
242+
" 'cmp_clicks': 46631,\n",
243+
" 'cmp_impr': 500000}]},\n",
244+
" {'user': '{\"username\": \"sarahcarpenter\", \"name\": \"Michael Kane\", \"gender\": \"M\", \"email\": \"[email protected]\", \"age\": 58, \"address\": \"7129 Patrick Walks Suite 215\\\\nLaurenside, LA 97179\"}',\n",
245+
" 'campaigns': [{'cmp_name': 'BYU_20220324_20221230_20-45_B_USD',\n",
246+
" 'cmp_bgt': 819948,\n",
247+
" 'cmp_spent': 105178,\n",
248+
" 'cmp_clicks': 27755,\n",
249+
" 'cmp_impr': 500004},\n",
250+
" {'cmp_name': 'GRZ_20201008_20210604_30-40_B_GBP',\n",
251+
" 'cmp_bgt': 829698,\n",
252+
" 'cmp_spent': 143193,\n",
253+
" 'cmp_clicks': 88114,\n",
254+
" 'cmp_impr': 499998},\n",
255+
" {'cmp_name': 'GRZ_20210710_20211130_25-30_B_USD',\n",
256+
" 'cmp_bgt': 815470,\n",
257+
" 'cmp_spent': 79377,\n",
258+
" 'cmp_clicks': 28283,\n",
259+
" 'cmp_impr': 500002},\n",
260+
" {'cmp_name': 'AKX_20211028_20220112_25-35_F_USD',\n",
261+
" 'cmp_bgt': 944028,\n",
262+
" 'cmp_spent': 657427,\n",
263+
" 'cmp_clicks': 6668,\n",
264+
" 'cmp_impr': 499999},\n",
265+
" {'cmp_name': 'AKX_20211025_20220314_25-35_M_EUR',\n",
266+
" 'cmp_bgt': 39136,\n",
267+
" 'cmp_spent': 29326,\n",
268+
" 'cmp_clicks': 20927,\n",
269+
" 'cmp_impr': 499998},\n",
270+
" {'cmp_name': 'BYU_20211227_20220615_20-35_F_USD',\n",
271+
" 'cmp_bgt': 940412,\n",
272+
" 'cmp_spent': 131757,\n",
273+
" 'cmp_clicks': 57384,\n",
274+
" 'cmp_impr': 500001},\n",
275+
" {'cmp_name': 'AKX_20220323_20230602_35-55_M_GBP',\n",
276+
" 'cmp_bgt': 545483,\n",
277+
" 'cmp_spent': 96427,\n",
278+
" 'cmp_clicks': 43290,\n",
279+
" 'cmp_impr': 499999},\n",
280+
" {'cmp_name': 'AKX_20210917_20220912_35-55_B_USD',\n",
281+
" 'cmp_bgt': 129347,\n",
282+
" 'cmp_spent': 4747,\n",
283+
" 'cmp_clicks': 88217,\n",
284+
" 'cmp_impr': 499999}]}]"
285+
]
286+
},
287+
"execution_count": 8,
288+
"metadata": {},
289+
"output_type": "execute_result"
290+
}
291+
],
292+
"source": [
293+
"# fetch simulated rough data\n",
294+
"rough_data = get_data(users)\n",
295+
"\n",
296+
"rough_data[:2] # let's take a peek"
297+
]
298+
},
299+
{
300+
"cell_type": "code",
301+
"execution_count": 9,
302+
"metadata": {},
303+
"outputs": [
304+
{
305+
"data": {
306+
"text/plain": [
307+
"[{'cmp_name': 'GRZ_20210131_20210411_30-40_F_GBP',\n",
308+
" 'cmp_bgt': 253951,\n",
309+
" 'cmp_spent': 17953,\n",
310+
" 'cmp_clicks': 52573,\n",
311+
" 'cmp_impr': 500001,\n",
312+
" 'user': '{\"username\": \"susan42\", \"name\": \"Emily Smith\", \"gender\": \"F\", \"email\": \"[email protected]\", \"age\": 53, \"address\": \"66537 Riley Mission Apt. 337\\\\nNorth Jennifer, NH 95781\"}'},\n",
313+
" {'cmp_name': 'BYU_20210109_20221204_30-35_M_GBP',\n",
314+
" 'cmp_bgt': 150314,\n",
315+
" 'cmp_spent': 125884,\n",
316+
" 'cmp_clicks': 24575,\n",
317+
" 'cmp_impr': 499999,\n",
318+
" 'user': '{\"username\": \"susan42\", \"name\": \"Emily Smith\", \"gender\": \"F\", \"email\": \"[email protected]\", \"age\": 53, \"address\": \"66537 Riley Mission Apt. 337\\\\nNorth Jennifer, NH 95781\"}'}]"
319+
]
320+
},
321+
"execution_count": 9,
322+
"metadata": {},
323+
"output_type": "execute_result"
324+
}
325+
],
326+
"source": [
327+
"# Let's start from having a different version of the data\n",
328+
"# I want a list whose items will be dicts. Each dict is \n",
329+
"# the original campaign dict plus the user JSON\n",
330+
"\n",
331+
"data = []\n",
332+
"for datum in rough_data:\n",
333+
" for campaign in datum['campaigns']:\n",
334+
" campaign.update({'user': datum['user']})\n",
335+
" data.append(campaign)\n",
336+
"data[:2] # let's take another peek"
337+
]
338+
},
339+
{
340+
"cell_type": "code",
341+
"execution_count": 10,
342+
"metadata": {},
343+
"outputs": [],
344+
"source": [
345+
"# Warning: Uncommenting and executing this cell will overwrite data.json\n",
346+
"#with open('data.json', 'w') as stream:\n",
347+
"# stream.write(json.dumps(data))"
348+
]
349+
}
350+
],
351+
"metadata": {
352+
"kernelspec": {
353+
"display_name": "Python 3 (ipykernel)",
354+
"language": "python",
355+
"name": "python3"
356+
},
357+
"language_info": {
358+
"codemirror_mode": {
359+
"name": "ipython",
360+
"version": 3
361+
},
362+
"file_extension": ".py",
363+
"mimetype": "text/x-python",
364+
"name": "python",
365+
"nbconvert_exporter": "python",
366+
"pygments_lexer": "ipython3",
367+
"version": "3.9.7"
368+
}
369+
},
370+
"nbformat": 4,
371+
"nbformat_minor": 4
372+
}

0 commit comments

Comments
 (0)