| 
 | 1 | +{  | 
 | 2 | + "cells": [  | 
 | 3 | +  {  | 
 | 4 | +   "cell_type": "markdown",  | 
 | 5 | +   "metadata": {},  | 
 | 6 | +   "source": [  | 
 | 7 | +    "# Chapter 9 - Data Science\n",  | 
 | 8 | +    "## Data Preparation"  | 
 | 9 | +   ]  | 
 | 10 | +  },  | 
 | 11 | +  {  | 
 | 12 | +   "cell_type": "markdown",  | 
 | 13 | +   "metadata": {},  | 
 | 14 | +   "source": [  | 
 | 15 | +    "## 0 - Setting up the notebook"  | 
 | 16 | +   ]  | 
 | 17 | +  },  | 
 | 18 | +  {  | 
 | 19 | +   "cell_type": "code",  | 
 | 20 | +   "execution_count": 1,  | 
 | 21 | +   "metadata": {},  | 
 | 22 | +   "outputs": [],  | 
 | 23 | +   "source": [  | 
 | 24 | +    "import json\n",  | 
 | 25 | +    "import random\n",  | 
 | 26 | +    "from datetime import date, timedelta\n",  | 
 | 27 | +    "\n",  | 
 | 28 | +    "import faker"  | 
 | 29 | +   ]  | 
 | 30 | +  },  | 
 | 31 | +  {  | 
 | 32 | +   "cell_type": "markdown",  | 
 | 33 | +   "metadata": {},  | 
 | 34 | +   "source": [  | 
 | 35 | +    "## 1 - Preparing the Data"  | 
 | 36 | +   ]  | 
 | 37 | +  },  | 
 | 38 | +  {  | 
 | 39 | +   "cell_type": "code",  | 
 | 40 | +   "execution_count": 2,  | 
 | 41 | +   "metadata": {},  | 
 | 42 | +   "outputs": [],  | 
 | 43 | +   "source": [  | 
 | 44 | +    "# Faker instance used by the cells below to generate the fake user data\n",  | 
 | 45 | +    "fake = faker.Faker()"  | 
 | 46 | +   ]  | 
 | 47 | +  },  | 
 | 48 | +  {  | 
 | 49 | +   "cell_type": "code",  | 
 | 50 | +   "execution_count": 3,  | 
 | 51 | +   "metadata": {},  | 
 | 52 | +   "outputs": [],  | 
 | 53 | +   "source": [  | 
 | 54 | +    "usernames = set()\n",  | 
 | 55 | +    "usernames_no = 1000  # how many unique usernames to collect\n",  | 
 | 56 | +    "\n",  | 
 | 57 | +    "# a set ignores duplicates, so loop until it holds usernames_no unique names\n",  | 
 | 58 | +    "while len(usernames) < usernames_no:\n",  | 
 | 59 | +    "    usernames.add(fake.user_name())"  | 
 | 60 | +   ]  | 
 | 61 | +  },  | 
 | 62 | +  {  | 
 | 63 | +   "cell_type": "code",  | 
 | 64 | +   "execution_count": 4,  | 
 | 65 | +   "metadata": {},  | 
 | 66 | +   "outputs": [  | 
 | 67 | +    {  | 
 | 68 | +     "data": {  | 
 | 69 | +      "text/plain": [  | 
 | 70 | +       "['{\"username\": \"susan42\", \"name\": \"Emily Smith\", \"gender\": \"F\", \"email\": \"[email protected]\", \"age\": 53, \"address\": \"66537 Riley Mission Apt. 337\\\\nNorth Jennifer, NH 95781\"}',\n",   | 
 | 71 | +       " '{\"username\": \"sarahcarpenter\", \"name\": \"Michael Kane\", \"gender\": \"M\", \"email\": \"[email protected]\", \"age\": 58, \"address\": \"7129 Patrick Walks Suite 215\\\\nLaurenside, LA 97179\"}',\n",   | 
 | 72 | +       " '{\"username\": \"kevin37\", \"name\": \"Nathaniel Miller\", \"gender\": \"M\", \"email\": \"[email protected]\", \"age\": 36, \"address\": \"8247 Manning Burgs Suite 806\\\\nLopezshire, MS 06606\"}']"  | 
 | 73 | +      ]  | 
 | 74 | +     },  | 
 | 75 | +     "execution_count": 4,  | 
 | 76 | +     "metadata": {},  | 
 | 77 | +     "output_type": "execute_result"  | 
 | 78 | +    }  | 
 | 79 | +   ],  | 
 | 80 | +   "source": [  | 
 | 81 | +    "def get_random_name_and_gender():\n",  | 
 | 82 | +    "    skew = .6  # 60% of users will be female\n",  | 
 | 83 | +    "    male = random.random() > skew\n",  | 
 | 84 | +    "    if male:\n",  | 
 | 85 | +    "        return fake.name_male(), 'M'\n",  | 
 | 86 | +    "    else:\n",  | 
 | 87 | +    "        return fake.name_female(), 'F'\n",  | 
 | 88 | +    "\n",  | 
 | 89 | +    "# for each username, create a complete user profile\n",  | 
 | 90 | +    "# simulate user data coming from an API. It is a list\n",  | 
 | 91 | +    "# of JSON strings (users).\n",  | 
 | 92 | +    "def get_users(usernames):\n",  | 
 | 93 | +    "    users = []\n",  | 
 | 94 | +    "    for username in usernames:\n",  | 
 | 95 | +    "        name, gender = get_random_name_and_gender()\n",  | 
 | 96 | +    "        user = {\n",  | 
 | 97 | +    "            'username': username,\n",  | 
 | 98 | +    "            'name': name,\n",  | 
 | 99 | +    "            'gender': gender,\n",  | 
 | 100 | +    "            'email': fake.email(),\n",  | 
 | 101 | +    "            'age': fake.random_int(min=18, max=90),\n",  | 
 | 102 | +    "            'address': fake.address(),\n",  | 
 | 103 | +    "        }\n",  | 
 | 104 | +    "        users.append(json.dumps(user))\n",  | 
 | 105 | +    "    return users\n",  | 
 | 106 | +    "\n",  | 
 | 107 | +    "users = get_users(usernames)  # list of JSON strings, one per username\n",  | 
 | 108 | +    "users[:3]  # peek at the first three"  | 
 | 109 | +   ]  | 
 | 110 | +  },  | 
 | 111 | +  {  | 
 | 112 | +   "cell_type": "code",  | 
 | 113 | +   "execution_count": 5,  | 
 | 114 | +   "metadata": {},  | 
 | 115 | +   "outputs": [],  | 
 | 116 | +   "source": [  | 
 | 117 | +    "# campaign name format:\n",  | 
 | 118 | +    "# InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency\n",  | 
 | 119 | +    "def get_type():\n",  | 
 | 120 | +    "    # just some gibberish internal codes\n",  | 
 | 121 | +    "    types = ['AKX', 'BYU', 'GRZ', 'KTR']\n",  | 
 | 122 | +    "    return random.choice(types)\n",  | 
 | 123 | +    "\n",  | 
 | 124 | +    "def get_start_end_dates():\n",  | 
 | 125 | +    "    duration = random.randint(1, 2 * 365)\n",  | 
 | 126 | +    "    offset = random.randint(-365, 365)\n",  | 
 | 127 | +    "    start = date.today() - timedelta(days=offset)\n",  | 
 | 128 | +    "    end = start + timedelta(days=duration)\n",  | 
 | 129 | +    "    \n",  | 
 | 130 | +    "    def _format_date(date_):\n",  | 
 | 131 | +    "        return date_.strftime(\"%Y%m%d\")\n",  | 
 | 132 | +    "    \n",  | 
 | 133 | +    "    return _format_date(start), _format_date(end)\n",  | 
 | 134 | +    "\n",  | 
 | 135 | +    "def get_age():\n",  | 
 | 136 | +    "    age = random.randrange(20, 46, 5)\n",  | 
 | 137 | +    "    diff = random.randrange(5, 26, 5)\n",  | 
 | 138 | +    "    return '{}-{}'.format(age, age + diff)\n",  | 
 | 139 | +    "\n",  | 
 | 140 | +    "def get_gender():  # TargetGender field of the campaign name\n",  | 
 | 141 | +    "    return random.choice(('M', 'F', 'B'))  # one of the three codes, picked at random\n",  | 
 | 142 | +    "\n",  | 
 | 143 | +    "def get_currency():  # Currency field of the campaign name\n",  | 
 | 144 | +    "    return random.choice(('GBP', 'EUR', 'USD'))  # one of the three codes, picked at random\n",  | 
 | 145 | +    "\n",  | 
 | 146 | +    "def get_campaign_name():\n",  | 
 | 147 | +    "    separator = '_'\n",  | 
 | 148 | +    "    type_ = get_type()\n",  | 
 | 149 | +    "    start, end = get_start_end_dates()\n",  | 
 | 150 | +    "    age = get_age()\n",  | 
 | 151 | +    "    gender = get_gender()\n",  | 
 | 152 | +    "    currency = get_currency()\n",  | 
 | 153 | +    "    return separator.join(\n",  | 
 | 154 | +    "        (type_, start, end, age, gender, currency))"  | 
 | 155 | +   ]  | 
 | 156 | +  },  | 
 | 157 | +  {  | 
 | 158 | +   "cell_type": "code",  | 
 | 159 | +   "execution_count": 6,  | 
 | 160 | +   "metadata": {},  | 
 | 161 | +   "outputs": [],  | 
 | 162 | +   "source": [  | 
 | 163 | +    "# campaign data:\n",  | 
 | 164 | +    "# name, budget, spent, clicks, impressions\n",  | 
 | 165 | +    "def get_campaign_data():\n",  | 
 | 166 | +    "    name = get_campaign_name()\n",  | 
 | 167 | +    "    budget = random.randint(10**3, 10**6)\n",  | 
 | 168 | +    "    spent = random.randint(10**2, budget)    \n",  | 
 | 169 | +    "    clicks = int(random.triangular(10**2, 10**5, 0.2 * 10**5))    \n",  | 
 | 170 | +    "    impressions = int(random.gauss(0.5 * 10**6, 2))\n",  | 
 | 171 | +    "    return {\n",  | 
 | 172 | +    "        'cmp_name': name,\n",  | 
 | 173 | +    "        'cmp_bgt': budget,\n",  | 
 | 174 | +    "        'cmp_spent': spent,\n",  | 
 | 175 | +    "        'cmp_clicks': clicks,\n",  | 
 | 176 | +    "        'cmp_impr': impressions\n",  | 
 | 177 | +    "    }"  | 
 | 178 | +   ]  | 
 | 179 | +  },  | 
 | 180 | +  {  | 
 | 181 | +   "cell_type": "code",  | 
 | 182 | +   "execution_count": 7,  | 
 | 183 | +   "metadata": {},  | 
 | 184 | +   "outputs": [],  | 
 | 185 | +   "source": [  | 
 | 186 | +    "# assemble the logic to get the final version of the rough data\n",  | 
 | 187 | +    "# data will be a list of dictionaries. Each dictionary will follow\n",  | 
 | 188 | +    "# this structure:\n",  | 
 | 189 | +    "# {'user': user_json, 'campaigns': [c1, c2, ...]}\n",  | 
 | 190 | +    "# where user_json is the JSON string version of a user data dict\n",  | 
 | 191 | +    "# and c1, c2, ... are campaign dicts as returned by\n",  | 
 | 192 | +    "# get_campaign_data\n",  | 
 | 193 | +    "\n",  | 
 | 194 | +    "def get_data(users):\n",  | 
 | 195 | +    "    data = []\n",  | 
 | 196 | +    "    for user in users:\n",  | 
 | 197 | +    "        campaigns = [get_campaign_data()\n",  | 
 | 198 | +    "                     for _ in range(random.randint(2, 8))]\n",  | 
 | 199 | +    "        data.append({'user': user, 'campaigns': campaigns})\n",  | 
 | 200 | +    "    return data"  | 
 | 201 | +   ]  | 
 | 202 | +  },  | 
 | 203 | +  {  | 
 | 204 | +   "cell_type": "markdown",  | 
 | 205 | +   "metadata": {},  | 
 | 206 | +   "source": [  | 
 | 207 | +    "## 2 - Cleaning the data"  | 
 | 208 | +   ]  | 
 | 209 | +  },  | 
 | 210 | +  {  | 
 | 211 | +   "cell_type": "code",  | 
 | 212 | +   "execution_count": 8,  | 
 | 213 | +   "metadata": {},  | 
 | 214 | +   "outputs": [  | 
 | 215 | +    {  | 
 | 216 | +     "data": {  | 
 | 217 | +      "text/plain": [  | 
 | 218 | +       "[{'user': '{\"username\": \"susan42\", \"name\": \"Emily Smith\", \"gender\": \"F\", \"email\": \"[email protected]\", \"age\": 53, \"address\": \"66537 Riley Mission Apt. 337\\\\nNorth Jennifer, NH 95781\"}',\n",   | 
 | 219 | +       "  'campaigns': [{'cmp_name': 'GRZ_20210131_20210411_30-40_F_GBP',\n",  | 
 | 220 | +       "    'cmp_bgt': 253951,\n",  | 
 | 221 | +       "    'cmp_spent': 17953,\n",  | 
 | 222 | +       "    'cmp_clicks': 52573,\n",  | 
 | 223 | +       "    'cmp_impr': 500001},\n",  | 
 | 224 | +       "   {'cmp_name': 'BYU_20210109_20221204_30-35_M_GBP',\n",  | 
 | 225 | +       "    'cmp_bgt': 150314,\n",  | 
 | 226 | +       "    'cmp_spent': 125884,\n",  | 
 | 227 | +       "    'cmp_clicks': 24575,\n",  | 
 | 228 | +       "    'cmp_impr': 499999},\n",  | 
 | 229 | +       "   {'cmp_name': 'GRZ_20211124_20220921_20-35_B_EUR',\n",  | 
 | 230 | +       "    'cmp_bgt': 791397,\n",  | 
 | 231 | +       "    'cmp_spent': 480963,\n",  | 
 | 232 | +       "    'cmp_clicks': 39668,\n",  | 
 | 233 | +       "    'cmp_impr': 499999},\n",  | 
 | 234 | +       "   {'cmp_name': 'GRZ_20210727_20220211_35-45_B_EUR',\n",  | 
 | 235 | +       "    'cmp_bgt': 910204,\n",  | 
 | 236 | +       "    'cmp_spent': 339997,\n",  | 
 | 237 | +       "    'cmp_clicks': 16698,\n",  | 
 | 238 | +       "    'cmp_impr': 500000},\n",  | 
 | 239 | +       "   {'cmp_name': 'BYU_20220216_20220407_20-25_F_EUR',\n",  | 
 | 240 | +       "    'cmp_bgt': 393134,\n",  | 
 | 241 | +       "    'cmp_spent': 158930,\n",  | 
 | 242 | +       "    'cmp_clicks': 46631,\n",  | 
 | 243 | +       "    'cmp_impr': 500000}]},\n",  | 
 | 244 | +       " {'user': '{\"username\": \"sarahcarpenter\", \"name\": \"Michael Kane\", \"gender\": \"M\", \"email\": \"[email protected]\", \"age\": 58, \"address\": \"7129 Patrick Walks Suite 215\\\\nLaurenside, LA 97179\"}',\n",   | 
 | 245 | +       "  'campaigns': [{'cmp_name': 'BYU_20220324_20221230_20-45_B_USD',\n",  | 
 | 246 | +       "    'cmp_bgt': 819948,\n",  | 
 | 247 | +       "    'cmp_spent': 105178,\n",  | 
 | 248 | +       "    'cmp_clicks': 27755,\n",  | 
 | 249 | +       "    'cmp_impr': 500004},\n",  | 
 | 250 | +       "   {'cmp_name': 'GRZ_20201008_20210604_30-40_B_GBP',\n",  | 
 | 251 | +       "    'cmp_bgt': 829698,\n",  | 
 | 252 | +       "    'cmp_spent': 143193,\n",  | 
 | 253 | +       "    'cmp_clicks': 88114,\n",  | 
 | 254 | +       "    'cmp_impr': 499998},\n",  | 
 | 255 | +       "   {'cmp_name': 'GRZ_20210710_20211130_25-30_B_USD',\n",  | 
 | 256 | +       "    'cmp_bgt': 815470,\n",  | 
 | 257 | +       "    'cmp_spent': 79377,\n",  | 
 | 258 | +       "    'cmp_clicks': 28283,\n",  | 
 | 259 | +       "    'cmp_impr': 500002},\n",  | 
 | 260 | +       "   {'cmp_name': 'AKX_20211028_20220112_25-35_F_USD',\n",  | 
 | 261 | +       "    'cmp_bgt': 944028,\n",  | 
 | 262 | +       "    'cmp_spent': 657427,\n",  | 
 | 263 | +       "    'cmp_clicks': 6668,\n",  | 
 | 264 | +       "    'cmp_impr': 499999},\n",  | 
 | 265 | +       "   {'cmp_name': 'AKX_20211025_20220314_25-35_M_EUR',\n",  | 
 | 266 | +       "    'cmp_bgt': 39136,\n",  | 
 | 267 | +       "    'cmp_spent': 29326,\n",  | 
 | 268 | +       "    'cmp_clicks': 20927,\n",  | 
 | 269 | +       "    'cmp_impr': 499998},\n",  | 
 | 270 | +       "   {'cmp_name': 'BYU_20211227_20220615_20-35_F_USD',\n",  | 
 | 271 | +       "    'cmp_bgt': 940412,\n",  | 
 | 272 | +       "    'cmp_spent': 131757,\n",  | 
 | 273 | +       "    'cmp_clicks': 57384,\n",  | 
 | 274 | +       "    'cmp_impr': 500001},\n",  | 
 | 275 | +       "   {'cmp_name': 'AKX_20220323_20230602_35-55_M_GBP',\n",  | 
 | 276 | +       "    'cmp_bgt': 545483,\n",  | 
 | 277 | +       "    'cmp_spent': 96427,\n",  | 
 | 278 | +       "    'cmp_clicks': 43290,\n",  | 
 | 279 | +       "    'cmp_impr': 499999},\n",  | 
 | 280 | +       "   {'cmp_name': 'AKX_20210917_20220912_35-55_B_USD',\n",  | 
 | 281 | +       "    'cmp_bgt': 129347,\n",  | 
 | 282 | +       "    'cmp_spent': 4747,\n",  | 
 | 283 | +       "    'cmp_clicks': 88217,\n",  | 
 | 284 | +       "    'cmp_impr': 499999}]}]"  | 
 | 285 | +      ]  | 
 | 286 | +     },  | 
 | 287 | +     "execution_count": 8,  | 
 | 288 | +     "metadata": {},  | 
 | 289 | +     "output_type": "execute_result"  | 
 | 290 | +    }  | 
 | 291 | +   ],  | 
 | 292 | +   "source": [  | 
 | 293 | +    "# build the simulated rough data: one {'user', 'campaigns'} dict per user\n",  | 
 | 294 | +    "rough_data = get_data(users)\n",  | 
 | 295 | +    "\n",  | 
 | 296 | +    "rough_data[:2]  # let's take a peek"  | 
 | 297 | +   ]  | 
 | 298 | +  },  | 
 | 299 | +  {  | 
 | 300 | +   "cell_type": "code",  | 
 | 301 | +   "execution_count": 9,  | 
 | 302 | +   "metadata": {},  | 
 | 303 | +   "outputs": [  | 
 | 304 | +    {  | 
 | 305 | +     "data": {  | 
 | 306 | +      "text/plain": [  | 
 | 307 | +       "[{'cmp_name': 'GRZ_20210131_20210411_30-40_F_GBP',\n",  | 
 | 308 | +       "  'cmp_bgt': 253951,\n",  | 
 | 309 | +       "  'cmp_spent': 17953,\n",  | 
 | 310 | +       "  'cmp_clicks': 52573,\n",  | 
 | 311 | +       "  'cmp_impr': 500001,\n",  | 
 | 312 | +       "  'user': '{\"username\": \"susan42\", \"name\": \"Emily Smith\", \"gender\": \"F\", \"email\": \"[email protected]\", \"age\": 53, \"address\": \"66537 Riley Mission Apt. 337\\\\nNorth Jennifer, NH 95781\"}'},\n",   | 
 | 313 | +       " {'cmp_name': 'BYU_20210109_20221204_30-35_M_GBP',\n",  | 
 | 314 | +       "  'cmp_bgt': 150314,\n",  | 
 | 315 | +       "  'cmp_spent': 125884,\n",  | 
 | 316 | +       "  'cmp_clicks': 24575,\n",  | 
 | 317 | +       "  'cmp_impr': 499999,\n",  | 
 | 318 | +       "  'user': '{\"username\": \"susan42\", \"name\": \"Emily Smith\", \"gender\": \"F\", \"email\": \"[email protected]\", \"age\": 53, \"address\": \"66537 Riley Mission Apt. 337\\\\nNorth Jennifer, NH 95781\"}'}]"  | 
 | 319 | +      ]  | 
 | 320 | +     },  | 
 | 321 | +     "execution_count": 9,  | 
 | 322 | +     "metadata": {},  | 
 | 323 | +     "output_type": "execute_result"  | 
 | 324 | +    }  | 
 | 325 | +   ],  | 
 | 326 | +   "source": [  | 
 | 327 | +    "# Let's start from having a different version of the data\n",  | 
 | 328 | +    "# I want a list whose items will be dicts. Each dict is \n",  | 
 | 329 | +    "# the original campaign dict plus the user JSON\n",  | 
 | 330 | +    "\n",  | 
 | 331 | +    "data = []\n",  | 
 | 332 | +    "for datum in rough_data:\n",  | 
 | 333 | +    "    for campaign in datum['campaigns']:\n",  | 
 | 334 | +    "        campaign.update({'user': datum['user']})\n",  | 
 | 335 | +    "        data.append(campaign)\n",  | 
 | 336 | +    "data[:2]  # let's take another peek"  | 
 | 337 | +   ]  | 
 | 338 | +  },  | 
 | 339 | +  {  | 
 | 340 | +   "cell_type": "code",  | 
 | 341 | +   "execution_count": 10,  | 
 | 342 | +   "metadata": {},  | 
 | 343 | +   "outputs": [],  | 
 | 344 | +   "source": [  | 
 | 345 | +    "# Warning: Uncommenting and executing this cell will overwrite data.json\n",  | 
 | 346 | +    "#with open('data.json', 'w') as stream:\n",  | 
 | 347 | +    "#     stream.write(json.dumps(data))"  | 
 | 348 | +   ]  | 
 | 349 | +  }  | 
 | 350 | + ],  | 
 | 351 | + "metadata": {  | 
 | 352 | +  "kernelspec": {  | 
 | 353 | +   "display_name": "Python 3 (ipykernel)",  | 
 | 354 | +   "language": "python",  | 
 | 355 | +   "name": "python3"  | 
 | 356 | +  },  | 
 | 357 | +  "language_info": {  | 
 | 358 | +   "codemirror_mode": {  | 
 | 359 | +    "name": "ipython",  | 
 | 360 | +    "version": 3  | 
 | 361 | +   },  | 
 | 362 | +   "file_extension": ".py",  | 
 | 363 | +   "mimetype": "text/x-python",  | 
 | 364 | +   "name": "python",  | 
 | 365 | +   "nbconvert_exporter": "python",  | 
 | 366 | +   "pygments_lexer": "ipython3",  | 
 | 367 | +   "version": "3.9.7"  | 
 | 368 | +  }  | 
 | 369 | + },  | 
 | 370 | + "nbformat": 4,  | 
 | 371 | + "nbformat_minor": 4  | 
 | 372 | +}  | 
0 commit comments