| 
 | 1 | +{  | 
 | 2 | + "cells": [  | 
 | 3 | +  {  | 
 | 4 | +   "cell_type": "code",  | 
 | 5 | +   "execution_count": 4,  | 
 | 6 | +   "metadata": {},  | 
 | 7 | +   "outputs": [],  | 
 | 8 | +   "source": [  | 
 | 9 | +    "df = pd.read_fwf('data/headlines.txt')\n",  | 
 | 10 | +    "df.columns = ['headline']"  | 
 | 11 | +   ]  | 
 | 12 | +  },  | 
 | 13 | +  {  | 
 | 14 | +   "cell_type": "code",  | 
 | 15 | +   "execution_count": 29,  | 
 | 16 | +   "metadata": {},  | 
 | 17 | +   "outputs": [  | 
 | 18 | +    {  | 
 | 19 | +     "data": {  | 
 | 20 | +      "text/html": [  | 
 | 21 | +       "<div>\n",  | 
 | 22 | +       "<style scoped>\n",  | 
 | 23 | +       "    .dataframe tbody tr th:only-of-type {\n",  | 
 | 24 | +       "        vertical-align: middle;\n",  | 
 | 25 | +       "    }\n",  | 
 | 26 | +       "\n",  | 
 | 27 | +       "    .dataframe tbody tr th {\n",  | 
 | 28 | +       "        vertical-align: top;\n",  | 
 | 29 | +       "    }\n",  | 
 | 30 | +       "\n",  | 
 | 31 | +       "    .dataframe thead th {\n",  | 
 | 32 | +       "        text-align: right;\n",  | 
 | 33 | +       "    }\n",  | 
 | 34 | +       "</style>\n",  | 
 | 35 | +       "<table border=\"1\" class=\"dataframe\">\n",  | 
 | 36 | +       "  <thead>\n",  | 
 | 37 | +       "    <tr style=\"text-align: right;\">\n",  | 
 | 38 | +       "      <th></th>\n",  | 
 | 39 | +       "      <th>headline</th>\n",  | 
 | 40 | +       "    </tr>\n",  | 
 | 41 | +       "  </thead>\n",  | 
 | 42 | +       "  <tbody>\n",  | 
 | 43 | +       "    <tr>\n",  | 
 | 44 | +       "      <td>0</td>\n",  | 
 | 45 | +       "      <td>Could Zika Reach New York City?</td>\n",  | 
 | 46 | +       "    </tr>\n",  | 
 | 47 | +       "    <tr>\n",  | 
 | 48 | +       "      <td>1</td>\n",  | 
 | 49 | +       "      <td>First Case of Zika in Miami Beach</td>\n",  | 
 | 50 | +       "    </tr>\n",  | 
 | 51 | +       "    <tr>\n",  | 
 | 52 | +       "      <td>2</td>\n",  | 
 | 53 | +       "      <td>Mystery Virus Spreads in Recife, Brazil</td>\n",  | 
 | 54 | +       "    </tr>\n",  | 
 | 55 | +       "    <tr>\n",  | 
 | 56 | +       "      <td>3</td>\n",  | 
 | 57 | +       "      <td>Dallas man comes down with case of Zika</td>\n",  | 
 | 58 | +       "    </tr>\n",  | 
 | 59 | +       "    <tr>\n",  | 
 | 60 | +       "      <td>4</td>\n",  | 
 | 61 | +       "      <td>Trinidad confirms first Zika case</td>\n",  | 
 | 62 | +       "    </tr>\n",  | 
 | 63 | +       "  </tbody>\n",  | 
 | 64 | +       "</table>\n",  | 
 | 65 | +       "</div>"  | 
 | 66 | +      ],  | 
 | 67 | +      "text/plain": [  | 
 | 68 | +       "                                  headline\n",  | 
 | 69 | +       "0          Could Zika Reach New York City?\n",  | 
 | 70 | +       "1        First Case of Zika in Miami Beach\n",  | 
 | 71 | +       "2  Mystery Virus Spreads in Recife, Brazil\n",  | 
 | 72 | +       "3  Dallas man comes down with case of Zika\n",  | 
 | 73 | +       "4        Trinidad confirms first Zika case"  | 
 | 74 | +      ]  | 
 | 75 | +     },  | 
 | 76 | +     "execution_count": 29,  | 
 | 77 | +     "metadata": {},  | 
 | 78 | +     "output_type": "execute_result"  | 
 | 79 | +    }  | 
 | 80 | +   ],  | 
 | 81 | +   "source": [  | 
 | 82 | +    "df.head()"  | 
 | 83 | +   ]  | 
 | 84 | +  },  | 
 | 85 | +  {  | 
 | 86 | +   "cell_type": "code",  | 
 | 87 | +   "execution_count": 46,  | 
 | 88 | +   "metadata": {},  | 
 | 89 | +   "outputs": [],  | 
 | 90 | +   "source": [  | 
 | 91 | +    "import geonamescache\n",  | 
 | 92 | +    "\n",  | 
 | 93 | +    "gc = geonamescache.GeonamesCache()\n",  | 
 | 94 | +    "countries = gc.get_countries()\n",  | 
 | 95 | +    "cities = gc.get_cities()"  | 
 | 96 | +   ]  | 
 | 97 | +  },  | 
 | 98 | +  {  | 
 | 99 | +   "cell_type": "code",  | 
 | 100 | +   "execution_count": 58,  | 
 | 101 | +   "metadata": {},  | 
 | 102 | +   "outputs": [],  | 
 | 103 | +   "source": [  | 
 | 104 | +    "country_names = []\n",  | 
 | 105 | +    "country_ids = list(countries.keys())\n",  | 
 | 106 | +    "for country_id in country_ids:\n",  | 
 | 107 | +    "    country_names.append(countries[country_id]['name'])"  | 
 | 108 | +   ]  | 
 | 109 | +  },  | 
 | 110 | +  {  | 
 | 111 | +   "cell_type": "code",  | 
 | 112 | +   "execution_count": 60,  | 
 | 113 | +   "metadata": {},  | 
 | 114 | +   "outputs": [  | 
 | 115 | +    {  | 
 | 116 | +     "data": {  | 
 | 117 | +      "text/plain": [  | 
 | 118 | +       "252"  | 
 | 119 | +      ]  | 
 | 120 | +     },  | 
 | 121 | +     "execution_count": 60,  | 
 | 122 | +     "metadata": {},  | 
 | 123 | +     "output_type": "execute_result"  | 
 | 124 | +    }  | 
 | 125 | +   ],  | 
 | 126 | +   "source": [  | 
 | 127 | +    "len(country_names)"  | 
 | 128 | +   ]  | 
 | 129 | +  },  | 
 | 130 | +  {  | 
 | 131 | +   "cell_type": "code",  | 
 | 132 | +   "execution_count": 73,  | 
 | 133 | +   "metadata": {},  | 
 | 134 | +   "outputs": [  | 
 | 135 | +    {  | 
 | 136 | +     "data": {  | 
 | 137 | +      "text/plain": [  | 
 | 138 | +       "['Andorra',\n",  | 
 | 139 | +       " 'United Arab Emirates',\n",  | 
 | 140 | +       " 'Afghanistan',\n",  | 
 | 141 | +       " 'Antigua and Barbuda',\n",  | 
 | 142 | +       " 'Anguilla',\n",  | 
 | 143 | +       " 'Albania',\n",  | 
 | 144 | +       " 'Armenia',\n",  | 
 | 145 | +       " 'Angola',\n",  | 
 | 146 | +       " 'Antarctica',\n",  | 
 | 147 | +       " 'Argentina',\n",  | 
 | 148 | +       " 'American Samoa',\n",  | 
 | 149 | +       " 'Austria',\n",  | 
 | 150 | +       " 'Australia',\n",  | 
 | 151 | +       " 'Aruba',\n",  | 
 | 152 | +       " 'Aland Islands',\n",  | 
 | 153 | +       " 'Azerbaijan',\n",  | 
 | 154 | +       " 'Bosnia and Herzegovina',\n",  | 
 | 155 | +       " 'Barbados',\n",  | 
 | 156 | +       " 'Bangladesh',\n",  | 
 | 157 | +       " 'Belgium']"  | 
 | 158 | +      ]  | 
 | 159 | +     },  | 
 | 160 | +     "execution_count": 73,  | 
 | 161 | +     "metadata": {},  | 
 | 162 | +     "output_type": "execute_result"  | 
 | 163 | +    }  | 
 | 164 | +   ],  | 
 | 165 | +   "source": [  | 
 | 166 | +    "country_names[:20]"  | 
 | 167 | +   ]  | 
 | 168 | +  },  | 
 | 169 | +  {  | 
 | 170 | +   "cell_type": "code",  | 
 | 171 | +   "execution_count": 68,  | 
 | 172 | +   "metadata": {},  | 
 | 173 | +   "outputs": [],  | 
 | 174 | +   "source": [  | 
 | 175 | +    "city_names = []\n",  | 
 | 176 | +    "city_ids = list(cities.keys())\n",  | 
 | 177 | +    "for city_id in city_ids:\n",  | 
 | 178 | +    "    city_names.append(cities[city_id]['name'])"  | 
 | 179 | +   ]  | 
 | 180 | +  },  | 
 | 181 | +  {  | 
 | 182 | +   "cell_type": "code",  | 
 | 183 | +   "execution_count": 70,  | 
 | 184 | +   "metadata": {},  | 
 | 185 | +   "outputs": [  | 
 | 186 | +    {  | 
 | 187 | +     "data": {  | 
 | 188 | +      "text/plain": [  | 
 | 189 | +       "24336"  | 
 | 190 | +      ]  | 
 | 191 | +     },  | 
 | 192 | +     "execution_count": 70,  | 
 | 193 | +     "metadata": {},  | 
 | 194 | +     "output_type": "execute_result"  | 
 | 195 | +    }  | 
 | 196 | +   ],  | 
 | 197 | +   "source": [  | 
 | 198 | +    "len(city_names)"  | 
 | 199 | +   ]  | 
 | 200 | +  },  | 
 | 201 | +  {  | 
 | 202 | +   "cell_type": "code",  | 
 | 203 | +   "execution_count": 94,  | 
 | 204 | +   "metadata": {},  | 
 | 205 | +   "outputs": [  | 
 | 206 | +    {  | 
 | 207 | +     "data": {  | 
 | 208 | +      "text/plain": [  | 
 | 209 | +       "['Andorra la Vella',\n",  | 
 | 210 | +       " 'Umm Al Quwain City',\n",  | 
 | 211 | +       " 'Ras Al Khaimah City',\n",  | 
 | 212 | +       " 'Zayed City',\n",  | 
 | 213 | +       " 'Khawr Fakkān',\n",  | 
 | 214 | +       " 'Dubai',\n",  | 
 | 215 | +       " 'Dibba Al-Fujairah',\n",  | 
 | 216 | +       " 'Dibba Al-Hisn',\n",  | 
 | 217 | +       " 'Sharjah',\n",  | 
 | 218 | +       " 'Ar Ruways',\n",  | 
 | 219 | +       " 'Al Fujairah City',\n",  | 
 | 220 | +       " 'Al Ain City',\n",  | 
 | 221 | +       " 'Ajman City',\n",  | 
 | 222 | +       " 'Adh Dhayd',\n",  | 
 | 223 | +       " 'Abu Dhabi',\n",  | 
 | 224 | +       " 'Khalifah A City',\n",  | 
 | 225 | +       " 'Bani Yas City',\n",  | 
 | 226 | +       " 'Musaffah',\n",  | 
 | 227 | +       " 'Al Shamkhah City',\n",  | 
 | 228 | +       " 'Reef Al Fujairah City']"  | 
 | 229 | +      ]  | 
 | 230 | +     },  | 
 | 231 | +     "execution_count": 94,  | 
 | 232 | +     "metadata": {},  | 
 | 233 | +     "output_type": "execute_result"  | 
 | 234 | +    }  | 
 | 235 | +   ],  | 
 | 236 | +   "source": [  | 
 | 237 | +    "city_names[:20]"  | 
 | 238 | +   ]  | 
 | 239 | +  },  | 
 | 240 | +  {  | 
 | 241 | +   "cell_type": "code",  | 
 | 242 | +   "execution_count": 89,  | 
 | 243 | +   "metadata": {},  | 
 | 244 | +   "outputs": [],  | 
 | 245 | +   "source": [  | 
 | 246 | +    "from unidecode import unidecode\n",  | 
 | 247 | +    "\n",  | 
 | 248 | +    "city_names_unidecoded = [unidecode(city) for city in city_names]"  | 
 | 249 | +   ]  | 
 | 250 | +  },  | 
 | 251 | +  {  | 
 | 252 | +   "cell_type": "code",  | 
 | 253 | +   "execution_count": 93,  | 
 | 254 | +   "metadata": {},  | 
 | 255 | +   "outputs": [  | 
 | 256 | +    {  | 
 | 257 | +     "data": {  | 
 | 258 | +      "text/plain": [  | 
 | 259 | +       "['Andorra la Vella',\n",  | 
 | 260 | +       " 'Umm Al Quwain City',\n",  | 
 | 261 | +       " 'Ras Al Khaimah City',\n",  | 
 | 262 | +       " 'Zayed City',\n",  | 
 | 263 | +       " 'Khawr Fakkan',\n",  | 
 | 264 | +       " 'Dubai',\n",  | 
 | 265 | +       " 'Dibba Al-Fujairah',\n",  | 
 | 266 | +       " 'Dibba Al-Hisn',\n",  | 
 | 267 | +       " 'Sharjah',\n",  | 
 | 268 | +       " 'Ar Ruways',\n",  | 
 | 269 | +       " 'Al Fujairah City',\n",  | 
 | 270 | +       " 'Al Ain City',\n",  | 
 | 271 | +       " 'Ajman City',\n",  | 
 | 272 | +       " 'Adh Dhayd',\n",  | 
 | 273 | +       " 'Abu Dhabi',\n",  | 
 | 274 | +       " 'Khalifah A City',\n",  | 
 | 275 | +       " 'Bani Yas City',\n",  | 
 | 276 | +       " 'Musaffah',\n",  | 
 | 277 | +       " 'Al Shamkhah City',\n",  | 
 | 278 | +       " 'Reef Al Fujairah City']"  | 
 | 279 | +      ]  | 
 | 280 | +     },  | 
 | 281 | +     "execution_count": 93,  | 
 | 282 | +     "metadata": {},  | 
 | 283 | +     "output_type": "execute_result"  | 
 | 284 | +    }  | 
 | 285 | +   ],  | 
 | 286 | +   "source": [  | 
 | 287 | +    "city_names_unidecoded[:20]"  | 
 | 288 | +   ]  | 
 | 289 | +  },  | 
 | 290 | +  {  | 
 | 291 | +   "cell_type": "code",  | 
 | 292 | +   "execution_count": 104,  | 
 | 293 | +   "metadata": {},  | 
 | 294 | +   "outputs": [],  | 
 | 295 | +   "source": [  | 
 | 296 | +    "pattern_country = '|'.join(country_names)\n",  | 
 | 297 | +    "pattern_city = '|'.join(city_names)\n",  | 
 | 298 | +    "\n",  | 
 | 299 | +    "def pattern_searcher(search_str:str, search_list:str):\n",  | 
 | 300 | +    "    search_obj = re.search(search_list, search_str)\n",  | 
 | 301 | +    "    if search_obj :\n",  | 
 | 302 | +    "        return_str = search_str[search_obj.start(): search_obj.end()]\n",  | 
 | 303 | +    "    else:\n",  | 
 | 304 | +    "        return_str = 'NA'\n",  | 
 | 305 | +    "    return return_str\n",  | 
 | 306 | +    "\n",  | 
 | 307 | +    "df['country'] = df['headline'].apply(lambda x: pattern_searcher(search_str=x, search_list=pattern_country))\n",  | 
 | 308 | +    "df['city'] = df['headline'].apply(lambda x: pattern_searcher(search_str=x, search_list=pattern_city))"  | 
 | 309 | +   ]  | 
 | 310 | +  },  | 
 | 311 | +  {  | 
 | 312 | +   "cell_type": "code",  | 
 | 313 | +   "execution_count": 107,  | 
 | 314 | +   "metadata": {},  | 
 | 315 | +   "outputs": [],  | 
 | 316 | +   "source": [  | 
 | 317 | +    "df = df.replace('NA', np.nan)"  | 
 | 318 | +   ]  | 
 | 319 | +  },  | 
 | 320 | +  {  | 
 | 321 | +   "cell_type": "code",  | 
 | 322 | +   "execution_count": 108,  | 
 | 323 | +   "metadata": {},  | 
 | 324 | +   "outputs": [  | 
 | 325 | +    {  | 
 | 326 | +     "data": {  | 
 | 327 | +      "text/plain": [  | 
 | 328 | +       "headline      0\n",  | 
 | 329 | +       "country     633\n",  | 
 | 330 | +       "city         40\n",  | 
 | 331 | +       "dtype: int64"  | 
 | 332 | +      ]  | 
 | 333 | +     },  | 
 | 334 | +     "execution_count": 108,  | 
 | 335 | +     "metadata": {},  | 
 | 336 | +     "output_type": "execute_result"  | 
 | 337 | +    }  | 
 | 338 | +   ],  | 
 | 339 | +   "source": [  | 
 | 340 | +    "df.isnull().sum()"  | 
 | 341 | +   ]  | 
 | 342 | +  },  | 
 | 343 | +  {  | 
 | 344 | +   "cell_type": "code",  | 
 | 345 | +   "execution_count": null,  | 
 | 346 | +   "metadata": {},  | 
 | 347 | +   "outputs": [],  | 
 | 348 | +   "source": []  | 
 | 349 | +  }  | 
 | 350 | + ],  | 
 | 351 | + "metadata": {  | 
 | 352 | +  "kernelspec": {  | 
 | 353 | +   "display_name": "Python 3",  | 
 | 354 | +   "language": "python",  | 
 | 355 | +   "name": "python3"  | 
 | 356 | +  },  | 
 | 357 | +  "language_info": {  | 
 | 358 | +   "codemirror_mode": {  | 
 | 359 | +    "name": "ipython",  | 
 | 360 | +    "version": 3  | 
 | 361 | +   },  | 
 | 362 | +   "file_extension": ".py",  | 
 | 363 | +   "mimetype": "text/x-python",  | 
 | 364 | +   "name": "python",  | 
 | 365 | +   "nbconvert_exporter": "python",  | 
 | 366 | +   "pygments_lexer": "ipython3",  | 
 | 367 | +   "version": "3.6.7"  | 
 | 368 | +  }  | 
 | 369 | + },  | 
 | 370 | + "nbformat": 4,  | 
 | 371 | + "nbformat_minor": 2  | 
 | 372 | +}  | 
0 commit comments