Commit a6dbf6a

Add notebook
1 parent f6da528 commit a6dbf6a

File tree

1 file changed: +313 -0 lines changed

cv_load_CatBoost.ipynb

Lines changed: 313 additions & 0 deletions
@@ -0,0 +1,313 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "dtype = {\n",
    "    'Usage': 'category',\n",
    "    'Description': 'category',\n",
    "    'status': 'category',\n",
    "}\n",
    "df = pd.read_csv('data/reservations.csv.gz', dtype=dtype, parse_dates=['created', 'arrival', 'departure'])\n",
    "df.drop(columns='Usage', inplace=True)\n",
    "\n",
    "# missing cancel dates are encoded with a sentinel value; convert them to NaT\n",
    "df.loc[df['cancel_date'] == '0001-01-01T00:00:00', ['cancel_date']] = None\n",
    "df['cancel_date'] = pd.to_datetime(df['cancel_date'])\n",
    "\n",
    "df['arrival_year'] = df['arrival'].dt.year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "appearances = {}\n",
    "for contract_id in df['contract_id'].unique():\n",
    "    subset_df = df.loc[df['contract_id'] == contract_id].sort_values(by=['arrival', 'created'])\n",
    "    # save the last known state\n",
    "    # fill in blanks for bad years\n",
    "    # handle case where cancel year might come after a series of misses\n",
    "    yearly_state = {arrival_year: status for (created, arrival_year, status) in subset_df[['created', 'arrival_year', 'status']].itertuples(index=False, name=None)}\n",
    "    earliest = subset_df['arrival_year'].min()\n",
    "    latest = min(subset_df['arrival_year'].max(), 2019)\n",
    "    activity = [(year, yearly_state.get(year, 'no-show')) for year in range(earliest, latest + 1)]\n",
    "    if activity:\n",
    "        resort_id = subset_df['resort_id'].values[0]\n",
    "        appearances[str(contract_id)] = [resort_id] + activity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "rows = []\n",
    "for r in list(appearances.values()):\n",
    "    resort_id, activity = r[0], r[1:]\n",
    "    row = [None] * 5\n",
    "    row[-len(activity):] = [s for year, s in activity]\n",
    "    rows.append([resort_id] + row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(rows, columns=['resort_id', 'year_2015', 'year_2016', 'year_2017', 'year_2018', 'year_2019'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "resort_id    0.000000\n",
       "year_2015    0.515846\n",
       "year_2016    0.205276\n",
       "year_2017    0.098678\n",
       "year_2018    0.048311\n",
       "year_2019    0.000000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.isnull().sum() / df.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.fillna('missing', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "resort_id    57\n",
       "year_2015     4\n",
       "year_2016     4\n",
       "year_2017     4\n",
       "year_2018     4\n",
       "year_2019     3\n",
       "dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df.drop(\"year_2019\", axis=1)\n",
    "y = df[\"year_2019\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0, 1, 2, 3, 4]\n"
     ]
    }
   ],
   "source": [
    "cat_features = list(range(0, X.shape[1]))\n",
    "print(cat_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "\n",
    "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0:\tlearn: 0.9089395\ttest: 0.9088238\tbest: 0.9088238 (0)\ttotal: 83.9ms\tremaining: 755ms\n",
      "5:\tlearn: 0.6921539\ttest: 0.6930650\tbest: 0.6930650 (5)\ttotal: 219ms\tremaining: 146ms\n",
      "9:\tlearn: 0.6811982\ttest: 0.6819978\tbest: 0.6819978 (9)\ttotal: 319ms\tremaining: 0us\n",
      "\n",
      "bestTest = 0.6819978385\n",
      "bestIteration = 9\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<catboost.core.CatBoostClassifier at 0x7f2c8c533390>"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from catboost import CatBoostClassifier\n",
    "\n",
    "clf = CatBoostClassifier(iterations=10, verbose=5, learning_rate=0.5)\n",
    "clf.fit(X_train, y_train, cat_features=cat_features, eval_set=(X_val, y_val))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_prob = clf.predict_proba(data=X_val)\n",
    "y_pred = clf.predict(data=X_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.7176913425345044\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "accuracy = accuracy_score(y_val, y_pred)\n",
    "print('Accuracy:', accuracy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AUC: 0.8310445038554471\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "auc = roc_auc_score(y_val, y_prob, multi_class=\"ovo\", average=\"macro\")\n",
    "print('AUC:', auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "      active       0.75      0.87      0.80     11582\n",
      "   cancelled       0.63      0.52      0.57      5562\n",
      "     no-show       0.73      0.52      0.61      3578\n",
      "\n",
      "    accuracy                           0.72     20722\n",
      "   macro avg       0.70      0.64      0.66     20722\n",
      "weighted avg       0.71      0.72      0.71     20722\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import classification_report\n",
    "\n",
    "print(classification_report(y_val, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
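
A note on the filename: cv_load_CatBoost.ipynb mentions "cv", but the cells above evaluate on a single 70/30 hold-out split. Below is a minimal sketch of how the same setup could be cross-validated with catboost.cv; the Pool construction, parameter dict, and fold count mirror the fitting cell above but are assumptions, not part of this commit.

# Hypothetical follow-up, not part of the committed notebook: 5-fold CV over the same data.
from catboost import Pool, cv

pool = Pool(data=X, label=y, cat_features=cat_features)
params = {
    'iterations': 10,        # same budget as the classifier fitted above
    'learning_rate': 0.5,
    'loss_function': 'MultiClass',
}
cv_results = cv(pool, params, fold_count=5, stratified=True, verbose=5)
print(cv_results.tail())     # per-iteration mean/std of train and test metrics across folds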

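Similarly, if the "load" in the filename refers to reusing the trained model, nothing above persists it. A sketch using CatBoost's save_model/load_model follows; the file name model.cbm is hypothetical.

# Hypothetical persistence step, not part of the committed notebook.
clf.save_model('model.cbm')

from catboost import CatBoostClassifier

restored = CatBoostClassifier()
restored.load_model('model.cbm')
assert (restored.predict(X_val) == y_pred).all()   # the reloaded model reproduces the predictions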