Commit a6dbf6a

Add notebook
1 parent f6da528 commit a6dbf6a

File tree

1 file changed: +313 -0 lines changed

cv_load_CatBoost.ipynb

Lines changed: 313 additions & 0 deletions
@@ -0,0 +1,313 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "dtype = {\n",
    "    'Usage': 'category',\n",
    "    'Description': 'category',\n",
    "    'status': 'category',\n",
    "}\n",
    "df = pd.read_csv('data/reservations.csv.gz', dtype=dtype, parse_dates=['created', 'arrival', 'departure'])\n",
    "df.drop(columns='Usage', inplace=True)\n",
    "\n",
    "# missing cancel dates are encoded with a sentinel value; convert them to NaT\n",
    "df.loc[df['cancel_date'] == '0001-01-01T00:00:00', ['cancel_date']] = None\n",
    "df['cancel_date'] = pd.to_datetime(df['cancel_date'])\n",
    "\n",
    "df['arrival_year'] = df['arrival'].dt.year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "appearances = {}\n",
    "for contract_id in df['contract_id'].unique():\n",
    "    subset_df = df.loc[df['contract_id'] == contract_id].sort_values(by=['arrival', 'created'])\n",
    "    # save the last known state\n",
    "    # fill in blanks for bad years\n",
    "    # handle case where cancel year might come after a series of misses\n",
    "    yearly_state = {arrival_year: status for (created, arrival_year, status) in subset_df[['created', 'arrival_year', 'status']].itertuples(index=False, name=None)}\n",
    "    earliest = subset_df['arrival_year'].min()\n",
    "    latest = min(subset_df['arrival_year'].max(), 2019)\n",
    "    activity = [(year, yearly_state.get(year, 'no-show')) for year in range(earliest, latest + 1)]\n",
    "    if activity:\n",
    "        resort_id = subset_df['resort_id'].values[0]\n",
    "        appearances[str(contract_id)] = [resort_id] + activity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "rows = []\n",
    "for r in list(appearances.values()):\n",
    "    resort_id, activity = r[0], r[1:]\n",
    "    row = [None] * 5\n",
    "    row[-len(activity):] = [s for year, s in activity]\n",
    "    rows.append([resort_id] + row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(rows, columns=['resort_id', 'year_2015', 'year_2016', 'year_2017', 'year_2018', 'year_2019'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "resort_id    0.000000\n",
       "year_2015    0.515846\n",
       "year_2016    0.205276\n",
       "year_2017    0.098678\n",
       "year_2018    0.048311\n",
       "year_2019    0.000000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.isnull().sum() / df.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.fillna('missing', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "resort_id    57\n",
       "year_2015     4\n",
       "year_2016     4\n",
       "year_2017     4\n",
       "year_2018     4\n",
       "year_2019     3\n",
       "dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df.drop(\"year_2019\", axis=1)\n",
    "y = df[\"year_2019\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0, 1, 2, 3, 4]\n"
     ]
    }
   ],
   "source": [
    "cat_features = list(range(0, X.shape[1]))\n",
    "print(cat_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "\n",
    "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0:\tlearn: 0.9089395\ttest: 0.9088238\tbest: 0.9088238 (0)\ttotal: 83.9ms\tremaining: 755ms\n",
      "5:\tlearn: 0.6921539\ttest: 0.6930650\tbest: 0.6930650 (5)\ttotal: 219ms\tremaining: 146ms\n",
      "9:\tlearn: 0.6811982\ttest: 0.6819978\tbest: 0.6819978 (9)\ttotal: 319ms\tremaining: 0us\n",
      "\n",
      "bestTest = 0.6819978385\n",
      "bestIteration = 9\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<catboost.core.CatBoostClassifier at 0x7f2c8c533390>"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from catboost import CatBoostClassifier\n",
    "\n",
    "clf = CatBoostClassifier(iterations=10, verbose=5, learning_rate=0.5)\n",
    "clf.fit(X_train, y_train, cat_features=cat_features, eval_set=(X_val, y_val))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_prob = clf.predict_proba(data=X_val)\n",
    "y_pred = clf.predict(data=X_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.7176913425345044\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "accuracy = accuracy_score(y_val, y_pred)\n",
    "print('Accuracy:', accuracy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AUC: 0.8310445038554471\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "auc = roc_auc_score(y_val, y_prob, multi_class=\"ovo\", average=\"macro\")\n",
    "print('AUC:', auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "      active       0.75      0.87      0.80     11582\n",
      "   cancelled       0.63      0.52      0.57      5562\n",
      "     no-show       0.73      0.52      0.61      3578\n",
      "\n",
      "    accuracy                           0.72     20722\n",
      "   macro avg       0.70      0.64      0.66     20722\n",
      "weighted avg       0.71      0.72      0.71     20722\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import classification_report\n",
    "\n",
    "print(classification_report(y_val, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
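
A note on the filename: cv_load_CatBoost.ipynb mentions "cv", but the cells above evaluate on a single 70/30 hold-out split. Below is a minimal sketch of how the same setup could be cross-validated with catboost.cv; the Pool construction, parameter dict, and fold count mirror the fitting cell above but are assumptions, not part of this commit.

# Hypothetical follow-up, not part of the committed notebook: 5-fold CV over the same data.
from catboost import Pool, cv

pool = Pool(data=X, label=y, cat_features=cat_features)
params = {
    'iterations': 10,        # same budget as the classifier fitted above
    'learning_rate': 0.5,
    'loss_function': 'MultiClass',
}
cv_results = cv(pool, params, fold_count=5, stratified=True, verbose=5)
print(cv_results.tail())     # per-iteration mean/std of train and test metrics across folds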

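Similarly, if the "load" in the filename refers to reusing the trained model, nothing above persists it. A sketch using CatBoost's save_model/load_model follows; the file name model.cbm is hypothetical.

# Hypothetical persistence step, not part of the committed notebook.
clf.save_model('model.cbm')

from catboost import CatBoostClassifier

restored = CatBoostClassifier()
restored.load_model('model.cbm')
assert (restored.predict(X_val) == y_pred).all()   # the reloaded model reproduces the predictions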