Skip to content

Commit 6ace5bb

Browse files
add 3.亲和性分析推荐电影.ipynb
1 parent 74f69a8 commit 6ace5bb

8 files changed

+103533
-271
lines changed

《Python数据挖掘入门与实践》/0.数据挖掘流程简单示例10min.ipynb

Lines changed: 180 additions & 154 deletions
Large diffs are not rendered by default.

《Python数据挖掘入门与实践》/1.近邻算法分类.ipynb

Lines changed: 59 additions & 56 deletions
Large diffs are not rendered by default.

《Python数据挖掘入门与实践》/2.决策树预测获胜球队.ipynb

Lines changed: 42 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,14 @@
88
{
99
"cell_type": "code",
1010
"execution_count": 1,
11-
"metadata": {
12-
"collapsed": true
13-
},
11+
"metadata": {},
1412
"outputs": [],
1513
"source": [
1614
"import os\n",
1715
"import numpy as np\n",
1816
"import pandas as pd\n",
19-
"home_folder = os.path.expanduser(\"~\")\n",
20-
"data_folder = os.path.join(home_folder, \"Data\", \"basketball\")\n",
17+
"home_folder = \".\"\n",
18+
"data_folder = os.path.join(home_folder, \"data\")\n",
2119
"data_filename = os.path.join(data_folder, \"leagues_NBA_2014_games_games.csv\")"
2220
]
2321
},
@@ -409,7 +407,7 @@
409407
"name": "stdout",
410408
"output_type": "stream",
411409
"text": [
412-
"Home Win percentage: 58.0%\n"
410+
"Home Win 百分比: 58.0%\n"
413411
]
414412
},
415413
{
@@ -550,7 +548,7 @@
550548
}
551549
],
552550
"source": [
553-
"print(\"Home Win percentage: {0:.1f}%\".format(100 * results[\"HomeWin\"].sum() / results[\"HomeWin\"].count()))\n",
551+
"print(\"Home Win 百分比: {0:.1f}%\".format(100 * results[\"HomeWin\"].sum() / results[\"HomeWin\"].count()))\n",
554552
"results[\"HomeLastWin\"] = False\n",
555553
"results[\"VisitorLastWin\"] = False\n",
556554
"# This creates two new columns, all set to False\n",
@@ -757,22 +755,7 @@
757755
"cell_type": "code",
758756
"execution_count": 8,
759757
"metadata": {},
760-
"outputs": [
761-
{
762-
"name": "stderr",
763-
"output_type": "stream",
764-
"text": [
765-
"/home/dlinking-lxy/more-space/pyworks/venv/lib/python3.5/site-packages/ipykernel_launcher.py:13: DeprecationWarning: \n",
766-
".ix is deprecated. Please use\n",
767-
".loc for label based indexing or\n",
768-
".iloc for positional indexing\n",
769-
"\n",
770-
"See the documentation here:\n",
771-
"http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix\n",
772-
" del sys.path[0]\n"
773-
]
774-
}
775-
],
758+
"outputs": [],
776759
"source": [
777760
"# What about win streaks?\n",
778761
"results[\"HomeWinStreak\"] = 0\n",
@@ -786,7 +769,7 @@
786769
" visitor_team = row[\"Visitor Team\"]\n",
787770
" row[\"HomeWinStreak\"] = win_streak[home_team]\n",
788771
" row[\"VisitorWinStreak\"] = win_streak[visitor_team]\n",
789-
" results.ix[index] = row \n",
772+
" results.loc[index] = row \n",
790773
" # Set current win\n",
791774
" if row[\"HomeWin\"]:\n",
792775
" win_streak[home_team] += 1\n",
@@ -820,7 +803,7 @@
820803
},
821804
{
822805
"cell_type": "code",
823-
"execution_count": 10,
806+
"execution_count": 13,
824807
"metadata": {},
825808
"outputs": [
826809
{
@@ -1693,7 +1676,7 @@
16931676
"[30 rows x 24 columns]"
16941677
]
16951678
},
1696-
"execution_count": 10,
1679+
"execution_count": 13,
16971680
"metadata": {},
16981681
"output_type": "execute_result"
16991682
}
@@ -1707,7 +1690,7 @@
17071690
},
17081691
{
17091692
"cell_type": "code",
1710-
"execution_count": 11,
1693+
"execution_count": 15,
17111694
"metadata": {},
17121695
"outputs": [
17131696
{
@@ -1860,7 +1843,7 @@
18601843
"4 False 0 0 0 "
18611844
]
18621845
},
1863-
"execution_count": 11,
1846+
"execution_count": 15,
18641847
"metadata": {},
18651848
"output_type": "execute_result"
18661849
}
@@ -1884,15 +1867,15 @@
18841867
},
18851868
{
18861869
"cell_type": "code",
1887-
"execution_count": 12,
1870+
"execution_count": 26,
18881871
"metadata": {},
18891872
"outputs": [
18901873
{
18911874
"name": "stdout",
18921875
"output_type": "stream",
18931876
"text": [
18941877
"Using whether the home team is ranked higher\n",
1895-
"Accuracy: 60.2%\n"
1878+
"准确率: 60.2%\n"
18961879
]
18971880
}
18981881
],
@@ -1901,27 +1884,19 @@
19011884
"clf = DecisionTreeClassifier(random_state=14)\n",
19021885
"scores = cross_val_score(clf, X_homehigher, y_true, scoring='accuracy')\n",
19031886
"print(\"Using whether the home team is ranked higher\")\n",
1904-
"print(\"Accuracy: {0:.1f}%\".format(np.mean(scores) * 100))"
1887+
"print(\"准确率: {0:.1f}%\".format(np.mean(scores) * 100))"
19051888
]
19061889
},
19071890
{
19081891
"cell_type": "code",
1909-
"execution_count": 13,
1892+
"execution_count": 25,
19101893
"metadata": {},
19111894
"outputs": [
19121895
{
19131896
"name": "stdout",
19141897
"output_type": "stream",
19151898
"text": [
1916-
"Accuracy: 60.5%\n"
1917-
]
1918-
},
1919-
{
1920-
"name": "stderr",
1921-
"output_type": "stream",
1922-
"text": [
1923-
"/home/dlinking-lxy/more-space/pyworks/venv/lib/python3.5/site-packages/sklearn/grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.\n",
1924-
" DeprecationWarning)\n"
1899+
"准确率: 60.5%\n"
19251900
]
19261901
}
19271902
],
@@ -1934,12 +1909,12 @@
19341909
"clf = DecisionTreeClassifier(random_state=14)\n",
19351910
"grid = GridSearchCV(clf, parameter_space)\n",
19361911
"grid.fit(X_homehigher, y_true)\n",
1937-
"print(\"Accuracy: {0:.1f}%\".format(grid.best_score_ * 100))"
1912+
"print(\"准确率: {0:.1f}%\".format(grid.best_score_ * 100))"
19381913
]
19391914
},
19401915
{
19411916
"cell_type": "code",
1942-
"execution_count": 14,
1917+
"execution_count": 18,
19431918
"metadata": {},
19441919
"outputs": [
19451920
{
@@ -2127,7 +2102,7 @@
21272102
"5 0 "
21282103
]
21292104
},
2130-
"execution_count": 14,
2105+
"execution_count": 18,
21312106
"metadata": {},
21322107
"output_type": "execute_result"
21332108
}
@@ -2152,15 +2127,15 @@
21522127
},
21532128
{
21542129
"cell_type": "code",
2155-
"execution_count": 15,
2130+
"execution_count": 19,
21562131
"metadata": {},
21572132
"outputs": [
21582133
{
21592134
"name": "stdout",
21602135
"output_type": "stream",
21612136
"text": [
21622137
"Using whether the home team is ranked higher\n",
2163-
"Accuracy: 60.5%\n"
2138+
"准确率: 60.5%\n"
21642139
]
21652140
}
21662141
],
@@ -2169,19 +2144,19 @@
21692144
"clf = DecisionTreeClassifier(random_state=14)\n",
21702145
"scores = cross_val_score(clf, X_home_higher, y_true, scoring='accuracy')\n",
21712146
"print(\"Using whether the home team is ranked higher\")\n",
2172-
"print(\"Accuracy: {0:.1f}%\".format(np.mean(scores) * 100))\n"
2147+
"print(\"准确率: {0:.1f}%\".format(np.mean(scores) * 100))\n"
21732148
]
21742149
},
21752150
{
21762151
"cell_type": "code",
2177-
"execution_count": 16,
2152+
"execution_count": 20,
21782153
"metadata": {},
21792154
"outputs": [
21802155
{
21812156
"name": "stdout",
21822157
"output_type": "stream",
21832158
"text": [
2184-
"Accuracy: 61.2%\n"
2159+
"准确率: 61.2%\n"
21852160
]
21862161
}
21872162
],
@@ -2198,20 +2173,20 @@
21982173
"\n",
21992174
"clf = DecisionTreeClassifier(random_state=14)\n",
22002175
"scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')\n",
2201-
"print(\"Accuracy: {0:.1f}%\".format(np.mean(scores) * 100))"
2176+
"print(\"准确率: {0:.1f}%\".format(np.mean(scores) * 100))"
22022177
]
22032178
},
22042179
{
22052180
"cell_type": "code",
2206-
"execution_count": 17,
2181+
"execution_count": 21,
22072182
"metadata": {},
22082183
"outputs": [
22092184
{
22102185
"name": "stdout",
22112186
"output_type": "stream",
22122187
"text": [
22132188
"Using full team labels is ranked higher\n",
2214-
"Accuracy: 60.5%\n"
2189+
"准确率: 60.5%\n"
22152190
]
22162191
}
22172192
],
@@ -2220,12 +2195,12 @@
22202195
"clf = RandomForestClassifier(random_state=14)\n",
22212196
"scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')\n",
22222197
"print(\"Using full team labels is ranked higher\")\n",
2223-
"print(\"Accuracy: {0:.1f}%\".format(np.mean(scores) * 100))"
2198+
"print(\"准确率: {0:.1f}%\".format(np.mean(scores) * 100))"
22242199
]
22252200
},
22262201
{
22272202
"cell_type": "code",
2228-
"execution_count": 18,
2203+
"execution_count": 22,
22292204
"metadata": {},
22302205
"outputs": [
22312206
{
@@ -2243,35 +2218,35 @@
22432218
},
22442219
{
22452220
"cell_type": "code",
2246-
"execution_count": 19,
2221+
"execution_count": 23,
22472222
"metadata": {},
22482223
"outputs": [
22492224
{
22502225
"name": "stdout",
22512226
"output_type": "stream",
22522227
"text": [
22532228
"Using whether the home team is ranked higher\n",
2254-
"Accuracy: 60.9%\n"
2229+
"准确率: 60.9%\n"
22552230
]
22562231
}
22572232
],
22582233
"source": [
22592234
"clf = RandomForestClassifier(random_state=14)\n",
22602235
"scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')\n",
22612236
"print(\"Using whether the home team is ranked higher\")\n",
2262-
"print(\"Accuracy: {0:.1f}%\".format(np.mean(scores) * 100))"
2237+
"print(\"准确率: {0:.1f}%\".format(np.mean(scores) * 100))"
22632238
]
22642239
},
22652240
{
22662241
"cell_type": "code",
2267-
"execution_count": 20,
2242+
"execution_count": 24,
22682243
"metadata": {},
22692244
"outputs": [
22702245
{
22712246
"name": "stdout",
22722247
"output_type": "stream",
22732248
"text": [
2274-
"Accuracy: 63.8%\n",
2249+
"准确率: 63.8%\n",
22752250
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
22762251
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
22772252
" min_impurity_split=1e-07, min_samples_leaf=6,\n",
@@ -2297,9 +2272,16 @@
22972272
"clf = RandomForestClassifier(random_state=14)\n",
22982273
"grid = GridSearchCV(clf, parameter_space)\n",
22992274
"grid.fit(X_all, y_true)\n",
2300-
"print(\"Accuracy: {0:.1f}%\".format(grid.best_score_ * 100))\n",
2275+
"print(\"准确率: {0:.1f}%\".format(grid.best_score_ * 100))\n",
23012276
"print(grid.best_estimator_)"
23022277
]
2278+
},
2279+
{
2280+
"cell_type": "code",
2281+
"execution_count": null,
2282+
"metadata": {},
2283+
"outputs": [],
2284+
"source": []
23032285
}
23042286
],
23052287
"metadata": {

0 commit comments

Comments
 (0)