2
2
from sklearn import cross_validation
3
3
import logloss
4
4
import numpy as np
5
+ import pandas as pd
6
+ from sklearn .preprocessing import Imputer
5
7
6
8
def main(train_path='Data/train.csv', n_folds=5):
    """Cross-validate a random forest on the training CSV and print the mean score.

    The CSV is loaded with pandas; its ``Activity`` column is used as the
    target and every other column as a feature. Missing feature values are
    replaced by per-column means, a 100-tree random forest is fitted on each
    training fold, and the held-out fold is scored with ``logloss.llfun``
    (presumably log loss -- confirm against that module).

    Args:
        train_path: Path of the training CSV. Defaults to the location the
            script has always read from.
        n_folds: Number of K-Fold splits. Defaults to 5.

    Returns:
        The mean of the per-fold scores (the same value that is printed).
    """
    # Read in data, parse into training and target sets.
    dataset = pd.read_csv(train_path)
    target = dataset.Activity.values
    train = dataset.drop('Activity', axis=1).values

    # In this case we'll use a random forest, but this could be any classifier.
    cfr = RandomForestClassifier(n_estimators=100, n_jobs=-1)

    # Simple K-Fold cross validation; indices=False yields boolean masks.
    cv = cross_validation.KFold(len(train), n_folds=n_folds, indices=False)

    # Iterate through the training and test cross validation segments and
    # run the classifier on each one, aggregating the results into a list.
    results = []
    for traincv, testcv in cv:
        # Fit the imputer on the training fold only: computing the column
        # means over the full dataset would leak statistics from the
        # held-out rows into the training features.
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        fold_train = imp.fit_transform(train[traincv])
        fold_test = imp.transform(train[testcv])

        probas = cfr.fit(fold_train, target[traincv]).predict_proba(fold_test)

        # Score using the second column of predict_proba for each row.
        results.append(logloss.llfun(target[testcv], [row[1] for row in probas]))

    # Print out the mean of the cross-validated results. The parenthesised
    # single-argument form prints identically on Python 2 and Python 3.
    mean_score = np.array(results).mean()
    print("Results: " + str(mean_score))
    return mean_score
27
31
28
32
# Run the cross-validation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
0 commit comments