@@ -59,37 +59,47 @@ def feature_utility(data, selected_feature_name, target_name):
5959 plt .legend ([bar [0 ] for bar in bars ], target_classes , loc = 'best' )
6060 plt .show ()
6161
def encode_label(data):
    """Label-encode the categorical columns of *data* in place.

    Each listed column (including the target ``'y'``) is cast to the
    pandas ``'category'`` dtype and then replaced by integer codes via
    sklearn's ``LabelEncoder`` (classes are sorted alphabetically before
    coding). The frame is mutated and also returned for convenience.
    """
    la_en = preprocessing.LabelEncoder()
    for col in ['job', 'marital', 'education', 'default', 'housing', 'loan',
                'contact', 'month', 'poutcome', 'y']:
        # BUG FIX: operate on the `data` parameter, not the global
        # `bank_data` — the original read the global, so calling this
        # with any other frame would silently encode the wrong data.
        data[col] = data[col].astype('category')
        data[col] = la_en.fit_transform(data[col])
    return data
69+
# Two variants of the UCI bank-marketing data set; we train on the
# larger, full file. Fields are semicolon-separated.
dataset_path = ['bank.csv', 'bank-full.csv']
bank_data = pd.read_csv(dataset_path[-1], sep=';')
print(bank_data.head())

# good categorical features: job, marital, education, housing, loan, contact, month, poutcome
# bad categorical features: default
# feature_utility(bank_data, 'housing', 'y')
6877
# Replace every categorical column (and the target 'y') with integer codes.
bank_data = encode_label(bank_data)
# print(bank_data.dtypes)
# print(bank_data.head())
7681
# All columns except the last are features; the last column is the target.
X_data = bank_data.iloc[:, :-1]
y_data = bank_data.iloc[:, -1]

# Report the class balance — the data set is heavily skewed toward 'no'.
# NOTE(review): this unpack assumes exactly two classes with 'no' as the
# majority (value_counts sorts by frequency) — confirm against the data.
answer_no, answer_yes = y_data.value_counts()
total = answer_no + answer_yes
print('Percentage of answering no: ', answer_no / total)

# Hold out 20% of the rows for a final test split.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data,
    test_size=0.2)
8190
# Class-weighted models to compensate for the heavy 'no' majority.
dt_clf = DecisionTreeClassifier(class_weight='balanced')
rf_clf = RandomForestClassifier(class_weight='balanced')

# Randomize the data and run cross-validation 5 times with a 70/30 split.
# NOTE(review): this is the old sklearn.cross_validation-style ShuffleSplit
# signature (first positional arg is the sample count) — confirm the
# installed sklearn version still supports it.
cv = ShuffleSplit(X_data.shape[0], n_iter=5,
                  test_size=0.3, random_state=0)
# BUG FIX: the scorer name contained a trailing space ('f1 '), which
# sklearn rejects as an unknown scoring string. F1 is the appropriate
# metric here given the skewed class distribution.
print(cross_val_score(dt_clf, X_data, y_data, cv=cv, scoring='f1').mean())
print(cross_val_score(rf_clf, X_data, y_data, cv=cv, scoring='f1').mean())

# dt_clf.fit(X_train, y_train)
# print(dt_clf.score(X_test, y_test))
# rf_clf.fit(X_train, y_train)
# print(rf_clf.score(X_test, y_test))
93103
94104# print(rf_clf.predict(X_test.iloc[10, :][np.newaxis, :]))
95105# print(y_test.iloc[10])
0 commit comments