Skip to content

Commit bd5dc21

Browse files
author
Weichen Shen
committed
Update docs
1 parent d524c86 commit bd5dc21

File tree

7 files changed

+169
-143
lines changed

7 files changed

+169
-143
lines changed

docs/source/Examples.md

Lines changed: 47 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -17,43 +17,52 @@ and run the following codes.
1717

1818
```python
1919
import pandas as pd
20-
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
20+
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
21+
from sklearn.model_selection import train_test_split
22+
from sklearn.metrics import log_loss, roc_auc_score
2123
from deepctr.models import DeepFM
2224

23-
2425
data = pd.read_csv('./criteo_sample.txt')
2526

26-
sparse_features = ['C' + str(i) for i in range(1, 27)]
27-
dense_features = ['I'+str(i) for i in range(1,14)]
27+
sparse_features = ['C' + str(i) for i in range(1, 27)]
28+
dense_features = ['I'+str(i) for i in range(1, 14)]
2829

2930
data[sparse_features] = data[sparse_features].fillna('-1', )
3031
data[dense_features] = data[dense_features].fillna(0,)
31-
3232
target = ['label']
3333

3434
# 1.Label Encoding for sparse features,and do simple Transformation for dense features
3535
for feat in sparse_features:
3636
lbe = LabelEncoder()
3737
data[feat] = lbe.fit_transform(data[feat])
38-
mms = MinMaxScaler(feature_range=(0,1))
38+
mms = MinMaxScaler(feature_range=(0, 1))
3939
data[dense_features] = mms.fit_transform(data[dense_features])
4040

4141
# 2.count #unique features for each sparse field,and record dense feature field name
4242

43-
sparse_feature_dict = {feat: data[feat].nunique() for feat in sparse_features}
43+
sparse_feature_dict = {feat: data[feat].nunique()
44+
for feat in sparse_features}
4445
dense_feature_list = dense_features
4546

4647
# 3.generate input data for model
4748

48-
model_input = [data[feat].values for feat in sparse_feature_dict] + [data[feat].values for feat in dense_feature_list]
49-
50-
#4.Define Model,compile and
51-
52-
53-
model = DeepFM({"sparse": sparse_feature_dict, "dense": dense_feature_list}, final_activation='sigmoid')
54-
model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'], )
55-
history = model.fit(model_input, data[target].values,
56-
batch_size=256, epochs=1, verbose=2, validation_split=0.2,)
49+
train, test = train_test_split(data, test_size=0.2)
50+
train_model_input = [train[feat].values for feat in sparse_feature_dict] + \
51+
[train[feat].values for feat in dense_feature_list]
52+
test_model_input = [test[feat].values for feat in sparse_feature_dict] + \
53+
[test[feat].values for feat in dense_feature_list]
54+
55+
# 4.Define Model,train,predict and evaluate
56+
model = DeepFM({"sparse": sparse_feature_dict,
57+
"dense": dense_feature_list}, final_activation='sigmoid')
58+
model.compile("adam", "binary_crossentropy",
59+
metrics=['binary_crossentropy'], )
60+
61+
history = model.fit(train_model_input, train[target].values,
62+
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
63+
pred_ans = model.predict(test_model_input, batch_size=256)
64+
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
65+
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
5766
```
5867

5968
## Regression: Movielens
@@ -70,28 +79,37 @@ This example shows how to use ``DeepFM`` to solve a simple binary regression tas
7079

7180
```python
7281
import pandas as pd
73-
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
82+
from sklearn.preprocessing import LabelEncoder
83+
from sklearn.model_selection import train_test_split
84+
from sklearn.metrics import mean_squared_error
7485
from deepctr.models import DeepFM
7586

76-
7787
data = pd.read_csv("./movielens_sample.txt")
78-
sparse_features = [ "movie_id","user_id","gender","age","occupation","zip"]
88+
sparse_features = ["movie_id", "user_id",
89+
"gender", "age", "occupation", "zip"]
7990
target = ['rating']
8091

8192
# 1.Label Encoding for sparse features,and do simple Transformation for dense features
8293
for feat in sparse_features:
8394
lbe = LabelEncoder()
8495
data[feat] = lbe.fit_transform(data[feat])
85-
#2.count #unique features for each sparse field
86-
sparse_feature_dim = {feat:data[feat].nunique() for feat in sparse_features}
87-
#3.generate input data for model
88-
model_input = [data[feat].values for feat in sparse_feature_dim]
89-
#4.Define Model,compile and train
90-
model = DeepFM({"sparse":sparse_feature_dim,"dense":[]},final_activation='linear')
91-
92-
model.compile("adam","mse",metrics=['mse'],)
93-
history = model.fit(model_input,data[target].values,
94-
batch_size=256,epochs=10,verbose=2,validation_split=0.2,)
96+
# 2.count #unique features for each sparse field
97+
sparse_feature_dim = {feat: data[feat].nunique()
98+
for feat in sparse_features}
99+
# 3.generate input data for model
100+
train, test = train_test_split(data, test_size=0.2)
101+
train_model_input = [train[feat].values for feat in sparse_feature_dim]
102+
test_model_input = [test[feat].values for feat in sparse_feature_dim]
103+
# 4.Define Model,train,predict and evaluate
104+
model = DeepFM({"sparse": sparse_feature_dim, "dense": []},
105+
final_activation='linear')
106+
model.compile("adam", "mse", metrics=['mse'],)
107+
108+
history = model.fit(train_model_input, train[target].values,
109+
batch_size=256, epochs=1, verbose=2, validation_split=0.2,)
110+
pred_ans = model.predict(test_model_input, batch_size=256)
111+
print("test MSE", round(mean_squared_error(
112+
test[target].values, pred_ans), 4))
95113
```
96114
## Multi-value Input : Movielens
97115
----------------------------------

docs/source/History.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@
33
- 12/27/2018 : [v0.2.1](https://github.com/shenweichen/DeepCTR/releases/tag/v0.2.1) released. Add [AutoInt](./Features.html#autoint-automatic-feature-interactiont) Model.
44
- 12/22/2018 : [v0.2.0](https://github.com/shenweichen/DeepCTR/releases/tag/v0.2.0) released. Add [xDeepFM](./Features.html#xdeepfm) and automatic check for new version.
55
- 12/19/2018 : [v0.1.6](https://github.com/shenweichen/DeepCTR/releases/tag/v0.1.6) released. Now DeepCTR is compatible with TensorFlow from `1.4-1.12` except for `1.7` and `1.8`.
6-
- 29/11/2018 : [v0.1.4](https://github.com/shenweichen/DeepCTR/releases/tag/v0.1.4) released.Add [FAQ](./FAQ.html) in docs
6+
- 11/29/2018 : [v0.1.4](https://github.com/shenweichen/DeepCTR/releases/tag/v0.1.4) released. Add [FAQ](./FAQ.html) in docs.
77
- 11/24/2018 : DeepCTR first version v0.1.0 is released on [PyPi](https://pypi.org/project/deepctr/)

docs/source/Quick-Start.md

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
# Quick-Start
2+
3+
## Installation Guide
4+
### CPU version
5+
Install the `deepctr` package through `pip`:
6+
```bash
7+
pip install deepctr
8+
```
9+
### GPU version
10+
If you have `tensorflow-gpu` installed on your local machine, make sure its version is
11+
**`tensorflow-gpu>=1.4.0,!=1.7.*,!=1.8.*`**
12+
Then, use the following command to install:
13+
```bash
14+
pip install deepctr --no-deps
15+
```
16+
## Getting started: 4 steps to DeepCTR
17+
18+
19+
### Step 1: Import model
20+
21+
22+
```python
23+
import pandas as pd
24+
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
25+
26+
from deepctr.models import DeepFM
27+
28+
data = pd.read_csv('./criteo_sample.txt')
29+
30+
sparse_features = ['C' + str(i) for i in range(1, 27)]
31+
dense_features = ['I'+str(i) for i in range(1,14)]
32+
target = ['label']
33+
34+
data[sparse_features] = data[sparse_features].fillna('-1', )
35+
data[dense_features] = data[dense_features].fillna(0,)
36+
```
37+
38+
39+
40+
### Step 2: Simple preprocessing
41+
42+
43+
Usually there are two simple ways to encode sparse categorical features for embedding:
44+
45+
- Label Encoding: map the features to integer values from 0 ~ len(#unique) - 1
46+
- Hash Encoding: map the features to a fixed range, like 0 ~ 9999
47+
48+
Dense numerical features are usually discretized into buckets; here we use normalization.
49+
50+
```python
51+
for feat in sparse_features:
52+
lbe = LabelEncoder()# or Hash
53+
data[feat] = lbe.fit_transform(data[feat])
54+
mms = MinMaxScaler(feature_range=(0,1))
55+
data[dense_features] = mms.fit_transform(data[dense_features])
56+
```
57+
58+
59+
### Step 3: Generate feature config dict
60+
61+
Here, for sparse features, we transform them into dense vectors by embedding techniques.
62+
For dense numerical features, we add a dummy index like LIBFM.
63+
That is to say, all dense features under the same field share the same embedding vector.
64+
In some implementations, the dense features are concatenated to the input embedding vectors of the deep network; you can modify the code yourself.
65+
66+
```python
67+
sparse_feature_dict = {feat: data[feat].nunique() for feat in sparse_features}
68+
dense_feature_list = dense_features
69+
```
70+
71+
### Step 4: Generate the training samples and train the model
72+
73+
There are two rules here that we must follow:
74+
75+
- The sparse features are placed in front of the dense features.
76+
- The order of the features we feed into the model must be consistent with the iteration order of the feature dictionary.
77+
78+
79+
```python
80+
# make sure the order is right
81+
model_input = [data[feat].values for feat in sparse_feature_dict] + [data[feat].values for feat in dense_feature_list]
82+
83+
model = DeepFM({"sparse": sparse_feature_dict, "dense": dense_feature_list}, final_activation='sigmoid')
84+
model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'], )
85+
history = model.fit(model_input, data[target].values,
86+
batch_size=256, epochs=1, verbose=2, validation_split=0.2,)
87+
88+
```
89+
You can check the full code [here](./Examples.html#classification-criteo).
90+
91+
92+
93+
94+
95+
96+
97+

docs/source/Quick-Start.rst

Lines changed: 0 additions & 99 deletions
This file was deleted.

docs/source/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ News
4545
:maxdepth: 2
4646
:caption: Home:
4747

48-
Quick-Start
48+
Quick-Start<Quick-Start.md>
4949
Features
5050
Examples<Examples.md>
5151
FAQ<FAQ.md>

examples/run_classification_criteo.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import pandas as pd
22
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
3+
from sklearn.model_selection import train_test_split
4+
from sklearn.metrics import log_loss, roc_auc_score
35
from deepctr.models import DeepFM
46

57
if __name__ == "__main__":
@@ -27,17 +29,20 @@
2729

2830
# 3.generate input data for model
2931

30-
model_input = [data[feat].values for feat in sparse_feature_dict] + \
31-
[data[feat].values for feat in dense_feature_list] # + [data[target[0]].values]
32+
train, test = train_test_split(data, test_size=0.2)
33+
train_model_input = [train[feat].values for feat in sparse_feature_dict] + \
34+
[train[feat].values for feat in dense_feature_list]
35+
test_model_input = [test[feat].values for feat in sparse_feature_dict] + \
36+
[test[feat].values for feat in dense_feature_list]
3237

33-
# 4.Define Model,compile and train
38+
# 4.Define Model,train,predict and evaluate
3439
model = DeepFM({"sparse": sparse_feature_dict,
3540
"dense": dense_feature_list}, final_activation='sigmoid')
36-
3741
model.compile("adam", "binary_crossentropy",
3842
metrics=['binary_crossentropy'], )
3943

40-
history = model.fit(model_input, data[target].values,
41-
44+
history = model.fit(train_model_input, train[target].values,
4245
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
43-
print("demo done")
46+
pred_ans = model.predict(test_model_input, batch_size=256)
47+
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
48+
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

0 commit comments

Comments
 (0)