Update docs

Weichen Shen · Weichen Shen · commit bd5dc215e1c2 · 2019-01-03T12:48:42.000+08:00
diff --git a/docs/source/Examples.md b/docs/source/Examples.md
@@ -17,43 +17,52 @@ and run the following codes.
 
 ```python
 import pandas as pd
-from sklearn.preprocessing import LabelEncoder,MinMaxScaler
+from sklearn.preprocessing import LabelEncoder, MinMaxScaler
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import log_loss, roc_auc_score
 from deepctr.models import DeepFM
 
-
 data = pd.read_csv('./criteo_sample.txt')
 
-sparse_features  = ['C' + str(i) for i in range(1, 27)]
-dense_features = ['I'+str(i) for i in range(1,14)]
+sparse_features = ['C' + str(i) for i in range(1, 27)]
+dense_features = ['I'+str(i) for i in range(1, 14)]
 
 data[sparse_features] = data[sparse_features].fillna('-1', )
 data[dense_features] = data[dense_features].fillna(0,)
-
 target = ['label']
 
 # 1.Label Encoding for sparse features,and do simple Transformation for dense features
 for feat in sparse_features:
     lbe = LabelEncoder()
     data[feat] = lbe.fit_transform(data[feat])
-mms = MinMaxScaler(feature_range=(0,1))
+mms = MinMaxScaler(feature_range=(0, 1))
 data[dense_features] = mms.fit_transform(data[dense_features])
 
 # 2.count #unique features for each sparse field,and record dense feature field name
 
-sparse_feature_dict = {feat: data[feat].nunique() for feat in sparse_features}
+sparse_feature_dict = {feat: data[feat].nunique()
+                        for feat in sparse_features}
 dense_feature_list = dense_features
 
 # 3.generate input data for model
 
-model_input = [data[feat].values for feat in sparse_feature_dict] + [data[feat].values for feat in dense_feature_list]
-
-#4.Define Model,compile and
-
-
-model = DeepFM({"sparse": sparse_feature_dict, "dense": dense_feature_list}, final_activation='sigmoid')
-model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'], )
-history = model.fit(model_input, data[target].values,
-                    batch_size=256, epochs=1, verbose=2, validation_split=0.2,)
+train, test = train_test_split(data, test_size=0.2)
+train_model_input = [train[feat].values for feat in sparse_feature_dict] + \
+    [train[feat].values for feat in dense_feature_list]
+test_model_input = [test[feat].values for feat in sparse_feature_dict] + \
+    [test[feat].values for feat in dense_feature_list]
+
+# 4.Define Model,train,predict and evaluate
+model = DeepFM({"sparse": sparse_feature_dict,
+                "dense": dense_feature_list}, final_activation='sigmoid')
+model.compile("adam", "binary_crossentropy",
+                metrics=['binary_crossentropy'], )
+
+history = model.fit(train_model_input, train[target].values,
+                    batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
+pred_ans = model.predict(test_model_input, batch_size=256)
+print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
+print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
 ```
 
 ## Regression: Movielens
@@ -70,28 +79,37 @@ This example shows how to use ``DeepFM`` to solve a simple binary regression tas
 
 ```python
 import pandas as pd
-from sklearn.preprocessing import LabelEncoder,MinMaxScaler
+from sklearn.preprocessing import LabelEncoder
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error
 from deepctr.models import DeepFM
 
-
 data = pd.read_csv("./movielens_sample.txt")
-sparse_features = [ "movie_id","user_id","gender","age","occupation","zip"]
+sparse_features = ["movie_id", "user_id",
+                    "gender", "age", "occupation", "zip"]
 target = ['rating']
 
 # 1.Label Encoding for sparse features,and do simple Transformation for dense features
 for feat in sparse_features:
     lbe = LabelEncoder()
     data[feat] = lbe.fit_transform(data[feat])
-#2.count #unique features for each sparse field
-sparse_feature_dim = {feat:data[feat].nunique() for feat in sparse_features}
-#3.generate input data for model
-model_input = [data[feat].values for feat in sparse_feature_dim]
-#4.Define Model,compile and train
-model = DeepFM({"sparse":sparse_feature_dim,"dense":[]},final_activation='linear')
-
-model.compile("adam","mse",metrics=['mse'],)
-history = model.fit(model_input,data[target].values,
-            batch_size=256,epochs=10,verbose=2,validation_split=0.2,)
+# 2.count #unique features for each sparse field
+sparse_feature_dim = {feat: data[feat].nunique()
+                        for feat in sparse_features}
+# 3.generate input data for model
+train, test = train_test_split(data, test_size=0.2)
+train_model_input = [train[feat].values for feat in sparse_feature_dim]
+test_model_input = [test[feat].values for feat in sparse_feature_dim]
+# 4.Define Model,train,predict and evaluate
+model = DeepFM({"sparse": sparse_feature_dim, "dense": []},
+                final_activation='linear')
+model.compile("adam", "mse", metrics=['mse'],)
+
+history = model.fit(train_model_input, train[target].values,
+                    batch_size=256, epochs=1, verbose=2, validation_split=0.2,)
+pred_ans = model.predict(test_model_input, batch_size=256)
+print("test MSE", round(mean_squared_error(
+    test[target].values, pred_ans), 4))
 ```
 ## Multi-value Input : Movielens
 ----------------------------------
diff --git a/docs/source/History.md b/docs/source/History.md
@@ -3,5 +3,5 @@
 - 12/27/2018 : [v0.2.1](https://github.com/shenweichen/DeepCTR/releases/tag/v0.2.1) released.Add [AutoInt](./Features.html#autoint-automatic-feature-interactiont) Model.
 - 12/22/2018 : [v0.2.0](https://github.com/shenweichen/DeepCTR/releases/tag/v0.2.0) released.Add [xDeepFM](./Features.html#xdeepfm) and automatic check for new version.
 - 12/19/2018 : [v0.1.6](https://github.com/shenweichen/DeepCTR/releases/tag/v0.1.6) released.Now DeepCTR is compatible with tensorflow from `1.4-1.12` except for `1.7` and `1.8`. 
-- 29/11/2018 : [v0.1.4](https://github.com/shenweichen/DeepCTR/releases/tag/v0.1.4) released.Add [FAQ](./FAQ.html) in docs
+- 11/29/2018 : [v0.1.4](https://github.com/shenweichen/DeepCTR/releases/tag/v0.1.4) released.Add [FAQ](./FAQ.html) in docs
 - 11/24/2018 : DeepCTR first version v0.1.0  is released on [PyPi](https://pypi.org/project/deepctr/)
diff --git a/docs/source/Quick-Start.md b/docs/source/Quick-Start.md
@@ -0,0 +1,97 @@
+# Quick-Start
+
+## Installation Guide
+### CPU version
+Install the `deepctr` package through `pip`:
+```bash
+pip install deepctr
+```
+### GPU version
+If you have `tensorflow-gpu` installed on your local machine, make sure its version is
+**`tensorflow-gpu>=1.4.0,!=1.7.*,!=1.8.*`**  
+Then, use the following command to install:
+```bash
+pip install deepctr --no-deps
+```
+## Getting started: 4 steps to DeepCTR
+
+
+### Step 1: Import model
+
+
+```python
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder,MinMaxScaler
+
+from deepctr.models import DeepFM
+
+data = pd.read_csv('./criteo_sample.txt')
+
+sparse_features  = ['C' + str(i) for i in range(1, 27)]
+dense_features = ['I'+str(i) for i in range(1,14)]
+target = ['label']
+
+data[sparse_features] = data[sparse_features].fillna('-1', )
+data[dense_features] = data[dense_features].fillna(0,)
+```
+    
+
+
+### Step 2: Simple preprocessing
+
+
+Usually there are two simple ways to encode sparse categorical features for embedding:
+
+- Label Encoding: map the features to integer values from 0 ~ len(#unique) - 1
+- Hash Encoding: map the features to a fixed range, like 0 ~ 9999
+
+Dense numerical features are usually discretized into buckets; here we use normalization instead.
+
+```python
+for feat in sparse_features:
+    lbe = LabelEncoder()# or Hash
+    data[feat] = lbe.fit_transform(data[feat])
+mms = MinMaxScaler(feature_range=(0,1))
+data[dense_features] = mms.fit_transform(data[dense_features])
+```
+
+
+### Step 3: Generate feature config dict
+
+Here, for sparse features, we transform them into dense vectors by embedding techniques.
+For dense numerical features, we add a dummy index like LIBFM.
+That is to say, all dense features under the same field share the same embedding vector.
+In some implementations, the dense feature is concatenated to the input embedding vectors of the deep network; you can modify the code yourself.
+
+```python
+sparse_feature_dict = {feat: data[feat].nunique() for feat in sparse_features}
+dense_feature_list = dense_features
+```
+
+### Step 4: Generate the training samples and train the model
+
+There are two rules here that we must follow:
+
+  - The sparse features are placed in front of the dense features.
+  - The order of the features we feed into the model must be consistent with the iteration order of the feature dictionary.
+
+
+```python
+# make sure the order is right
+model_input = [data[feat].values for feat in sparse_feature_dict] + [data[feat].values for feat in dense_feature_list]
+
+model = DeepFM({"sparse": sparse_feature_dict, "dense": dense_feature_list}, final_activation='sigmoid')
+model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'], )
+history = model.fit(model_input, data[target].values,
+                    batch_size=256, epochs=1, verbose=2, validation_split=0.2,)
+
+```
+You can check the full code [here](./Examples.html#classification-criteo).
+
+
+
+
+
+
+
+
diff --git a/docs/source/Quick-Start.rst b/docs/source/Quick-Start.rst
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -45,7 +45,7 @@ News
    :maxdepth: 2
    :caption: Home:
 
-   Quick-Start
+   Quick-Start<Quick-Start.md>
    Features
    Examples<Examples.md>
    FAQ<FAQ.md>
diff --git a/examples/run_classification_criteo.py b/examples/run_classification_criteo.py
@@ -1,5 +1,7 @@
 import pandas as pd
 from sklearn.preprocessing import LabelEncoder, MinMaxScaler
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import log_loss, roc_auc_score
 from deepctr.models import DeepFM
 
 if __name__ == "__main__":
@@ -27,17 +29,20 @@
 
     # 3.generate input data for model
 
-    model_input = [data[feat].values for feat in sparse_feature_dict] + \
-        [data[feat].values for feat in dense_feature_list]  # + [data[target[0]].values]
+    train, test = train_test_split(data, test_size=0.2)
+    train_model_input = [train[feat].values for feat in sparse_feature_dict] + \
+        [train[feat].values for feat in dense_feature_list]
+    test_model_input = [test[feat].values for feat in sparse_feature_dict] + \
+        [test[feat].values for feat in dense_feature_list]
 
-    # 4.Define Model,compile and train
+    # 4.Define Model,train,predict and evaluate
     model = DeepFM({"sparse": sparse_feature_dict,
                     "dense": dense_feature_list}, final_activation='sigmoid')
-
     model.compile("adam", "binary_crossentropy",
                   metrics=['binary_crossentropy'], )
 
-    history = model.fit(model_input, data[target].values,
-
+    history = model.fit(train_model_input, train[target].values,
                         batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
-    print("demo done")
+    pred_ans = model.predict(test_model_input, batch_size=256)
+    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
+    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
diff --git a/examples/run_regression_movielens.py b/examples/run_regression_movielens.py