【推荐系统】Facebook经典模型GBDT+LR代码实践

2022-08-03 366

版权

本文内容由阿里云实名注册用户自发贡献，版权归原作者所有，阿里云开发者社区不拥有其著作权，亦不承担相应法律责任。具体规则请查看《阿里云开发者社区用户服务协议》和《阿里云开发者社区知识产权保护指引》。如果您发现本社区中有涉嫌抄袭的内容，填写侵权投诉表单进行举报，一经查实，本社区将立刻删除涉嫌侵权内容。

简介： 在CTR（点击率）预估中，工业界一般会采用逻辑回归进行处理：对用户特征画像进行建模，然后计算点击概率，评估用户是否会产生点击行为。

在CTR（点击率）预估中，工业界一般会采用逻辑回归进行处理：对用户特征画像进行建模，然后计算点击概率，评估用户是否会产生点击行为。

但是逻辑回归天生有一个缺陷：它无法拟合非线性关系。原因是逻辑回归只是在普通线性回归的基础上添加了Sigmoid函数，决策边界仍然是线性的，因此我们需要先构造出线性可分的特征。这时如果依靠人工进行特征组合，成本会非常高，而且需要有经验的专业人士才能找到能够提升模型效果的组合特征。

2014年，Facebook发表了论文《Practical Lessons from Predicting Clicks on Ads at Facebook》，提出使用GBDT自动产生高效的特征组合，再把这些组合特征输入逻辑回归进行预测。

一、导库

import numpy as np

import pandas as pd

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder

from sklearn.metrics import log_loss

import lightgbm as lgb

import gc

from scipy import sparse

import warnings

warnings.filterwarnings('ignore')

二、处理数据

# Directory that holds the Kaggle CSV files.
path = 'data/'

# Load both splits and discard the row identifier, which is not a feature.
df_train = pd.read_csv(path + 'kaggle_train.csv').drop(columns=['Id'])
df_test = pd.read_csv(path + 'kaggle_test.csv').drop(columns=['Id'])

# Tag test rows with Label = -1 so they can be separated again after the
# shared preprocessing.
df_test['Label'] = -1

# Stack train and test rows, then replace every missing value with -1.
data = pd.concat([df_train, df_test], axis=0).fillna(-1)

# Column names of the continuous (I1..I13) and categorical (C1..C26) features.
continuous_feature = ['I%d' % idx for idx in range(1, 14)]
category_feature = ['C%d' % idx for idx in range(1, 27)]

三、构建LR模型

def LR_model(data, continuous_feature, category_feature):
    """Train a logistic-regression baseline and score the test rows.

    Rows whose ``Label`` equals -1 are treated as the test set; every other
    row is training data. Train/validation log loss is printed.

    Args:
        data: DataFrame containing ``Label`` and all feature columns. The
            continuous columns are rescaled in place, so pass a copy if the
            caller still needs the original values.
        continuous_feature: Names of the columns to min-max scale.
        category_feature: Names of the columns to one-hot encode.

    Returns:
        Array of predicted positive-class probabilities for the test rows.
    """
    # Scale every continuous column to [0, 1] independently.
    scaler = MinMaxScaler()
    for col in continuous_feature:
        scaled = scaler.fit_transform(data[col].values.reshape(-1, 1))
        data[col] = scaled.ravel()

    # One-hot encode all categorical columns in a single pass; this yields the
    # same columns, in the same order, as encoding and concatenating them one
    # by one, without re-copying the growing frame for each column.
    data = pd.get_dummies(data, columns=list(category_feature))

    # Separate train and test rows. New frames are built instead of mutating
    # filtered slices, which avoids chained-assignment warnings.
    is_test = data['Label'] == -1
    features = data.drop(columns=['Label'])
    train_data = features[~is_test]
    target = data.loc[~is_test, 'Label']
    test_data = features[is_test]

    # Hold out 30% of the training rows for validation.
    x_train, x_val, y_train, y_val = train_test_split(
        train_data, target, test_size=0.3, random_state=2021)

    # Fit the model and report log loss on both splits.
    LR = LogisticRegression()
    LR.fit(x_train, y_train)
    train_logloss = log_loss(y_train, LR.predict_proba(x_train)[:, 1])
    val_logloss = log_loss(y_val, LR.predict_proba(x_val)[:, 1])
    print('train_logloss: ',train_logloss)
    print('val_logloss：',val_logloss)

    # Predict on the test rows and hand the result back to the caller
    # (previously it was computed and then discarded).
    y_pred = LR.predict_proba(test_data)[:, 1]
    return y_pred

四、构建GBDT模型

def GBDT_model(data, continuous_feature, category_feature):
    """Train a LightGBM GBDT classifier and score the test rows.

    Rows whose ``Label`` equals -1 are treated as the test set; every other
    row is training data. Train/validation log loss is printed.

    Args:
        data: DataFrame containing ``Label`` and all feature columns.
        continuous_feature: Names of the continuous columns. Unused here
            (the trees consume them unscaled); kept so all model builders
            share one signature.
        category_feature: Names of the columns to one-hot encode.

    Returns:
        Array of predicted positive-class probabilities for the test rows.
    """
    # One-hot encode all categorical columns in a single pass.
    data = pd.get_dummies(data, columns=list(category_feature))

    # Separate train and test rows without mutating filtered slices.
    is_test = data['Label'] == -1
    features = data.drop(columns=['Label'])
    train_data = features[~is_test]
    target = data.loc[~is_test, 'Label']
    test_data = features[is_test]

    # Hold out 30% of the training rows for validation.
    x_train, x_val, y_train, y_val = train_test_split(
        train_data, target, test_size=0.3, random_state=2021)

    # `verbose=-1` replaces the former `silent=True`, which newer LightGBM
    # releases no longer accept as a constructor argument.
    GBM = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        subsample=0.8,
        min_child_weight=0.5,
        colsample_bytree=0.7,
        num_leaves=100,
        max_depth=12,
        learning_rate=0.01,
        n_estimators=100,
        verbose=-1,
    )

    # Early stopping is requested through a callback: the
    # `early_stopping_rounds` keyword of `fit` was removed in LightGBM 4.x,
    # while the callback form works on both old and new releases.
    GBM.fit(
        x_train, y_train,
        eval_set=[(x_train, y_train), (x_val, y_val)],
        eval_names=['train', 'val'],
        eval_metric='binary_logloss',
        callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )

    train_logloss = log_loss(y_train, GBM.predict_proba(x_train)[:, 1])
    val_logloss = log_loss(y_val, GBM.predict_proba(x_val)[:, 1])
    print('train_logloss: ',train_logloss)
    print('val_logloss：',val_logloss)

    # Predict on the test rows and hand the result back to the caller
    # (previously it was computed and then discarded).
    y_pred = GBM.predict_proba(test_data)[:, 1]
    return y_pred

五、构建GBDT+LR融合模型

def GBDT_LR_model(data, continuous_feature, category_feature):
    """Train the GBDT+LR stack and score the test rows.

    A LightGBM classifier is fitted first; the index of the leaf each sample
    falls into in every tree is then one-hot encoded and appended to the
    original features, and a logistic regression is trained on the result.
    Rows whose ``Label`` equals -1 are treated as the test set.
    Train/validation log loss of the final LR is printed.

    Args:
        data: DataFrame containing ``Label`` and all feature columns.
        continuous_feature: Names of the columns to min-max scale before LR.
        category_feature: Names of the columns to one-hot encode.

    Returns:
        Array of predicted positive-class probabilities for the test rows.
    """
    # One-hot encode all categorical columns in a single pass.
    data = pd.get_dummies(data, columns=list(category_feature))

    # Separate train and test rows. Indexes are reset so that the leaf-feature
    # frames built below (which carry a fresh 0..n-1 index) line up row by row
    # when concatenated column-wise, whatever index the input frame had.
    is_test = data['Label'] == -1
    features = data.drop(columns=['Label'])
    train_data = features[~is_test].reset_index(drop=True)
    target = data.loc[~is_test, 'Label'].reset_index(drop=True)
    test_data = features[is_test].reset_index(drop=True)
    del data, features
    gc.collect()

    # Hold out 20% of the training rows to validate the GBDT.
    x_train, x_val, y_train, y_val = train_test_split(
        train_data, target, test_size=0.2, random_state=2021)

    # `verbose=-1` replaces the former `silent=True`, which newer LightGBM
    # releases no longer accept as a constructor argument.
    GBM = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        subsample=0.8,
        min_child_weight=0.5,
        colsample_bytree=0.7,
        num_leaves=100,
        max_depth=12,
        learning_rate=0.01,
        n_estimators=100,
        verbose=-1,
    )

    # Early stopping is requested through a callback: the
    # `early_stopping_rounds` keyword of `fit` was removed in LightGBM 4.x,
    # while the callback form works on both old and new releases.
    GBM.fit(
        x_train, y_train,
        eval_set=[(x_train, y_train), (x_val, y_val)],
        eval_names=['train', 'val'],
        eval_metric='binary_logloss',
        callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )

    # For every sample, fetch the leaf index it reaches in each tree.
    model = GBM.booster_
    gbdt_feats_train = model.predict(train_data, pred_leaf=True)
    gbdt_feats_test = model.predict(test_data, pred_leaf=True)
    gbdt_feats_name = ['gbdt_leaf_' + str(i)
                       for i in range(gbdt_feats_train.shape[1])]
    df_train_gbdt_feats = pd.DataFrame(gbdt_feats_train, columns=gbdt_feats_name)
    df_test_gbdt_feats = pd.DataFrame(gbdt_feats_test, columns=gbdt_feats_name)

    # Append the leaf indices to the original features, then stack train and
    # test so both are encoded with the same set of dummy columns.
    train = pd.concat([train_data, df_train_gbdt_feats], axis=1)
    test = pd.concat([test_data, df_test_gbdt_feats], axis=1)
    train_len = train.shape[0]
    data = pd.concat([train, test], ignore_index=True)
    del train, test
    gc.collect()

    # Scale every continuous column to [0, 1] independently.
    scaler = MinMaxScaler()
    for col in continuous_feature:
        scaled = scaler.fit_transform(data[col].values.reshape(-1, 1))
        data[col] = scaled.ravel()

    # One-hot encode the leaf-index columns in a single pass rather than
    # re-concatenating the whole frame once per tree.
    data = pd.get_dummies(data, columns=gbdt_feats_name)

    train = data[:train_len]
    test = data[train_len:]
    del data
    gc.collect()

    # NOTE(review): this split (30%) differs from the one used to fit the GBDT
    # (20%), so some LR validation rows were seen by the GBDT during training
    # and the reported val-logloss is likely optimistic. Kept as in the
    # original; consider reusing the GBDT split.
    x_train, x_val, y_train, y_val = train_test_split(
        train, target, test_size=0.3, random_state=2021)

    # Fit the LR on original + leaf features and report log loss.
    LR = LogisticRegression()
    LR.fit(x_train, y_train)
    train_logloss = log_loss(y_train, LR.predict_proba(x_train)[:, 1])
    val_logloss = log_loss(y_val, LR.predict_proba(x_val)[:, 1])
    print('train-logloss: ', train_logloss)
    print('val-logloss: ', val_logloss)

    # Predict on the test rows and hand the result back to the caller
    # (previously it was computed and then discarded).
    y_pred = LR.predict_proba(test)[:, 1]
    return y_pred

六、评估结果

# Train and evaluate the plain LR baseline on its own copy of the data.
LR_model(
    data=data.copy(),
    continuous_feature=continuous_feature,
    category_feature=category_feature,
)

# Train and evaluate the standalone GBDT model on its own copy of the data.
GBDT_model(
    data=data.copy(),
    continuous_feature=continuous_feature,
    category_feature=category_feature,
)

# Train and evaluate the stacked GBDT+LR model on its own copy of the data.
GBDT_LR_model(
    data=data.copy(),
    continuous_feature=continuous_feature,
    category_feature=category_feature,
)

【推荐系统】Facebook经典模型GBDT+LR代码实践

一、导库

二、处理数据

三、构建LR模型

四、构建GBDT模型

五、构建GBDT+LR融合模型

六、评估结果

热门文章

最新文章

相关课程

相关电子书

探索云世界

热门

云计算

大数据

云原生

人工智能

数据库

开发与运维

活动广场

任务中心

训练营

直播

乘风者计划

下载

镜像站

技术资料

【推荐系统】Facebook经典模型GBDT+LR代码实践

一、导库

二、处理数据

三、构建LR模型

四、构建GBDT模型

五、构建GBDT+LR融合模型

六、评估结果

热门文章

最新文章

相关课程

相关电子书