Day45 神经网络调参

nlp_gte_sentence-embedding_chinese-large

GTE (General Text Embeddings) 是阿里达摩院推出的通用文本向量模型,专门针对中文场景优化,可将文本转换为高质量的向量表示。

一、核心调参知识点 (表格数据专用)

1. 网络架构设计 (Architecture)
  • Embedding 层 (关键)
    • 适用对象:高基数分类变量(如 user_id、item_id、shop_id)。
    • 维度公式: $d=\min(600,\ \lceil n^{0.25}\rceil \times 2)$ 或简单取 $\sqrt[4]{n}\times 2$ 。不要设太大,容易过拟合。
    • 作用:将稀疏的 One-Hot 向量压缩为稠密向量,捕捉特征间关系。
  • 全连接层 (MLP)
    • 宽度:通常呈金字塔型递减(如 512 -> 256 -> 128 -> 64)。
    • 深度:表格数据通常不需要太深,3-5 层足矣。过深会导致梯度消失且难以训练。
  • 激活函数
    • 推荐 ReLU (简单高效) 或 GELU/SiLU (更平滑,收敛更好)。避免在深层使用 Sigmoid/Tanh。
  • 残差连接 (Residuals)
    • 如果层数超过 4 层,建议加入 ResNet 风格的跳跃连接,防止退化。
2. 正则化与防过拟合 (Regularization)
  • Dropout
    • 表格数据极易过拟合。建议在 MLP 层之间加入 Dropout。
    • 经验值:0.1 ~ 0.3。如果数据量小,可提高到 0.5。
  • Batch Normalization (BN)
    • 放在 Linear 之后,Activation 之前。能加速收敛并起到轻微正则化作用。
  • 权重衰减 (Weight Decay / L2)
    • AdamW 优化器自带。推荐范围:1e-5 ~ 1e-3
3. 优化器与学习率 (Optimizer & LR)
  • 优化器:首选 AdamW (比 Adam 解耦了权重衰减,泛化更好)。
  • 学习率 (LR)
    • 初始值:1e-3 (0.001) 是通用起点。
    • Warmup:前几个 Epoch 线性增加 LR,防止初期梯度爆炸。
    • Scheduler:使用 ReduceLROnPlateau (验证集 Loss 不降时减半) 或 CosineAnnealing
4. 损失函数与不平衡处理 (Loss & Imbalance)
  • 问题:IJCAI 转化率极低 (~2%),正负样本极度不平衡。
  • 解决方案
    • Focal Loss:降低易分类样本权重,聚焦难分样本。
    • Weighted BCE:给正样本更高的权重 ( $\text{weight}=\frac{N_{neg}}{N_{pos}}$ )。
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.preprocessing import LabelEncoder

# ==========================
# 1. 定义 Focal Loss (解决样本不平衡)
# ==========================
class FocalLoss(nn.Module):
    """Binary focal loss computed from raw logits.

    The per-element binary cross-entropy is scaled by
    ``alpha * (1 - p_t) ** gamma`` where ``p_t = exp(-bce)``, so elements
    with a small cross-entropy contribute less to the total.

    Args:
        alpha: Constant multiplier applied to every element.
        gamma: Exponent of the ``(1 - p_t)`` modulating factor.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced per-element loss.
    """

    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super().__init__()
        self.alpha, self.gamma, self.reduction = alpha, gamma, reduction

    def forward(self, inputs, targets):
        """Return the focal loss between logits ``inputs`` and ``targets``."""
        elementwise_bce = nn.BCEWithLogitsLoss(reduction='none')
        ce = elementwise_bce(inputs, targets)
        # exp(-ce) recovers the probability assigned to the true class.
        prob_true = torch.exp(-ce)
        weighted = self.alpha * (1 - prob_true) ** self.gamma * ce

        if self.reduction == 'sum':
            return torch.sum(weighted)
        if self.reduction == 'mean':
            return torch.mean(weighted)
        return weighted

# ==========================
# 2. 构建神经网络模型
# ==========================
class TabularNN(nn.Module):
    """Embedding + MLP network for tabular data that emits one logit per row.

    Every categorical feature gets its own ``nn.Embedding``; the embedded
    vectors are concatenated with the numeric features and passed through a
    stack of ``Linear -> BatchNorm1d -> ReLU -> Dropout`` blocks followed by
    a single-unit linear output layer.
    """

    def __init__(self, cat_dims, num_dims, emb_dims, hidden_units, dropout=0.2):
        """
        Args:
            cat_dims: Cardinality of each categorical feature (list of int).
                May be empty for a numeric-only model.
            num_dims: Number of numeric features (int).
            emb_dims: Embedding width for each categorical feature (list of
                int, same length as ``cat_dims``).
            hidden_units: Hidden layer widths of the MLP, e.g.
                ``[512, 256, 128]``.
            dropout: Dropout probability applied after every hidden block.

        Raises:
            ValueError: If ``cat_dims`` and ``emb_dims`` differ in length.
        """
        super().__init__()

        cat_dims = list(cat_dims)
        emb_dims = list(emb_dims)
        # zip() would silently drop the surplus entries while sum(emb_dims)
        # still counted them, so reject inconsistent specs up front.
        if len(cat_dims) != len(emb_dims):
            raise ValueError(
                f"cat_dims has {len(cat_dims)} entries but emb_dims has "
                f"{len(emb_dims)}; they must match one-to-one"
            )

        # 1. One embedding table per categorical feature.
        self.embeddings = nn.ModuleList([
            nn.Embedding(num_embeddings=count, embedding_dim=dim)
            for count, dim in zip(cat_dims, emb_dims)
        ])

        # 2. Width of the vector fed to the MLP.
        input_dim = sum(emb_dims) + num_dims

        # 3. MLP tower.
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_units:
            layers.append(nn.Linear(prev_dim, h_dim))
            layers.append(nn.BatchNorm1d(h_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            prev_dim = h_dim

        self.mlp = nn.Sequential(*layers)

        # 4. Output layer (single logit).
        self.out_layer = nn.Linear(prev_dim, 1)

        self._init_weights()

    def _init_weights(self):
        """Kaiming-normal init for Linear weights (zero bias), Xavier-uniform for embeddings."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Embedding):
                nn.init.xavier_uniform_(m.weight)

    def forward(self, cat_features, num_features):
        """Compute logits for one batch.

        Args:
            cat_features: Sequence of integer index tensors, one per
                embedding table, each of shape ``[B]``.
            num_features: Float tensor of shape ``[B, num_dims]``.

        Returns:
            Tensor of shape ``[B, 1]`` holding raw logits (no sigmoid is
            applied here).

        Raises:
            ValueError: If the number of categorical tensors does not match
                the number of embedding tables.
        """
        if len(cat_features) != len(self.embeddings):
            raise ValueError(
                f"expected {len(self.embeddings)} categorical tensors, "
                f"got {len(cat_features)}"
            )

        # Embed each categorical column, then append the numeric block.
        # Building a single list keeps the concat valid even when there are
        # no categorical features (torch.cat rejects an empty list).
        parts = [emb(cat) for emb, cat in zip(self.embeddings, cat_features)]
        parts.append(num_features)
        x = torch.cat(parts, dim=1)

        x = self.mlp(x)
        return self.out_layer(x)

# ==========================
# 3. 训练与验证流程 (含调参策略)
# ==========================
def _split_batch(batch, device):
    """Split one loader batch into ``(cat_list, num, y)`` moved to ``device``.

    Two layouts are accepted:
      * nested: ``(cat_list, num, y)`` where ``cat_list`` is a list/tuple of
        tensors;
      * flat: ``(cat_0, ..., cat_k, num, y)``, which is what
        ``TensorDataset(*cats, num, y)`` produces.
    """
    if len(batch) == 3 and isinstance(batch[0], (list, tuple)):
        cat_part, num_part, y_part = batch
    else:
        # The last two entries are numeric features and labels; everything
        # before them is one categorical tensor each.
        *cat_part, num_part, y_part = batch
    cat_part = [c.to(device) for c in cat_part]
    return cat_part, num_part.to(device), y_part.to(device)


def train_model(model, train_loader, val_loader, device, epochs=50, lr=1e-3, weight_decay=1e-5):
    """Train ``model`` with focal loss, LR scheduling and early stopping.

    Each epoch runs one pass over ``train_loader`` (AdamW, gradient-norm
    clipping at 1.0) and one pass over ``val_loader``. The learning rate is
    halved when the validation loss has not improved for 3 epochs, and
    training stops after 7 epochs without improvement. The weights from the
    epoch with the lowest validation loss are restored before returning.

    Args:
        model: Module called as ``model(cat_list, num)`` returning logits.
        train_loader: Iterable of batches, nested or flat (see
            ``_split_batch``).
        val_loader: Iterable of validation batches in the same layout.
        device: Device the batches are moved to.
        epochs: Maximum number of epochs.
        lr: Initial learning rate for AdamW.
        weight_decay: Weight decay for AdamW.

    Returns:
        The same ``model`` object, loaded with the best weights seen.
    """
    # Alternative: nn.BCEWithLogitsLoss(pos_weight=torch.tensor([20.0]).to(device))
    criterion = FocalLoss(alpha=1.0, gamma=2.0)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Halve the LR when the validation loss plateaus. The deprecated
    # `verbose` argument is not passed; the LR is printed every epoch below.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

    best_val_loss = float('inf')
    patience_counter = 0
    early_stop_patience = 7
    best_model_state = None

    print(f"{'Epoch':<6} | {'Train Loss':<10} | {'Val Loss':<10} | {'Val AUC':<8} | {'LR'}")
    print("-" * 60)

    for epoch in range(epochs):
        # --- training phase ---
        model.train()
        train_loss_sum = 0.0
        n_train_batches = 0
        for batch in train_loader:
            cat_batch, num_batch, y_batch = _split_batch(batch, device)

            optimizer.zero_grad()
            outputs = model(cat_batch, num_batch)
            loss = criterion(outputs, y_batch.float().reshape(-1, 1))

            loss.backward()
            # Clip the gradient norm to guard against exploding gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            train_loss_sum += loss.item()
            n_train_batches += 1

        avg_train_loss = train_loss_sum / max(n_train_batches, 1)

        # --- validation phase ---
        model.eval()
        val_loss_sum = 0.0
        n_val_batches = 0
        val_probs = []
        val_targets = []

        with torch.no_grad():
            for batch in val_loader:
                cat_batch, num_batch, y_batch = _split_batch(batch, device)

                outputs = model(cat_batch, num_batch)
                loss = criterion(outputs, y_batch.float().reshape(-1, 1))
                val_loss_sum += loss.item()
                n_val_batches += 1

                # Flatten to 1-D so the metric receives plain score/label vectors.
                val_probs.append(torch.sigmoid(outputs).reshape(-1).cpu())
                val_targets.append(y_batch.reshape(-1).cpu())

        avg_val_loss = val_loss_sum / max(n_val_batches, 1)

        # AUC is undefined when the validation labels hold a single class;
        # report NaN rather than aborting the run.
        val_auc = float('nan')
        if val_probs:
            try:
                val_auc = roc_auc_score(torch.cat(val_targets).numpy(), torch.cat(val_probs).numpy())
            except ValueError:
                pass

        current_lr = optimizer.param_groups[0]['lr']
        print(f"{epoch+1:<6} | {avg_train_loss:.6f}   | {avg_val_loss:.6f}   | {val_auc:.4f}   | {current_lr:.2e}")

        scheduler.step(avg_val_loss)

        # Early stopping on validation loss.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            # state_dict() returns references to the live tensors, so clone
            # them; a shallow dict copy would be overwritten by later steps.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1
            if patience_counter >= early_stop_patience:
                print(f"\n⚠️ 早停触发于 Epoch {epoch+1}")
                break

    # Restore the best weights seen during training.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        print(f"✅ 已加载最佳模型 (Val Loss: {best_val_loss:.6f})")

    return model

# ==========================
# 4. 数据预处理示例 (模拟 IJCAI 数据)
# ==========================
def prepare_data(df, cat_cols, num_cols, target_col='is_trade'):
    """Turn a DataFrame into tensors and embedding specs for ``TabularNN``.

    The input ``df`` is not modified.

    Args:
        df: Source DataFrame.
        cat_cols: Names of categorical columns. Missing values are replaced
            by ``-1``, values are cast to ``str`` and mapped to integer codes
            in sorted order of the distinct strings.
        num_cols: Names of numeric columns. Missing values are replaced by 0
            and each column is standardised with its own mean and std.
        target_col: Name of the label column. Rows whose label is ``-1`` or
            missing are treated as unlabeled and dropped from every returned
            tensor.

    Returns:
        Tuple ``(cat_data, num_tensor, y_tensor, cat_dims, emb_dims)``:
        a list of ``torch.long`` code tensors (one per categorical column),
        a ``float32`` tensor of numeric features, a ``torch.long`` label
        tensor (``None`` when ``target_col`` is absent), the cardinality of
        each categorical column, and the embedding width chosen for each as
        ``min(600, ceil(n ** 0.25) * 2)``.
    """
    cat_data = []
    cat_dims = []
    emb_dims = []

    # 1. Integer-encode the categorical columns.
    # NOTE: categories absent from `df` cannot be encoded later; encode the
    # train and test rows together if both must share one mapping.
    for col in cat_cols:
        as_text = df[col].fillna(-1).astype(str)
        # sort=True assigns codes by sorted distinct value, so the mapping
        # does not depend on row order.
        codes, uniques = pd.factorize(as_text, sort=True)
        n_unique = len(uniques)

        cat_data.append(torch.tensor(codes, dtype=torch.long))
        cat_dims.append(n_unique)
        # Rule-of-thumb embedding width.
        emb_dims.append(min(600, int(np.ceil(n_unique ** 0.25) * 2)))

    # 2. Standardise numeric features (epsilon avoids division by zero for
    # constant columns).
    num_data = df[num_cols].fillna(0).to_numpy(dtype=np.float64)
    num_data = (num_data - num_data.mean(axis=0)) / (num_data.std(axis=0) + 1e-8)
    num_tensor = torch.tensor(num_data, dtype=torch.float32)

    # 3. Labels: keep labeled rows only.
    if target_col in df.columns:
        y = df[target_col].to_numpy()
        keep = np.asarray((y != -1) & ~pd.isna(y), dtype=bool)
        # Index the tensors with a torch mask rather than a numpy array.
        keep_t = torch.from_numpy(keep)
        cat_data = [c[keep_t] for c in cat_data]
        num_tensor = num_tensor[keep_t]
        y_tensor = torch.tensor(y[keep].astype(np.int64), dtype=torch.long)
    else:
        y_tensor = None  # no label column available

    return cat_data, num_tensor, y_tensor, cat_dims, emb_dims

# ==========================
# 5. 主执行入口
# ==========================
def _collate_nested(samples):
    """Collate flat ``(cat_0, ..., cat_k, num, y)`` rows into ``(cat_list, num, y)``.

    ``TensorDataset(*cats, num, y)`` yields one tensor per column, whereas
    the training loop unpacks every batch into three items. Grouping the
    categorical tensors into one list gives the batch that shape.
    """
    *cat_batch, num_batch, y_batch = torch.utils.data.dataloader.default_collate(samples)
    return cat_batch, num_batch, y_batch


def main():
    """Run the end-to-end demo on simulated, heavily imbalanced click data."""
    # Fixed seeds so the simulated run is repeatable.
    np.random.seed(42)
    torch.manual_seed(42)

    # Simulated data; replace with a real load such as pd.read_csv(...).
    print("正在生成模拟数据...")
    n_samples = 10000
    df_sim = pd.DataFrame({
        'user_id': np.random.randint(0, 5000, n_samples),
        'item_id': np.random.randint(0, 8000, n_samples),
        'shop_id': np.random.randint(0, 2000, n_samples),
        'hour': np.random.randint(0, 24, n_samples),
        'price': np.random.rand(n_samples) * 100,
        'is_trade': np.random.choice([0, 1], n_samples, p=[0.98, 0.02]),  # ~2% positives
    })

    # Feature configuration.
    cat_cols = ['user_id', 'item_id', 'shop_id', 'hour']
    num_cols = ['price']
    target = 'is_trade'

    cat_tensors, num_tensor, y_tensor, cat_dims, emb_dims = prepare_data(df_sim, cat_cols, num_cols, target)

    # Train/validation split by position (first 80% / last 20%).
    split_idx = int(len(y_tensor) * 0.8)

    train_cat = [c[:split_idx] for c in cat_tensors]
    train_num = num_tensor[:split_idx]
    train_y = y_tensor[:split_idx]

    val_cat = [c[split_idx:] for c in cat_tensors]
    val_num = num_tensor[split_idx:]
    val_y = y_tensor[split_idx:]

    train_dataset = TensorDataset(*train_cat, train_num, train_y)
    val_dataset = TensorDataset(*val_cat, val_num, val_y)

    # The collate function regroups each flat batch into (cat_list, num, y).
    train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, collate_fn=_collate_nested)
    val_loader = DataLoader(val_dataset, batch_size=1024, shuffle=False, collate_fn=_collate_nested)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"使用设备: {device}")

    hidden_layers = [512, 256, 128, 64]  # tuning knob: try fewer or narrower layers
    model = TabularNN(
        cat_dims=cat_dims,
        num_dims=len(num_cols),
        emb_dims=emb_dims,
        hidden_units=hidden_layers,
        dropout=0.3,  # tuning knob: 0.2 ~ 0.5
    ).to(device)

    print(f"模型参数量: {sum(p.numel() for p in model.parameters()):,}")

    train_model(
        model,
        train_loader,
        val_loader,
        device,
        epochs=30,
        lr=1e-3,            # tuning knob: 1e-2 ~ 1e-4
        weight_decay=1e-4,  # tuning knob: 1e-6 ~ 1e-3
    )

    print("\n🎉 训练完成!")


if __name__ == "__main__":
    main()

您可能感兴趣的与本文相关的镜像

nlp_gte_sentence-embedding_chinese-large

nlp_gte_sentence-embedding_chinese-large

文本生成
特征提取
模型微调

GTE (General Text Embeddings) 是阿里达摩院推出的通用文本向量模型,专门针对中文场景优化,可将文本转换为高质量的向量表示。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值