|
|
@@ -2,7 +2,7 @@
|
|
|
# -*- coding:utf-8 -*-
|
|
|
import numpy as np
|
|
|
from models.rank.data import DataLoader
|
|
|
-from sklearn.ensemble import GradientBoostingClassifier
|
|
|
+from lightgbm import LGBMClassifier # 替换为LightGBM
|
|
|
from sklearn.linear_model import LogisticRegression
|
|
|
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
|
|
|
from sklearn.model_selection import GridSearchCV
|
|
|
@@ -14,26 +14,61 @@ class Trainer:
|
|
|
def __init__(self, path):
|
|
|
self._load_data(path)
|
|
|
|
|
|
- # 初始化GBDT和LR模型参数
|
|
|
- self._gbdt_params = {
|
|
|
- 'n_estimators': 100,
|
|
|
- 'learning_rate': 0.01,
|
|
|
- 'max_depth': 6,
|
|
|
- 'subsample': 0.8,
|
|
|
- 'random_state': 42,
|
|
|
+ # 初始化LightGBM和LR模型参数
|
|
|
+ self._lgbm_params = {
|
|
|
+ # 核心参数
|
|
|
+ 'objective': 'binary', # 二分类任务
|
|
|
+ 'boosting_type': 'gbdt', # 传统GBDT算法
|
|
|
+ # 'metric': ['auc', 'binary_logloss'], # 评估指标
|
|
|
+
|
|
|
+ # 树结构控制
|
|
|
+ 'num_leaves': 31, # 叶子节点数 (建议20-63)
|
|
|
+ 'max_depth': 7, # 树深度 (3-7)
|
|
|
+ 'min_child_samples': 30, # 叶子节点最小样本数 (20-100)
|
|
|
+ 'min_split_gain': 0.02, # 分裂最小增益 (0.01-0.1)
|
|
|
+
|
|
|
+ # 正则化
|
|
|
+ 'lambda_l1': 0.1, # L1正则 (0-10)
|
|
|
+ 'lambda_l2': 0.2, # L2正则 (0-10)
|
|
|
+ 'feature_fraction': 0.8, # 特征采样比例 (0.7-1.0)
|
|
|
+ 'bagging_fraction': 0.9, # 数据采样比例 (0.8-1.0)
|
|
|
+ 'bagging_freq': 5, # 每5次迭代执行bagging
|
|
|
+
|
|
|
+ # 学习控制
|
|
|
+ 'learning_rate': 0.05, # 学习率 (0.01-0.1)
|
|
|
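+ 'n_estimators': 1000, # number of trees. NOTE: early stopping is not active — 'early_stopping_rounds' is commented out and fit() receives no eval_set — so all 1000 trees are always trained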
|
|
|
+ # 'early_stopping_rounds': 50, # 早停轮数
|
|
|
+
|
|
|
+ # 类别特征处理
|
|
|
+ # 'categorical_feature': 'auto', # 自动检测类别特征
|
|
|
+ # 'max_cat_to_onehot': 5, # 类别值>5时不做one-hot
|
|
|
+
|
|
|
+ # 系统
|
|
|
+ 'n_jobs': -1, # 使用所有CPU
|
|
|
+ 'random_state': 42, # 随机种子
|
|
|
+ 'verbose': -1 # 不输出日志
|
|
|
}
|
|
|
self._lr_params = {
|
|
|
- "max_iter": 1000,
|
|
|
- 'C': 1.0,
|
|
|
- 'penalty': 'elasticnet',
|
|
|
- 'l1_ratio': 0.8, # 添加 l1_ratio 参数,可以根据需要调整
|
|
|
- 'solver': 'saga',
|
|
|
+ # 求解器
|
|
|
+ 'penalty': 'elasticnet', # 弹性网络正则
|
|
|
+ 'solver': 'saga', # 支持elasticnet
|
|
|
+ 'max_iter': 1000, # 迭代次数
|
|
|
+
|
|
|
+ # 正则化
|
|
|
+ 'C': 0.3, # 逆正则强度 (0.1-1.0)
|
|
|
+ 'l1_ratio': 0.7, # L1权重 (0.5-0.9)
|
|
|
+
|
|
|
+ # 类别平衡
|
|
|
+ 'class_weight': 'balanced', # 自动平衡类别权重
|
|
|
+
|
|
|
+ # 系统
|
|
|
'random_state': 42,
|
|
|
- 'class_weight': 'balanced'
|
|
|
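+ 'n_jobs': -1, # CPU cores for per-class (one-vs-rest) parallelism; presumably a no-op for this binary task — verify against the installed scikit-learn version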
|
|
|
+ 'tol': 1e-4 # solver convergence tolerance (stopping criterion), not validation-based early stopping
|
|
|
}
|
|
|
|
|
|
# 初始化模型
|
|
|
- self._gbdt_model = GradientBoostingClassifier(**self._gbdt_params)
|
|
|
+ self._lgbm_model = LGBMClassifier(**self._lgbm_params)
|
|
|
self._lr_model = LogisticRegression(**self._lr_params)
|
|
|
|
|
|
self._onehot_encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
|
|
|
@@ -44,36 +79,42 @@ class Trainer:
|
|
|
|
|
|
def train(self):
|
|
|
"""模型训练"""
|
|
|
- print("开始训练GBDT模型...")
|
|
|
- # 训练GBDT模型
|
|
|
- self._gbdt_model.fit(self._train_dataset["data"], self._train_dataset["label"])
|
|
|
+ print("开始训练LightGBM模型...")
|
|
|
+ # 训练LightGBM模型
|
|
|
+ self._lgbm_model.fit(self._train_dataset["data"], self._train_dataset["label"])
|
|
|
|
|
|
- # 获取GBDT的每棵树的分数(决策值)
|
|
|
- gbdt_train_preds = self._gbdt_model.apply(self._train_dataset["data"])[:, :, 0] # 仅取每棵树的叶节点输出
|
|
|
+ # 获取LightGBM的叶节点索引
|
|
|
+ lgbm_train_preds = self._lgbm_model.predict(
|
|
|
+ self._train_dataset["data"],
|
|
|
+ pred_leaf=True
|
|
|
+ )
|
|
|
|
|
|
- gbdt_feats_encoded = self._onehot_encoder.fit_transform(gbdt_train_preds)
|
|
|
+ # 对叶节点索引进行one-hot编码
|
|
|
+ lgbm_feats_encoded = self._onehot_encoder.fit_transform(lgbm_train_preds)
|
|
|
|
|
|
print("开始训练LR模型...")
|
|
|
# 使用决策树输出作为LR的输入特征
|
|
|
- self._lr_model.fit(gbdt_feats_encoded, self._train_dataset["label"])
|
|
|
+ self._lr_model.fit(lgbm_feats_encoded, self._train_dataset["label"])
|
|
|
|
|
|
def predict(self, X):
|
|
|
- # 获取GBDT模型的预测分数
|
|
|
- gbdt_preds = self._gbdt_model.apply(X)[:, :, 0]
|
|
|
+ # 获取LightGBM模型的叶节点索引
|
|
|
+ lgbm_preds = self._lgbm_model.predict(X, pred_leaf=True)
|
|
|
|
|
|
- gbdt_feats_encoded = self._onehot_encoder.transform(gbdt_preds)
|
|
|
+ # 对叶节点索引进行one-hot编码
|
|
|
+ lgbm_feats_encoded = self._onehot_encoder.transform(lgbm_preds)
|
|
|
|
|
|
- # 使用训练好的LR模型输出概率
|
|
|
- return self._lr_model.predict(gbdt_feats_encoded)
|
|
|
+ # 使用训练好的LR模型进行预测
|
|
|
+ return self._lr_model.predict(lgbm_feats_encoded)
|
|
|
|
|
|
def predict_proba(self, X):
|
|
|
- # 获取GBDT模型的预测分数
|
|
|
- gbdt_preds = self._gbdt_model.apply(X)[:, :, 0]
|
|
|
+ # 获取LightGBM模型的叶节点索引
|
|
|
+ lgbm_preds = self._lgbm_model.predict(X, pred_leaf=True)
|
|
|
|
|
|
- gbdt_feats_encoded = self._onehot_encoder.transform(gbdt_preds)
|
|
|
+ # 对叶节点索引进行one-hot编码
|
|
|
+ lgbm_feats_encoded = self._onehot_encoder.transform(lgbm_preds)
|
|
|
|
|
|
# 使用训练好的LR模型输出概率
|
|
|
- return self._lr_model.predict_proba(gbdt_feats_encoded)
|
|
|
+ return self._lr_model.predict_proba(lgbm_feats_encoded)
|
|
|
|
|
|
def evaluate(self):
|
|
|
# 对测试集进行预测
|
|
|
@@ -97,7 +138,7 @@ class Trainer:
|
|
|
|
|
|
def save_model(self, model_path):
|
|
|
"""将模型保存到本地"""
|
|
|
- models = {"gbdt_model": self._gbdt_model, "lr_model": self._lr_model, "onehot_encoder": self._onehot_encoder}
|
|
|
+ models = {"lgbm_model": self._lgbm_model, "lr_model": self._lr_model, "onehot_encoder": self._onehot_encoder}
|
|
|
joblib.dump(models, model_path)
|
|
|
|
|
|
|
|
|
@@ -115,11 +156,10 @@ if __name__ == "__main__":
|
|
|
eval_metrics = trainer.evaluate()
|
|
|
|
|
|
# 输出评估结果
|
|
|
- print("GBDT-LR Evaluation Metrics:")
|
|
|
+ print("LightGBM-LR Evaluation Metrics:")
|
|
|
for metric, value in eval_metrics.items():
|
|
|
print(f"{metric}: {value:.4f}")
|
|
|
|
|
|
# 保存模型
|
|
|
model_path = "./models/rank/weights/model.pkl"
|
|
|
- trainer.save_model(model_path)
|
|
|
-
|
|
|
+ trainer.save_model(model_path)
|