# gbdt_lr.py
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
import numpy as np
from models.rank.data import DataLoader
from lightgbm import LGBMClassifier  # LightGBM replaces the sklearn GBDT
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
import joblib
import time
  12. class Trainer:
  13. def __init__(self, path):
  14. self._load_data(path)
  15. # 初始化LightGBM和LR模型参数
  16. self._lgbm_params = {
  17. # 核心参数
  18. 'objective': 'binary', # 二分类任务
  19. 'boosting_type': 'gbdt', # 传统GBDT算法
  20. # 'metric': ['auc', 'binary_logloss'], # 评估指标
  21. # 树结构控制
  22. 'num_leaves': 31, # 叶子节点数 (建议20-63)
  23. 'max_depth': 7, # 树深度 (3-7)
  24. 'min_child_samples': 30, # 叶子节点最小样本数 (20-100)
  25. 'min_split_gain': 0.02, # 分裂最小增益 (0.01-0.1)
  26. # 正则化
  27. 'lambda_l1': 0.1, # L1正则 (0-10)
  28. 'lambda_l2': 0.2, # L2正则 (0-10)
  29. 'feature_fraction': 0.8, # 特征采样比例 (0.7-1.0)
  30. 'bagging_fraction': 0.9, # 数据采样比例 (0.8-1.0)
  31. 'bagging_freq': 5, # 每5次迭代执行bagging
  32. # 学习控制
  33. 'learning_rate': 0.05, # 学习率 (0.01-0.1)
  34. 'n_estimators': 1000, # 树的数量 (配合早停)
  35. # 'early_stopping_rounds': 50, # 早停轮数
  36. # 类别特征处理
  37. # 'categorical_feature': 'auto', # 自动检测类别特征
  38. # 'max_cat_to_onehot': 5, # 类别值>5时不做one-hot
  39. # 系统
  40. 'n_jobs': -1, # 使用所有CPU
  41. 'random_state': 42, # 随机种子
  42. 'verbose': -1 # 不输出日志
  43. }
  44. self._lr_params = {
  45. # 求解器
  46. 'penalty': 'elasticnet', # 弹性网络正则
  47. 'solver': 'saga', # 支持elasticnet
  48. 'max_iter': 1000, # 迭代次数
  49. # 正则化
  50. 'C': 0.3, # 逆正则强度 (0.1-1.0)
  51. 'l1_ratio': 0.7, # L1权重 (0.5-0.9)
  52. # 类别平衡
  53. 'class_weight': 'balanced', # 自动平衡类别权重
  54. # 系统
  55. 'random_state': 42,
  56. 'n_jobs': -1, # 并行计算
  57. 'tol': 1e-4 # 早停阈值
  58. }
  59. # 初始化模型
  60. self._lgbm_model = LGBMClassifier(**self._lgbm_params)
  61. self._lr_model = LogisticRegression(**self._lr_params)
  62. self._onehot_encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
  63. def _load_data(self, path):
  64. dataloader = DataLoader(path)
  65. self._train_dataset, self._test_dataset = dataloader.split_dataset()
  66. def train(self):
  67. """模型训练"""
  68. print("开始训练LightGBM模型...")
  69. # 训练LightGBM模型
  70. self._lgbm_model.fit(self._train_dataset["data"], self._train_dataset["label"])
  71. # 获取LightGBM的叶节点索引
  72. lgbm_train_preds = self._lgbm_model.predict(
  73. self._train_dataset["data"],
  74. pred_leaf=True
  75. )
  76. # 对叶节点索引进行one-hot编码
  77. lgbm_feats_encoded = self._onehot_encoder.fit_transform(lgbm_train_preds)
  78. print("开始训练LR模型...")
  79. # 使用决策树输出作为LR的输入特征
  80. self._lr_model.fit(lgbm_feats_encoded, self._train_dataset["label"])
  81. def predict(self, X):
  82. # 获取LightGBM模型的叶节点索引
  83. lgbm_preds = self._lgbm_model.predict(X, pred_leaf=True)
  84. # 对叶节点索引进行one-hot编码
  85. lgbm_feats_encoded = self._onehot_encoder.transform(lgbm_preds)
  86. # 使用训练好的LR模型进行预测
  87. return self._lr_model.predict(lgbm_feats_encoded)
  88. def predict_proba(self, X):
  89. # 获取LightGBM模型的叶节点索引
  90. lgbm_preds = self._lgbm_model.predict(X, pred_leaf=True)
  91. # 对叶节点索引进行one-hot编码
  92. lgbm_feats_encoded = self._onehot_encoder.transform(lgbm_preds)
  93. # 使用训练好的LR模型输出概率
  94. return self._lr_model.predict_proba(lgbm_feats_encoded)
  95. def evaluate(self):
  96. # 对测试集进行预测
  97. y_pred = self.predict(self._test_dataset["data"])
  98. y_pred_proba = self.predict_proba(self._test_dataset["data"])[:, 1] # 获取正类的概率
  99. # 计算各类评估指标
  100. accuracy = accuracy_score(self._test_dataset["label"], y_pred)
  101. precision = precision_score(self._test_dataset["label"], y_pred)
  102. recall = recall_score(self._test_dataset["label"], y_pred)
  103. f1 = f1_score(self._test_dataset["label"], y_pred)
  104. roc_auc = roc_auc_score(self._test_dataset["label"], y_pred_proba)
  105. return {
  106. 'accuracy': accuracy,
  107. 'precision': precision,
  108. 'recall': recall,
  109. 'f1_score': f1,
  110. 'roc_auc': roc_auc
  111. }
  112. def save_model(self, model_path):
  113. """将模型保存到本地"""
  114. models = {"lgbm_model": self._lgbm_model, "lr_model": self._lr_model, "onehot_encoder": self._onehot_encoder}
  115. joblib.dump(models, model_path)
  116. if __name__ == "__main__":
  117. gbdt_data_path = "./data/train_data.csv"
  118. trainer = Trainer(gbdt_data_path)
  119. start_time = time.time()
  120. trainer.train()
  121. end_time = time.time()
  122. training_time_hours = (end_time - start_time) / 3600
  123. print(f"训练时间: {training_time_hours:.4f} 小时")
  124. eval_metrics = trainer.evaluate()
  125. # 输出评估结果
  126. print("LightGBM-LR Evaluation Metrics:")
  127. for metric, value in eval_metrics.items():
  128. print(f"{metric}: {value:.4f}")
  129. # 保存模型
  130. model_path = "./models/rank/weights/model.pkl"
  131. trainer.save_model(model_path)