gbdt_lr.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. #!/usr/bin/env python3
  2. # -*- coding:utf-8 -*-
  3. import numpy as np
  4. from models.rank.data import DataLoader
  5. from sklearn.ensemble import GradientBoostingClassifier
  6. from sklearn.linear_model import LogisticRegression
  7. from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
  8. from sklearn.model_selection import GridSearchCV
  9. from sklearn.preprocessing import OneHotEncoder
  10. class Trainer:
  11. def __init__(self, path):
  12. self._load_data(path)
  13. # 初始化GBDT和LR模型参数
  14. self._gbdt_params = {
  15. 'n_estimators': 100,
  16. 'learning_rate': 0.01,
  17. 'max_depth': 6,
  18. 'subsample': 0.8,
  19. 'random_state': 42,
  20. }
  21. self._lr_params = {
  22. "max_iter": 1000,
  23. 'C': 1.0,
  24. 'penalty': 'l2',
  25. 'solver': 'liblinear',
  26. 'random_state': 42,
  27. 'class_weight': 'balanced'
  28. }
  29. # 初始化模型
  30. self._gbdt_model = GradientBoostingClassifier(**self._gbdt_params)
  31. self._lr_model = LogisticRegression(**self._lr_params)
  32. self._onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
  33. def _load_data(self, path):
  34. dataloader = DataLoader(path)
  35. self._train_dataset, self._test_dataset = dataloader.split_dataset()
  36. def train(self):
  37. """模型训练"""
  38. print("开始训练GBDT模型...")
  39. # 训练GBDT模型
  40. self._gbdt_model.fit(self._train_dataset["data"], self._train_dataset["label"])
  41. # 获取GBDT的每棵树的分数(决策值)
  42. gbdt_train_preds = self._gbdt_model.apply(self._train_dataset["data"])[:, :, 0] # 仅取每棵树的叶节点输出
  43. gbdt_feats_encoded = self._onehot_encoder.fit_transform(gbdt_train_preds)
  44. print("开始训练LR模型...")
  45. # 使用决策树输出作为LR的输入特征
  46. self._lr_model.fit(gbdt_feats_encoded, self._train_dataset["label"])
  47. def predict(self, X):
  48. # 获取GBDT模型的预测分数
  49. gbdt_preds = self._gbdt_model.apply(X)[:, :, 0]
  50. gbdt_feats_encoded = self._onehot_encoder.transform(gbdt_preds)
  51. # 使用训练好的LR模型输出概率
  52. return self._lr_model.predict(gbdt_feats_encoded)
  53. def predict_proba(self, X):
  54. # 获取GBDT模型的预测分数
  55. gbdt_preds = self._gbdt_model.apply(X)[:, :, 0]
  56. gbdt_feats_encoded = self._onehot_encoder.transform(gbdt_preds)
  57. # 使用训练好的LR模型输出概率
  58. return self._lr_model.predict_proba(gbdt_feats_encoded)
  59. def evaluate(self):
  60. # 对测试集进行预测
  61. y_pred = self.predict(self._test_dataset["data"])
  62. y_pred_proba = self.predict_proba(self._test_dataset["data"])[:, 1] # 获取正类的概率
  63. # 计算各类评估指标
  64. accuracy = accuracy_score(self._test_dataset["label"], y_pred)
  65. precision = precision_score(self._test_dataset["label"], y_pred)
  66. recall = recall_score(self._test_dataset["label"], y_pred)
  67. f1 = f1_score(self._test_dataset["label"], y_pred)
  68. roc_auc = roc_auc_score(self._test_dataset["label"], y_pred_proba)
  69. return {
  70. 'accuracy': accuracy,
  71. 'precision': precision,
  72. 'recall': recall,
  73. 'f1_score': f1,
  74. 'roc_auc': roc_auc
  75. }
  76. if __name__ == "__main__":
  77. gbdt_data_path = "./models/rank/data/gbdt_data.csv"
  78. trainer = Trainer(gbdt_data_path)
  79. trainer.train()
  80. eval_metrics = trainer.evaluate()
  81. # 输出评估结果
  82. print("GBDT-LR Evaluation Metrics:")
  83. for metric, value in eval_metrics.items():
  84. print(f"{metric}: {value:.4f}")