#!/usr/bin/env python3
# -*- coding:utf-8 -*-
"""Train, evaluate and persist a stacked GBDT + logistic-regression model.

A gradient-boosted tree ensemble is fitted first; the index of the leaf each
sample reaches in every tree is then one-hot encoded and used as the input
features of a logistic-regression classifier.
"""
import os
import time

import joblib
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder

from models.rank.data import DataLoader


class Trainer:
    """Fit, evaluate and save the GBDT -> one-hot -> LR pipeline."""

    def __init__(self, path):
        """Load the dataset found at ``path`` and build the unfitted models.

        Args:
            path: Location handed to ``DataLoader``.
        """
        self._load_data(path)

        # Hyper-parameters of the GBDT and LR stages.
        self._gbdt_params = {
            'n_estimators': 100,
            'learning_rate': 0.01,
            'max_depth': 6,
            'subsample': 0.8,
            'random_state': 42,
        }
        self._lr_params = {
            "max_iter": 1000,
            'C': 1.0,
            'penalty': 'elasticnet',
            'l1_ratio': 0.8,  # elastic-net mixing ratio; tune as needed
            'solver': 'saga',
            'random_state': 42,
            'class_weight': 'balanced',
        }

        # Unfitted estimators; all three are fitted in train().
        self._gbdt_model = GradientBoostingClassifier(**self._gbdt_params)
        self._lr_model = LogisticRegression(**self._lr_params)
        self._onehot_encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')

    def _load_data(self, path):
        """Split the data at ``path`` into the train and test datasets.

        Both datasets are later indexed with the keys ``"data"`` and
        ``"label"``.
        """
        dataloader = DataLoader(path)
        self._train_dataset, self._test_dataset = dataloader.split_dataset()

    def _leaf_features(self, X, fit=False):
        """One-hot encode the leaf index every GBDT tree assigns to ``X``.

        Args:
            X: Feature matrix accepted by the fitted GBDT model.
            fit: When true, (re)fit the one-hot encoder on these leaf indices
                before transforming; otherwise reuse the fitted encoder.

        Returns:
            The one-hot encoded leaf indices (sparse, since the encoder is
            built with ``sparse_output=True``).
        """
        # Per the scikit-learn docs, apply() returns an array of shape
        # (n_samples, n_estimators, n_classes); only the first slice of the
        # last axis is kept.
        # NOTE(review): this assumes a binary target, where that axis has
        # length 1 -- confirm against the dataset.
        leaves = self._gbdt_model.apply(X)[:, :, 0]
        if fit:
            return self._onehot_encoder.fit_transform(leaves)
        return self._onehot_encoder.transform(leaves)

    def train(self):
        """Fit the GBDT, then fit the LR on the encoded GBDT leaf indices."""
        print("开始训练GBDT模型...")
        self._gbdt_model.fit(self._train_dataset["data"], self._train_dataset["label"])

        # The encoder is fitted here, on the training leaves only.
        gbdt_feats_encoded = self._leaf_features(self._train_dataset["data"], fit=True)

        print("开始训练LR模型...")
        self._lr_model.fit(gbdt_feats_encoded, self._train_dataset["label"])

    def predict(self, X):
        """Return the class labels the LR stage predicts for ``X``."""
        return self._lr_model.predict(self._leaf_features(X))

    def predict_proba(self, X):
        """Return the class probabilities the LR stage predicts for ``X``."""
        return self._lr_model.predict_proba(self._leaf_features(X))

    def evaluate(self):
        """Score the fitted pipeline on the test dataset.

        Returns:
            A dict with the keys ``accuracy``, ``precision``, ``recall``,
            ``f1_score`` and ``roc_auc``.
        """
        y_true = self._test_dataset["label"]

        # Encode the test set once and reuse it for labels and probabilities
        # instead of running the GBDT leaf lookup twice.
        features = self._leaf_features(self._test_dataset["data"])
        y_pred = self._lr_model.predict(features)
        # Column 1 is used as the positive-class probability.
        y_pred_proba = self._lr_model.predict_proba(features)[:, 1]

        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred),
            'f1_score': f1_score(y_true, y_pred),
            'roc_auc': roc_auc_score(y_true, y_pred_proba),
        }

    def save_model(self, model_path):
        """Dump the GBDT, the LR and the one-hot encoder to ``model_path``.

        The parent directory is created when ``model_path`` is a filesystem
        path and that directory does not exist yet.
        """
        models = {
            "gbdt_model": self._gbdt_model,
            "lr_model": self._lr_model,
            "onehot_encoder": self._onehot_encoder,
        }
        # Only path-like targets have a directory to prepare; anything else
        # (e.g. an open file object) is passed to joblib untouched.
        if isinstance(model_path, (str, os.PathLike)):
            target_dir = os.path.dirname(os.fspath(model_path))
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
        joblib.dump(models, model_path)


if __name__ == "__main__":
    gbdt_data_path = "./models/rank/data/gbdt_data.csv"
    trainer = Trainer(gbdt_data_path)

    start_time = time.time()
    trainer.train()
    end_time = time.time()

    # Wall-clock training time, reported in hours.
    training_time_hours = (end_time - start_time) / 3600
    print(f"训练时间: {training_time_hours:.4f} 小时")

    eval_metrics = trainer.evaluate()

    # Print the evaluation results.
    print("GBDT-LR Evaluation Metrics:")
    for metric, value in eval_metrics.items():
        print(f"{metric}: {value:.4f}")

    # Persist the fitted models.
    model_path = "./models/rank/weights/model.pkl"
    trainer.save_model(model_path)