|
|
@@ -4,40 +4,103 @@ import numpy as np
|
|
|
from models.rank.data import DataLoader
|
|
|
from sklearn.ensemble import GradientBoostingClassifier
|
|
|
from sklearn.linear_model import LogisticRegression
|
|
|
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
|
|
|
+from sklearn.model_selection import GridSearchCV
|
|
|
+from sklearn.preprocessing import OneHotEncoder
|
|
|
|
|
|
class Trainer:
|
|
|
def __init__(self, path):
|
|
|
self._load_data(path)
|
|
|
|
|
|
+ # 初始化GBDT和LR模型参数
|
|
|
+ self._gbdt_params = {
|
|
|
+ 'n_estimators': 100,
|
|
|
+ 'learning_rate': 0.01,
|
|
|
+ 'max_depth': 6,
|
|
|
+ 'subsample': 0.8,
|
|
|
+ 'random_state': 42,
|
|
|
+ }
|
|
|
+ self._lr_params = {
|
|
|
+ "max_iter": 1000,
|
|
|
+ 'C': 1.0,
|
|
|
+ 'penalty': 'l2',
|
|
|
+ 'solver': 'liblinear',
|
|
|
+ 'random_state': 42,
|
|
|
+ 'class_weight': 'balanced'
|
|
|
+ }
|
|
|
+
|
|
|
+ # 初始化模型
|
|
|
+ self._gbdt_model = GradientBoostingClassifier(**self._gbdt_params)
|
|
|
+ self._lr_model = LogisticRegression(**self._lr_params)
|
|
|
+
|
|
|
+ self._onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
|
|
|
+
|
|
|
def _load_data(self, path):
|
|
|
dataloader = DataLoader(path)
|
|
|
- self._train_dataset, self._val_dataset, self._test_dataset = dataloader.split_dataset()
|
|
|
+ self._train_dataset, self._test_dataset = dataloader.split_dataset()
|
|
|
|
|
|
- def train_gbdt(self):
|
|
|
- self._gbdt_model = GradientBoostingClassifier(
|
|
|
- n_estimators=100,
|
|
|
- learning_rate=0.1,
|
|
|
- max_depth=3,
|
|
|
- random_state=42,
|
|
|
- )
|
|
|
-
|
|
|
- # 模型训练
|
|
|
+ def train(self):
|
|
|
+ """模型训练"""
|
|
|
+ print("开始训练GBDT模型...")
|
|
|
+ # 训练GBDT模型
|
|
|
self._gbdt_model.fit(self._train_dataset["data"], self._train_dataset["label"])
|
|
|
|
|
|
- def train_lr(self):
|
|
|
- gbdt_train_prdes = self._gbdt_model.predict_proba(self._train_dataset["data"])[:, 1] # 获取正类概率
|
|
|
- gbdt_val_prdes = self._gbdt_model.predict_proba(self._val_dataset["data"])[:, 1]
|
|
|
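+ # For every tree, get the index of the leaf each training sample falls into (apply() returns leaf indices, not scores/decision values)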
|
|
|
+ gbdt_train_preds = self._gbdt_model.apply(self._train_dataset["data"])[:, :, 0] # 仅取每棵树的叶节点输出
|
|
|
+
|
|
|
+ gbdt_feats_encoded = self._onehot_encoder.fit_transform(gbdt_train_preds)
|
|
|
+
|
|
|
+ print("开始训练LR模型...")
|
|
|
+ # 使用决策树输出作为LR的输入特征
|
|
|
+ self._lr_model.fit(gbdt_feats_encoded, self._train_dataset["label"])
|
|
|
+
|
|
|
+ def predict(self, X):
|
|
|
+ # Get the per-tree leaf indices for X from the GBDT model (apply() returns leaf indices, not prediction scores)
|
|
|
+ gbdt_preds = self._gbdt_model.apply(X)[:, :, 0]
|
|
|
+
|
|
|
+ gbdt_feats_encoded = self._onehot_encoder.transform(gbdt_preds)
|
|
|
+
|
|
|
+ # Use the trained LR model to output predicted class labels (predict_proba returns the probabilities)
|
|
|
+ return self._lr_model.predict(gbdt_feats_encoded)
|
|
|
+
|
|
|
+ def predict_proba(self, X):
|
|
|
+ # Get the per-tree leaf indices for X from the GBDT model (apply() returns leaf indices, not prediction scores)
|
|
|
+ gbdt_preds = self._gbdt_model.apply(X)[:, :, 0]
|
|
|
+
|
|
|
+ gbdt_feats_encoded = self._onehot_encoder.transform(gbdt_preds)
|
|
|
+
|
|
|
+ # 使用训练好的LR模型输出概率
|
|
|
+ return self._lr_model.predict_proba(gbdt_feats_encoded)
|
|
|
|
|
|
- # 将GBDT的预测结果作为额外特征来训练LR
|
|
|
- lr_train_data = np.column_stack([self._train_dataset["data"], gbdt_train_prdes])
|
|
|
- lr_val_data = np.column_stack([self._val_dataset["data"], gbdt_val_prdes])
|
|
|
+ def evaluate(self):
|
|
|
+ # 对测试集进行预测
|
|
|
+ y_pred = self.predict(self._test_dataset["data"])
|
|
|
+ y_pred_proba = self.predict_proba(self._test_dataset["data"])[:, 1] # 获取正类的概率
|
|
|
|
|
|
- # 训练LR模型
|
|
|
- self.lr_model = LogisticRegression(solver='saga', max_iter=1000)
|
|
|
- self.lr_model.fit(lr_train_data, self._train_dataset["label"])
|
|
|
+ # 计算各类评估指标
|
|
|
+ accuracy = accuracy_score(self._test_dataset["label"], y_pred)
|
|
|
+ precision = precision_score(self._test_dataset["label"], y_pred)
|
|
|
+ recall = recall_score(self._test_dataset["label"], y_pred)
|
|
|
+ f1 = f1_score(self._test_dataset["label"], y_pred)
|
|
|
+ roc_auc = roc_auc_score(self._test_dataset["label"], y_pred_proba)
|
|
|
|
|
|
+ return {
|
|
|
+ 'accuracy': accuracy,
|
|
|
+ 'precision': precision,
|
|
|
+ 'recall': recall,
|
|
|
+ 'f1_score': f1,
|
|
|
+ 'roc_auc': roc_auc
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
|
|
|
gbdt_data_path = "./models/rank/data/gbdt_data.csv"
|
|
|
trainer = Trainer(gbdt_data_path)
|
|
|
- trainer.train_gbdt()
|
|
|
- trainer.train_lr()
|
|
|
+ trainer.train()
|
|
|
+ eval_metrics = trainer.evaluate()
|
|
|
+
|
|
|
+ # 输出评估结果
|
|
|
+ print("GBDT-LR Evaluation Metrics:")
|
|
|
+ for metric, value in eval_metrics.items():
|
|
|
+ print(f"{metric}: {value:.4f}")
|
|
|
+
|