Pārlūkot izejas kodu

完善GBDT-LR模型

Sherlock 1 gadu atpakaļ
vecāks
revīzija
6072ed8657
3 mainīti faili ar 88 papildinājumiem un 34 dzēšanām
  1. 3 12
      models/rank/data/dataloader.py
  2. 1 1
      models/rank/data/preprocess.py
  3. 84 21
      models/rank/gbdt_lr.py

+ 3 - 12
models/rank/data/dataloader.py

@@ -32,27 +32,18 @@ class DataLoader:
         features = self._gbdt_data.drop("label", axis=1)
         labels = self._gbdt_data["label"]
         
-        # 2. 划分数据集,70%训练集、15%验证集、15%测试集
-        X_train, X_temp, y_train, y_temp = train_test_split(features, labels, test_size=0.3, random_state=42, shuffle=True)
-        X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp,test_size=0.5, random_state=42,shuffle=True)
-        
-        # 获取One-Hot编码列和数值型列
-        
+        # 2. 划分数据集,80%训练集、20%的测试集
+        X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42, shuffle=True)
         
         # 3. 数据标准化(仅对特征进行标准化)
         scaler = StandardScaler()
         X_train[self._numeric_columns] = scaler.fit_transform(X_train[self._numeric_columns])
-        X_val[self._numeric_columns] = scaler.fit_transform(X_val[self._numeric_columns])
         X_test[self._numeric_columns] = scaler.fit_transform(X_test[self._numeric_columns])
         
         train_dataset = {"data": X_train, "label": y_train}
-        val_dataset = {"data": X_val, "label": y_val}
         test_dataset = {"data": X_test, "label": y_test}
         
-        train_data = pd.DataFrame(X_train, columns=self._gbdt_data.drop('label', axis=1).columns)
-        train_data['label'] = y_train
-        
-        return train_dataset, val_dataset, test_dataset
+        return train_dataset, test_dataset
     
 if __name__ == '__main__':
     path = './models/rank/data/gbdt_data.csv'

+ 1 - 1
models/rank/data/preprocess.py

@@ -87,7 +87,7 @@ class DataProcess():
         negative_samples = self._descartes_data[self._descartes_data["label"] == 0]
         
         positive_count = len(positive_samples)
-        negative_count = min(2 * positive_count, len(negative_samples))
+        negative_count = min(1 * positive_count, len(negative_samples))
         print(positive_count)
         print(negative_count)
         

+ 84 - 21
models/rank/gbdt_lr.py

@@ -4,40 +4,103 @@ import numpy as np
 from models.rank.data import DataLoader
 from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
+from sklearn.model_selection import GridSearchCV
+from sklearn.preprocessing import OneHotEncoder
 
 class Trainer:
     def __init__(self, path):
         self._load_data(path)
         
+        # 初始化GBDT和LR模型参数
+        self._gbdt_params = {
+            'n_estimators': 100,
+            'learning_rate': 0.01,
+            'max_depth': 6,
+            'subsample': 0.8,
+            'random_state': 42,
+        }
+        self._lr_params = {
+            "max_iter": 1000,
+            'C': 1.0, 
+            'penalty': 'l2', 
+            'solver': 'liblinear',
+            'random_state': 42,
+            'class_weight': 'balanced'
+        }
+        
+        # 初始化模型
+        self._gbdt_model = GradientBoostingClassifier(**self._gbdt_params)
+        self._lr_model = LogisticRegression(**self._lr_params)
+        
+        self._onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
+        
     def _load_data(self, path):
         dataloader = DataLoader(path)
-        self._train_dataset, self._val_dataset, self._test_dataset = dataloader.split_dataset()
+        self._train_dataset, self._test_dataset = dataloader.split_dataset()
         
-    def train_gbdt(self):
-        self._gbdt_model = GradientBoostingClassifier(
-            n_estimators=100,
-            learning_rate=0.1,
-            max_depth=3,
-            random_state=42,
-        )
-        
-        # 模型训练
+    def train(self):
+        """模型训练"""
+        print("开始训练GBDT模型...")
+        # 训练GBDT模型
         self._gbdt_model.fit(self._train_dataset["data"], self._train_dataset["label"])
         
-    def train_lr(self):
-        gbdt_train_prdes = self._gbdt_model.predict_proba(self._train_dataset["data"])[:, 1] # 获取正类概率
-        gbdt_val_prdes = self._gbdt_model.predict_proba(self._val_dataset["data"])[:, 1]
+        # 获取GBDT的每棵树的分数(决策值)
+        gbdt_train_preds = self._gbdt_model.apply(self._train_dataset["data"])[:, :, 0]  # 仅取每棵树的叶节点输出
+        
+        gbdt_feats_encoded = self._onehot_encoder.fit_transform(gbdt_train_preds)
+        
+        print("开始训练LR模型...")
+        # 使用决策树输出作为LR的输入特征
+        self._lr_model.fit(gbdt_feats_encoded, self._train_dataset["label"])
+        
+    def predict(self, X):
+        # 获取GBDT模型的预测分数
+        gbdt_preds = self._gbdt_model.apply(X)[:, :, 0]
+        
+        gbdt_feats_encoded = self._onehot_encoder.transform(gbdt_preds)
+        
+        # 使用训练好的LR模型输出概率
+        return self._lr_model.predict(gbdt_feats_encoded)
+    
+    def predict_proba(self, X):
+        # 获取GBDT模型的预测分数
+        gbdt_preds = self._gbdt_model.apply(X)[:, :, 0]
+        
+        gbdt_feats_encoded = self._onehot_encoder.transform(gbdt_preds)
+        
+        # 使用训练好的LR模型输出概率
+        return self._lr_model.predict_proba(gbdt_feats_encoded)
         
-        # 将GBDT的预测结果作为额外特征来训练LR
-        lr_train_data = np.column_stack([self._train_dataset["data"], gbdt_train_prdes])
-        lr_val_data = np.column_stack([self._val_dataset["data"], gbdt_val_prdes])
+    def evaluate(self):
+        # 对测试集进行预测
+        y_pred = self.predict(self._test_dataset["data"])
+        y_pred_proba = self.predict_proba(self._test_dataset["data"])[:, 1]  # 获取正类的概率
         
-        # 训练LR模型
-        self.lr_model = LogisticRegression(solver='saga', max_iter=1000)
-        self.lr_model.fit(lr_train_data, self._train_dataset["label"])
+        # 计算各类评估指标
+        accuracy = accuracy_score(self._test_dataset["label"], y_pred)
+        precision = precision_score(self._test_dataset["label"], y_pred)
+        recall = recall_score(self._test_dataset["label"], y_pred)
+        f1 = f1_score(self._test_dataset["label"], y_pred)
+        roc_auc = roc_auc_score(self._test_dataset["label"], y_pred_proba)
         
+        return {
+            'accuracy': accuracy,
+            'precision': precision,
+            'recall': recall,
+            'f1_score': f1,
+            'roc_auc': roc_auc
+        }
+    
+     
 if __name__ == "__main__":
     gbdt_data_path = "./models/rank/data/gbdt_data.csv"
     trainer = Trainer(gbdt_data_path)
-    trainer.train_gbdt()
-    trainer.train_lr()
+    trainer.train()
+    eval_metrics = trainer.evaluate()
+    
+    # 输出评估结果
+    print("GBDT-LR Evaluation Metrics:")
+    for metric, value in eval_metrics.items():
+        print(f"{metric}: {value:.4f}")
+