Procházet zdrojové kódy

更改gbdt模型为lightgbm

yangzeyu před 11 měsíci
rodič
revize
88d15479be

+ 15 - 4
database/dao/mysql_dao.py

@@ -50,7 +50,7 @@ class MySqlDao:
         params = {"city_uuid": city_uuid}
         
         data = self.db_helper.load_data_with_page(query, params)
-        data.drop('stat_month', axis=1, inplace=True)
+        # data.drop('stat_month', axis=1, inplace=True)
         data.drop('city_uuid', axis=1, inplace=True)
         
         return data
@@ -170,6 +170,17 @@ class MySqlDao:
 if __name__ == "__main__":
     dao = MySqlDao()
     city_uuid = "00000000000000000000000011445301"
-    # city_uuid = "00000000000000000000000011441801"
-    cust_id_list = ["441800100006", "441800100051", "441800100811"]
-    cust_list = dao.load_mock_order_data()
+    
+    order_data = dao.load_order_data(city_uuid)
+    order_data["sale_qty"] = order_data["sale_qty"].fillna(0)
+    order_data = order_data.infer_objects(copy=False)
+        
+        # 将销售量进行分组求和
+    order_data = order_data.groupby(["cust_code", "product_code"], as_index=False)["sale_qty"].sum()
+    
+    cust_data = dao.load_cust_data(city_uuid)
+    cust_data = cust_data[["BB_RETAIL_CUSTOMER_CODE", "BB_RETAIL_CUSTOMER_NAME"]]
+    
+    sale_data = order_data.merge(cust_data, left_on='cust_code', right_on='BB_RETAIL_CUSTOMER_CODE', how="inner")
+    
+    sale_data.to_csv("./data/sale.csv", index=False)

+ 4 - 4
inference.py

@@ -83,10 +83,10 @@ def run():
     pass
 
 if __name__ == '__main__':
-    # generate_features_shap("00000000000000000000000011445301", "350139", delivery_count=5000)
+    generate_features_shap("00000000000000000000000011445301", "350139", delivery_count=5000)
     # recommend_list = get_recommend_list("00000000000000000000000011445301", "420202")
     # recommend_list = pd.DataFrame(recommend_list)
     # recommend_list.to_csv("./data/recommend_list.csv", index=False, encoding="utf-8-sig")
-    data = dao.get_order_by_cust("00000000000000000000000011445301", "445381107139")
-    data = data.groupby(["cust_code", "product_code", "product_name"], as_index=False)["sale_qty"].sum()
-    data.to_csv("./data/cust.csv", index=False)
+    # data = dao.get_order_by_cust("00000000000000000000000011445301", "445381107139")
+    # data = data.groupby(["cust_code", "product_code", "product_name"], as_index=False)["sale_qty"].sum()
+    # data.to_csv("./data/cust.csv", index=False)

+ 75 - 35
models/rank/gbdt_lr.py

@@ -2,7 +2,7 @@
 # -*- coding:utf-8 -*-
 import numpy as np
 from models.rank.data import DataLoader
-from sklearn.ensemble import GradientBoostingClassifier
+from lightgbm import LGBMClassifier  # 替换为LightGBM
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
 from sklearn.model_selection import GridSearchCV
@@ -14,26 +14,61 @@ class Trainer:
     def __init__(self, path):
         self._load_data(path)
         
-        # 初始化GBDT和LR模型参数
-        self._gbdt_params = {
-            'n_estimators': 100,
-            'learning_rate': 0.01,
-            'max_depth': 6,
-            'subsample': 0.8,
-            'random_state': 42,
+        # 初始化LightGBM和LR模型参数
+        self._lgbm_params = {
+            # 核心参数
+            'objective': 'binary',          # 二分类任务
+            'boosting_type': 'gbdt',        # 传统GBDT算法
+            # 'metric': ['auc', 'binary_logloss'], # 评估指标
+            
+            # 树结构控制
+            'num_leaves': 31,               # 叶子节点数 (建议20-63)
+            'max_depth': 7,                 # 树深度 (3-7)
+            'min_child_samples': 30,        # 叶子节点最小样本数 (20-100)
+            'min_split_gain': 0.02,         # 分裂最小增益 (0.01-0.1)
+            
+            # 正则化
+            'lambda_l1': 0.1,               # L1正则 (0-10)
+            'lambda_l2': 0.2,               # L2正则 (0-10)
+            'feature_fraction': 0.8,        # 特征采样比例 (0.7-1.0)
+            'bagging_fraction': 0.9,        # 数据采样比例 (0.8-1.0)
+            'bagging_freq': 5,              # 每5次迭代执行bagging
+            
+            # 学习控制
+            'learning_rate': 0.05,          # 学习率 (0.01-0.1)
+            'n_estimators': 1000,           # number of trees; early stopping is NOT active (commented out below), so all 1000 are trained
+            # 'early_stopping_rounds': 50,    # 早停轮数
+            
+            # 类别特征处理
+            # 'categorical_feature': 'auto',  # 自动检测类别特征
+            # 'max_cat_to_onehot': 5,         # 类别值>5时不做one-hot
+            
+            # 系统
+            'n_jobs': -1,                   # 使用所有CPU
+            'random_state': 42,             # 随机种子
+            'verbose': -1                   # 不输出日志
         }
         self._lr_params = {
-            "max_iter": 1000,
-            'C': 1.0, 
-            'penalty': 'elasticnet', 
-            'l1_ratio': 0.8,  # 添加 l1_ratio 参数,可以根据需要调整
-            'solver': 'saga',
+            # 求解器
+            'penalty': 'elasticnet',        # 弹性网络正则
+            'solver': 'saga',               # 支持elasticnet
+            'max_iter': 1000,               # 迭代次数
+            
+            # 正则化
+            'C': 0.3,                       # 逆正则强度 (0.1-1.0)
+            'l1_ratio': 0.7,                # L1权重 (0.5-0.9)
+            
+            # 类别平衡
+            'class_weight': 'balanced',     # 自动平衡类别权重
+            
+            # 系统
             'random_state': 42,
-            'class_weight': 'balanced'
+            'n_jobs': -1,                   # 并行计算
+            'tol': 1e-4                     # 早停阈值
         }
         
         # 初始化模型
-        self._gbdt_model = GradientBoostingClassifier(**self._gbdt_params)
+        self._lgbm_model = LGBMClassifier(**self._lgbm_params)
         self._lr_model = LogisticRegression(**self._lr_params)
         
         self._onehot_encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
@@ -44,36 +79,42 @@ class Trainer:
         
     def train(self):
         """模型训练"""
-        print("开始训练GBDT模型...")
-        # 训练GBDT模型
-        self._gbdt_model.fit(self._train_dataset["data"], self._train_dataset["label"])
+        print("开始训练LightGBM模型...")
+        # 训练LightGBM模型
+        self._lgbm_model.fit(self._train_dataset["data"], self._train_dataset["label"])
         
-        # 获取GBDT的每棵树的分数(决策值)
-        gbdt_train_preds = self._gbdt_model.apply(self._train_dataset["data"])[:, :, 0]  # 仅取每棵树的叶节点输出
+        # 获取LightGBM的叶节点索引
+        lgbm_train_preds = self._lgbm_model.predict(
+            self._train_dataset["data"], 
+            pred_leaf=True
+        )
         
-        gbdt_feats_encoded = self._onehot_encoder.fit_transform(gbdt_train_preds)
+        # 对叶节点索引进行one-hot编码
+        lgbm_feats_encoded = self._onehot_encoder.fit_transform(lgbm_train_preds)
         
         print("开始训练LR模型...")
         # 使用决策树输出作为LR的输入特征
-        self._lr_model.fit(gbdt_feats_encoded, self._train_dataset["label"])
+        self._lr_model.fit(lgbm_feats_encoded, self._train_dataset["label"])
         
     def predict(self, X):
-        # 获取GBDT模型的预测分数
-        gbdt_preds = self._gbdt_model.apply(X)[:, :, 0]
+        # 获取LightGBM模型的叶节点索引
+        lgbm_preds = self._lgbm_model.predict(X, pred_leaf=True)
         
-        gbdt_feats_encoded = self._onehot_encoder.transform(gbdt_preds)
+        # 对叶节点索引进行one-hot编码
+        lgbm_feats_encoded = self._onehot_encoder.transform(lgbm_preds)
         
-        # 使用训练好的LR模型输出概率
-        return self._lr_model.predict(gbdt_feats_encoded)
+        # 使用训练好的LR模型进行预测
+        return self._lr_model.predict(lgbm_feats_encoded)
     
     def predict_proba(self, X):
-        # 获取GBDT模型的预测分数
-        gbdt_preds = self._gbdt_model.apply(X)[:, :, 0]
+        # 获取LightGBM模型的叶节点索引
+        lgbm_preds = self._lgbm_model.predict(X, pred_leaf=True)
         
-        gbdt_feats_encoded = self._onehot_encoder.transform(gbdt_preds)
+        # 对叶节点索引进行one-hot编码
+        lgbm_feats_encoded = self._onehot_encoder.transform(lgbm_preds)
         
         # 使用训练好的LR模型输出概率
-        return self._lr_model.predict_proba(gbdt_feats_encoded)
+        return self._lr_model.predict_proba(lgbm_feats_encoded)
         
     def evaluate(self):
         # 对测试集进行预测
@@ -97,7 +138,7 @@ class Trainer:
         
     def save_model(self, model_path):
         """将模型保存到本地"""
-        models = {"gbdt_model": self._gbdt_model, "lr_model": self._lr_model, "onehot_encoder": self._onehot_encoder}
+        models = {"lgbm_model": self._lgbm_model, "lr_model": self._lr_model, "onehot_encoder": self._onehot_encoder}
         joblib.dump(models, model_path)
     
      
@@ -115,11 +156,10 @@ if __name__ == "__main__":
     eval_metrics = trainer.evaluate()
     
     # 输出评估结果
-    print("GBDT-LR Evaluation Metrics:")
+    print("LightGBM-LR Evaluation Metrics:")
     for metric, value in eval_metrics.items():
         print(f"{metric}: {value:.4f}")
         
     # 保存模型
     model_path = "./models/rank/weights/model.pkl"
-    trainer.save_model(model_path)
-    
+    trainer.save_model(model_path)

+ 7 - 3
models/rank/gbdt_lr_inference.py

@@ -23,7 +23,7 @@ class GbdtLrModel:
     
     def load_model(self, model_path):
         models = joblib.load(model_path)
-        self.gbdt_model, self.lr_model, self.onehot_encoder = models["gbdt_model"], models["lr_model"], models["onehot_encoder"]
+        self.gbdt_model, self.lr_model, self.onehot_encoder = models["lgbm_model"], models["lr_model"], models["onehot_encoder"]
         
     def get_cust_and_product_data(self, city_uuid, product_id):
         """从商户数据库中获取指定城市所有商户的id"""
@@ -54,7 +54,11 @@ class GbdtLrModel:
         return feats_map
     
     def get_recommend_list(self, recommend_sample, recall_list):
-        gbdt_preds = self.gbdt_model.apply(recommend_sample)[:, :, 0]
+        # gbdt_preds = self.gbdt_model.apply(recommend_sample)[:, :, 0]
+        # gbdt_feats_encoded = self.onehot_encoder.transform(gbdt_preds)
+        # scores = self.lr_model.predict_proba(gbdt_feats_encoded)[:, 1]
+        
+        gbdt_preds = self.gbdt_model.predict(recommend_sample, pred_leaf=True)
         gbdt_feats_encoded = self.onehot_encoder.transform(gbdt_preds)
         scores = self.lr_model.predict_proba(gbdt_feats_encoded)[:, 1]
         
@@ -75,7 +79,7 @@ class GbdtLrModel:
         # 获取GBDT模型的特征重要性
         feats_importance = self.gbdt_model.feature_importances_
         # 获取特征名称
-        feats_names = self.gbdt_model.feature_names_in_
+        feats_names = self.gbdt_model.feature_name_
         importance_dict = dict(zip(feats_names, feats_importance))
         
         onehot_feats = {**CustConfig.ONEHOT_CAT, **ShopConfig.ONEHOT_CAT, **ProductConfig.ONEHOT_CAT}

+ 29 - 0
test.py

@@ -0,0 +1,29 @@
+from database.dao.mysql_dao import MySqlDao
+from models.rank.data.config import ProductConfig, ImportanceFeaturesMap
+from models.rank.data.utils import sample_data_clear
+
+
+dao = MySqlDao()
+city_uuid = "00000000000000000000000011445301"
+    
+order_data = dao.load_order_data(city_uuid)
+order_data["sale_qty"] = order_data["sale_qty"].fillna(0)
+print(order_data.columns.to_list())
+order_data = order_data.infer_objects(copy=False)
+        
+# 将销售量进行分组求和
+order_data = order_data.groupby(["stat_month", "cust_code", "product_code"], as_index=False)["sale_qty"].sum()
+    
+cust_data = dao.load_cust_data(city_uuid)
+cust_data = cust_data[["BB_RETAIL_CUSTOMER_CODE", "BB_RETAIL_CUSTOMER_NAME"]]
+
+product_data = dao.load_product_data(city_uuid)
+product_data = product_data[ProductConfig.FEATURE_COLUMNS]
+product_data = sample_data_clear(product_data, ProductConfig)
+
+
+sale_data = order_data.merge(cust_data, left_on='cust_code', right_on='BB_RETAIL_CUSTOMER_CODE', how="inner")
+sale_data = sale_data.merge(product_data, left_on='product_code', right_on='product_code', how="inner")
+sale_data = sale_data[["cust_code", "BB_RETAIL_CUSTOMER_NAME"] + ProductConfig.FEATURE_COLUMNS + ["sale_qty", "stat_month"]]
+sale_data = sale_data.rename(columns=ImportanceFeaturesMap.PRODUCT_FEATRUES_MAP)
+sale_data.to_csv("./data/sale_month.csv", index=False)

+ 1 - 1
utils/result_process.py

@@ -110,5 +110,5 @@ def get_cust_list_from_history_order(city_uuid, product_code):
     return merge_data
         
 if __name__ == "__main__":
-    order_data = get_cust_list_from_history_order("00000000000000000000000011445301", "420202")
+    order_data = get_cust_list_from_history_order("00000000000000000000000011445301", "350355")
     order_data.to_csv("./data/eval.csv", index=False)