Преглед изворни кода

更新gbdt-lr打标签方式

Sherlock пре 1 година
родитељ
комит
d18f3b9932

+ 7 - 7
dao/dao.py

@@ -3,18 +3,18 @@ from dao import Mysql
 def load_order_data_from_mysql(city_uuid):
     """从数据库中读取订单数据"""
     client = Mysql()
-    tablename = "yunfu_mock_data"
+    tablename = "tads_brandcul_cust_order"
     query_text = "*"
-    
-    # df = client.load_data(tablename, query_text, "city_uuid", city_uuid)
-    df = client.load_mock_data(tablename, query_text)
+    city_uuid = "00000000000000000000000011441801"
+    df = client.load_data(tablename, query_text, "city_uuid", city_uuid)
+    # df = client.load_mock_data(tablename, query_text)
     if len(df) == 0:
         return None
     
-    # df.drop('stat_month', axis=1, inplace=True)
-    # df.drop('city_uuid', axis=1, inplace=True)
+    df.drop('stat_month', axis=1, inplace=True)
+    df.drop('city_uuid', axis=1, inplace=True)
     
-     # 去除重复值和填补缺失值
+    # 去除重复值和填补缺失值
     df.drop_duplicates(inplace=True)
     df.fillna(0, inplace=True)
     return df

+ 1 - 1
dao/mysql_client.py

@@ -95,7 +95,7 @@ class Mysql(object):
         
         query = text(f"""
             SELECT * 
-            FROM tads_brandcul_cust_info_bak
+            FROM tads_brandcul_cust_info
             WHERE BA_CITY_ORG_CODE = :city_uuid 
             AND BB_RETAIL_CUSTOMER_CODE IN ({cust_id_str})
         """)

+ 2 - 1
models/rank/data/__init__.py

@@ -1,9 +1,10 @@
-from models.rank.data.config import CustConfig, ProductConfig
+from models.rank.data.config import CustConfig, ProductConfig, OrderConfig
 from models.rank.data.dataloader import DataLoader
 from models.rank.data.utils import one_hot_embedding, sample_data_clear
 __all__ = [
     "CustConfig",
     "ProductConfig",
+    "OrderConfig",
     "DataLoader",
     "one_hot_embedding",
     "sample_data_clear"

+ 44 - 7
models/rank/data/config.py

@@ -11,12 +11,14 @@ class CustConfig:
         "PRESENT_STAR_TERMINAL",                       # 终端星级
         "MD04_MG_RTL_CUST_CREDITCLASS_NAME",           # 零售户信用等级名称
         "MD04_DIR_SAL_STORE_FLAG",                     # 直营店标识
-        "NEW_PRODUCT_MEMBERS_QTY_SAMEPRICE_OCC",       # 新品订货量占同价类比重
-        "PRODUCT_LISTING_RATE",                        # 品规上架率
+        "BB_CUSTOMER_MANAGER_SCOPE_NAME",              # 零售户经营范围名称
+        "PRODUCT_INSALE_QTY",                          # 在销品规数
+        
+        # "NEW_PRODUCT_MEMBERS_QTY_SAMEPRICE_OCC",       # 新品订货量占同价类比重
+        # "PRODUCT_LISTING_RATE",                        # 品规上架率
         # "STOCKOUT_DAYS",                              # 断货天数
         # "YLT_TURNOVER_RATE",                           # 易灵通动销率
         # "YLT_BAR_PACKAGE_SALE_OCC",                    # 易灵通条包销售占比
-        # "PRODUCT_INSALE_QTY",                          # 在销品规数
         # "UNPACKING_RATE",                              # 拆包率
         
         
@@ -28,7 +30,6 @@ class CustConfig:
         # "BB_RTL_CUST_TERMINALEVEL_NAME",               # 零售户终端层级细分名称
         # "MD04_MG_SAMPLE_CUST_FLAG",                    # 样本户标识
         # "MD07_RTL_CUST_IS_SALE_LARGE_FLAG",            # 零售户大户标识
-        # "BB_CUSTOMER_MANAGER_SCOPE_NAME",              # 零售户经营范围名称
         # "BB_RTL_CUST_OPERATE_METHOD_NAME",             # 零售户经营方式名称
         # "BB_RTL_CUST_CGT_OPERATE_SCOPE_NAME",          # 零售户卷烟经营规模名称
         
@@ -45,10 +46,14 @@ class CustConfig:
         "OPERATOR_AGE":                             {"method": "fillna", "opt": "mean", "type": "num"},
         "BB_RTL_CUST_CHAIN_FLAG":                   {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
         "PRESENT_STAR_TERMINAL":                    {"method": "fillna", "opt": "fill", "value": "非星级", "type": "str"},
-        "MD04_MG_RTL_CUST_CREDITCLASS_NAME":        {"method": "fillna", "opt": "fill", "value": "D", "type": "str"},
+        "MD04_MG_RTL_CUST_CREDITCLASS_NAME":        {"method": "fillna", "opt": "fill", "value": "B", "type": "str"},
         "MD04_DIR_SAL_STORE_FLAG":                  {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
-        "NEW_PRODUCT_MEMBERS_QTY_SAMEPRICE_OCC":    {"method": "fillna", "opt": "mean", "type": "num"},
-        "PRODUCT_LISTING_RATE":                     {"method": "fillna", "opt": "mean", "type": "num"},
+        "BB_CUSTOMER_MANAGER_SCOPE_NAME":           {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
+        "PRODUCT_INSALE_QTY":                       {"method": "fillna", "opt": "mean", "type": "num"},
+        
+        
+        # "NEW_PRODUCT_MEMBERS_QTY_SAMEPRICE_OCC":    {"method": "fillna", "opt": "mean", "type": "num"},
+        # "PRODUCT_LISTING_RATE":                     {"method": "fillna", "opt": "mean", "type": "num"},
         # "STOCKOUT_DAYS":                            {"method": "fillna", "opt": "mean", "type": "num"},
         # "YLT_TURNOVER_RATE":                        {"method": "fillna", "opt": "mean", "type": "num"},
         # "NEW_PRODUCT_MEMBERS_QTY":                  {"method": "fillna", "opt": "mean", "type": "num"},
@@ -77,6 +82,7 @@ class CustConfig:
         "PRESENT_STAR_TERMINAL":                    ["一星", "二星", "三星", "四星", "五星", "非星级"],
         "MD04_MG_RTL_CUST_CREDITCLASS_NAME":        ["AAA", "AA", "A", "B", "C", "D"],
         "MD04_DIR_SAL_STORE_FLAG":                  ["是", "否"],
+        "BB_CUSTOMER_MANAGER_SCOPE_NAME":           ["是", "否"],
         
         
         
@@ -139,6 +145,7 @@ class ProductConfig:
         "is_exploding_beads":                          {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
         "is_shortbranch":                              {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
         "is_medium":                                   {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "is_tiny":                                     {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
         "product_style_code_name":                     {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
         "org_is_abnormity":                            {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
         "is_chuangxin":                                {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
@@ -181,6 +188,7 @@ class ProductConfig:
         "is_exploding_beads":                          ["是", "否"],
         "is_shortbranch":                              ["是", "否"],
         "is_medium":                                   ["是", "否"],
+        "is_tiny":                                     ["是", "否"],
         "product_style_code_name":                     ["条盒硬盒", "条包硬盒", "条盒软盒", "条包软盒", "铁盒", "其他"],
         "org_is_abnormity":                            ["是", "否"],
         "is_chuangxin":                                ["是", "否"],
@@ -207,4 +215,33 @@ class ProductConfig:
         # "source_type":                                 ["是", "否"],
         # "chinese_mix":                                 ["是", "否"],
         # "sub_price_type_name":                         ["高端烟", "高价位烟", "普一类烟", "二类烟", "三类烟", "四类烟", "五类烟", "无价类"],
+    }
+    
+class OrderConfig:
+    FEATURE_COLUMNS = [
+        "BB_RETAIL_CUSTOMER_CODE",                          # 零售户编码
+        "PRODUCT_CODE",                                     # 卷烟编码
+        "MONTH6_SALE_QTY",                                  # 近半年销量(箱)
+        "MONTH6_SALE_AMT",                                  # 近半年销售额(万元)
+        "MONTH6_GROSS_PROFIT_RATE",                         # 近半年毛利率
+        "MONTH6_SALE_QTY_YOY",                              # 销售量同比
+        "MONTH6_SALE_QTY_MOM",                              # 销售量环比
+        "MONTH6_SALE_AMT_YOY",                              # 销售额(购进额)同比
+        "MONTH6_SALE_AMT_MOM",                              # 销售额(购进额)环比
+        "ORDER_FULLORDR_RATE",                              # 订足率
+        "CUSTOMER_REPURCHASE_RATE",                         # 会员重购率   
+        "DEMAND_RATE",                                      # 需求量满足率
+        "LISTING_RATE",                                     # 品规上架率
+        "PUT_MARKET_FINISH_RATE",                           # 投放完成率
+        "OUT_STOCK_DAYS",                                   # 断货天数
+        "YLT_TURNOVER_RATE",                                # 易灵通动销率
+        "YLT_BAR_PACKAGE_SALE_OCC",                         # 易灵通条包销售占比
+        "UNPACKING_RATE",                                   # 拆包率
+    ]
+    
+    WEIGHTS = {
+        "MONTH6_SALE_QTY":                                  0.15,
+        "MONTH6_SALE_QTY_MOM":                              0.2,
+        "ORDER_FULLORDR_RATE":                              0.3,
+        "DEMAND_RATE":                                      0.35,
     }

+ 46 - 6
models/rank/data/preprocess.py

@@ -1,7 +1,9 @@
 from dao.dao import load_cust_data_from_mysql, load_product_data_from_mysql, load_order_data_from_mysql
-from models.rank.data.config import CustConfig, ProductConfig
+from models.rank.data.config import CustConfig, ProductConfig, OrderConfig
 import os
 import pandas as pd
+from sklearn.preprocessing import MinMaxScaler
+import numpy as np
 
 class DataProcess():
     def __init__(self, city_uuid):
@@ -21,16 +23,24 @@ class DataProcess():
         # 1. 获取指定的特征组合
         self._cust_data = self._cust_data[CustConfig.FEATURE_COLUMNS]
         self._product_data = self._product_data[ProductConfig.FEATURE_COLUMNS]
+        self._order_data = self._order_data[OrderConfig.FEATURE_COLUMNS]
         
         # 2. 数据清洗
         self._clean_cust_data()
         self._clean_product_data()
+        self._clean_order_data()
         
-        # 3. 将零售户信息表与卷烟信息表进行笛卡尔积连接
-        self._descartes()
+        # # 3. 将零售户信息表与卷烟信息表进行笛卡尔积连接
+        # self._descartes()
         
-        # 4. 根据order表中的信息给数据打标签
-        self._labeled_data()
+        # # 4. 根据order表中的信息给数据打标签
+        # self._labeled_data()
+        
+        # 3. 根据特征权重给order表中的记录打分
+        self._calculate_score()
+        
+        # 4. 根据中位数打标签
+        self.labeled_data_by_score()
         
         # 5. 选取训练样本
         self._generate_train_data()
@@ -63,6 +73,36 @@ class DataProcess():
                     self._product_data[feature] = self._product_data[feature].fillna(rules["value"])
                 elif rules["opt"] == "mean":
                     self._product_data[feature] = self._product_data[feature].fillna(self._product_data[feature].mean())
+                    
+    def _clean_order_data(self):
+        pass
+    
+    def _calculate_score(self):
+        """计算order记录的分数"""
+        self._order_score = self._order_data.copy()
+        # 对参与算分的特征值进行归一化
+        scaler = MinMaxScaler()
+        self._order_score[list(OrderConfig.WEIGHTS.keys())] = scaler.fit_transform(self._order_score[list(OrderConfig.WEIGHTS.keys())])
+        # 计算加权分数
+        self._order_score["score"] = sum(self._order_score[feat] * weight 
+                          for feat, weight in OrderConfig.WEIGHTS.items())
+    
+    def labeled_data_by_score(self):
+        """通过计算分数打标签"""
+        # 按品规分组计算中位数
+        product_medians = self._order_score.groupby("PRODUCT_CODE")["score"].median().reset_index()
+        product_medians.columns = ["PRODUCT_CODE", "median_score"]
+        
+        # 合并中位数到原始订单数据
+        self._order_score = pd.merge(self._order_score, product_medians, on="PRODUCT_CODE")
+        
+        # 生成标签 (1: 大于等于中位数, 0: 小于中位数)
+        self._order_score["label"] = np.where(
+            self._order_score["score"] >= self._order_score["median_score"], 1, 0
+        )
+        self._order_score = self._order_score.sort_values("score", ascending=False)
+        self._order_score = self._order_score[["BB_RETAIL_CUSTOMER_CODE", "PRODUCT_CODE", "label"]]
+        self._order_score.to_csv("./models/rank/data/train.csv")
     
     def _descartes(self):
         """将零售户信息与卷烟信息进行笛卡尔积连接"""
@@ -77,7 +117,7 @@ class DataProcess():
         order_combinations = self._order_data[["BB_RETAIL_CUSTOMER_CODE", "PRODUCT_CODE"]].drop_duplicates()
         order_set = set(zip(order_combinations["BB_RETAIL_CUSTOMER_CODE"], order_combinations["PRODUCT_CODE"]))
         
-        # 在descartes_data中打标签:正样本为1,负样本为2
+        # 在descartes_data中打标签:正样本为1,负样本为0
         self._descartes_data['label'] = self._descartes_data.apply(
             lambda row: 1 if (row['BB_RETAIL_CUSTOMER_CODE'], row['product_code']) in order_set else 0, axis=1)
     

+ 1 - 1
models/rank/data/utils.py

@@ -8,7 +8,7 @@ def one_hot_embedding(dataframe, onehout_feat):
         dataframe,
         columns=list(onehout_feat.keys()),
         prefix_sep="_",
-        dtype=int
+        dtype=int,
     )
     return dataframe
 

+ 12 - 4
models/rank/gbdt_lr.py

@@ -8,6 +8,7 @@ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_sc
 from sklearn.model_selection import GridSearchCV
 from sklearn.preprocessing import OneHotEncoder
 import joblib
+import time
 
 class Trainer:
     def __init__(self, path):
@@ -24,9 +25,9 @@ class Trainer:
         self._lr_params = {
             "max_iter": 1000,
             'C': 1.0, 
-            'penalty': 'l1', 
-            # 'l1_ratio': 0.5,  # 添加 l1_ratio 参数,可以根据需要调整
-            'solver': 'sag',
+            'penalty': 'elasticnet', 
+            'l1_ratio': 0.8,  # 添加 l1_ratio 参数,可以根据需要调整
+            'solver': 'saga',
             'random_state': 42,
             'class_weight': 'balanced'
         }
@@ -35,7 +36,7 @@ class Trainer:
         self._gbdt_model = GradientBoostingClassifier(**self._gbdt_params)
         self._lr_model = LogisticRegression(**self._lr_params)
         
-        self._onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
+        self._onehot_encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
         
     def _load_data(self, path):
         dataloader = DataLoader(path)
@@ -103,7 +104,14 @@ class Trainer:
 if __name__ == "__main__":
     gbdt_data_path = "./models/rank/data/gbdt_data.csv"
     trainer = Trainer(gbdt_data_path)
+    
+    start_time = time.time()
     trainer.train()
+    end_time = time.time()
+    
+    training_time_hours = (end_time - start_time) / 3600
+    print(f"训练时间: {training_time_hours:.4f} 小时")
+    
     eval_metrics = trainer.evaluate()
     
     # 输出评估结果

+ 36 - 5
models/rank/gbdt_lr_sort.py

@@ -6,7 +6,7 @@ import pandas as pd
 from sklearn.preprocessing import StandardScaler
 
 
-class GbdtLrSort:
+class GbdtLrModel:
     def __init__(self, model_path):
         self.load_model(model_path)
         self.redis = Redis().redis
@@ -63,15 +63,46 @@ class GbdtLrSort:
             self.recommend_list.append({cust_id: float(score)})
             
         self.recommend_list = sorted(self.recommend_list, key=lambda x: list(x.values())[0], reverse=True)
-        for res in self.recommend_list:
+        for res in self.recommend_list[:200]:
             print(res)
     
     def generate_feats_importance(self):
-        pass
+        """生成特征重要性"""
+        # 获取GBDT模型的特征重要性
+        feats_importance = self.gbdt_model.feature_importances_
+        
+        # 获取特征名称
+        feats_names = self.gbdt_model.feature_names_in_
+        
+        importance_dict = dict(zip(feats_names, feats_importance))
+        
+        onehot_feats = {**CustConfig.ONEHOT_CAT, **ProductConfig.ONEHOT_CAT}
+        for feat, categories in onehot_feats.items():
+            related_columns = [col for col in feats_names if col.startswith(feat)]
+            if related_columns:
+                # 合并类别重要性
+                combined_importance = sum(importance_dict[col] for col in related_columns)
+                # 删除onehot类别列
+                for col in related_columns:
+                    del importance_dict[col]
+                # 添加合并后的重要性
+                importance_dict[feat] = combined_importance
+        
+        # 排序
+        sorted_importance = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)
+        
+        # 输出特征重要性
+        features_importance = []
+        for feat, importance in sorted_importance:
+            features_importance.append({feat: float(importance)})
+        return features_importance
     
 if __name__ == "__main__":
     model_path = "./models/rank/weights/model.pkl"
     city_uuid = "00000000000000000000000011445301"
     product_id = "110102"
-    gbdt_sort = GbdtLrSort(model_path)
-    gbdt_sort.sort(city_uuid, product_id)
+    gbdt_sort = GbdtLrModel(model_path)
+    # gbdt_sort.sort(city_uuid, product_id)
+    importances = gbdt_sort.generate_feats_importance()
+    for importance in importances:
+        print(importance)

BIN
models/rank/weights/model.pkl