ソースを参照

修复删除异常值

yangzeyu 1 年前
コミット
7c6988bfd2
4 ファイル変更、94 行追加、11 行削除
  1. 48 5
      gbdt_lr.py
  2. 28 0
      models/rank/data/config.py
  3. 2 2
      models/rank/data/preprocess.py
  4. 16 4
      models/rank/gbdt_lr_sort.py

+ 48 - 5
gbdt_lr.py

@@ -70,9 +70,33 @@ def get_features_importance(args):
         print("暂无该城市的模型,请先进行模型训练")
         return
     
-    # 加载模型
-    model = GbdtLrModel(os.path.join(model_dir, args.model_name))
-    cust_features_importance, product_features_importance = model.generate_feats_importance()
+    # # 加载模型
+    # model = GbdtLrModel(os.path.join(model_dir, args.model_name))
+    # cust_features_importance, product_features_importance = model.generate_feats_importance()
+    
+    # # 将字典列表转换为 DataFrame
+    # cust_df = pd.DataFrame([
+    #     {"Features": list(item.keys())[0], "Importance": list(item.values())[0]}
+    #     for item in cust_features_importance
+    # ])
+    
+    # product_df = pd.DataFrame([
+    #     {"Features": list(item.keys())[0], "Importance": list(item.values())[0]}
+    #     for item in product_features_importance
+    # ])
+    
+    # cust_file_path = os.path.join(model_dir, "cust_features_importance.csv")
+    # product_file_path = os.path.join(model_dir, "product_features_importance.csv")
+    # cust_df.to_csv(cust_file_path, index=False, encoding='utf-8')
+    # product_df.to_csv(product_file_path, index=False, encoding='utf-8')
+    
+    get_features_importance_by_model(model_dir, "ori_model")
+    get_features_importance_by_model(model_dir, "pos_model")
+    get_features_importance_by_model(model_dir, "shopping_model")
+    
+def get_features_importance_by_model(model_dir, modelname):
+    model = GbdtLrModel(os.path.join(model_dir, f"{modelname}.pkl"))
+    cust_features_importance, product_features_importance, order_features_importance = model.generate_feats_importance()
     
     # 将字典列表转换为 DataFrame
     cust_df = pd.DataFrame([
@@ -85,10 +109,29 @@ def get_features_importance(args):
         for item in product_features_importance
     ])
     
-    cust_file_path = os.path.join(model_dir, "cust_features_importance.csv")
-    product_file_path = os.path.join(model_dir, "product_features_importance.csv")
+    order_df = pd.DataFrame([
+        {"Features": list(item.keys())[0], "Importance": list(item.values())[0]}
+        for item in order_features_importance
+    ])
+    
+    importance_dir = os.path.join(model_dir, "importance")
+    if modelname == 'ori_model':
+        importance_dir = os.path.join(importance_dir, "ori")
+    elif modelname == 'pos_model':
+        importance_dir = os.path.join(importance_dir, "pos")
+    elif modelname == 'shopping_model':
+        importance_dir = os.path.join(importance_dir, "shopping")
+    
+    if not os.path.exists(importance_dir):
+        os.makedirs(importance_dir)
+        
+    cust_file_path = os.path.join(importance_dir, "cust_features_importance.csv")
+    product_file_path = os.path.join(importance_dir, "product_features_importance.csv")
+    order_file_path = os.path.join(importance_dir, "order_features_importance.csv")
+    
     cust_df.to_csv(cust_file_path, index=False, encoding='utf-8')
     product_df.to_csv(product_file_path, index=False, encoding='utf-8')
+    order_df.to_csv(order_file_path, index=False, encoding='utf-8')
         
 def run():
     parser = argparse.ArgumentParser()

+ 28 - 0
models/rank/data/config.py

@@ -296,6 +296,34 @@ class ImportanceFeaturesMap:
         "filter_length":                                    "滤嘴长度",
     }
     
+    ORDER_FEATURE_MAP = {
+        "MONTH6_SALE_QTY": "近半年销量(箱)",
+        "MONTH6_SALE_AMT": "近半年销售额(万元)",
+        "MONTH6_GROSS_PROFIT_RATE": "近半年毛利率",
+        "MONTH6_SALE_QTY_YOY": "销量同比",
+        "MONTH6_SALE_QTY_MOM": "销量环比",
+        "MONTH6_SALE_AMT_YOY": "销售额(购进额)同比",
+        "MONTH6_SALE_AMT_MOM": "销售额(购进额)环比",
+        "STOCK_QTY": "库存",
+        "ORDER_FULLORDR_RATE": "订足率",
+        "ORDER_FULLORDR_RATE_MOM": "订足率环比",
+        "FULL_FILLMENT_RATE": "订单满足率",
+        "CUSTOMER_REPURCHASE_RATE": "会员重购率(部分有会员)",
+        "NEW_PRODUCT_MEMBERS_QTY_SAMEPRICE_OCC": "新品订货量占同价类比重",
+        "DEMAND_RATE": "需求量满足率",
+        "LISTING_RATE": "品规上架率",
+        "PUT_MARKET_FINISH_RATE": "投放完成率",
+        "OUT_STOCK_DAYS": "断货天数(部分零售商有)",
+        "UNPACKING_RATE": "拆包率",
+        "city_uuid": "城市UUID"
+    }
+    
+    POS_FEATURE_MAP = {
+        "YLT_TURNOVER_RATE": "易灵通动销率",
+        "YLT_BAR_PACKAGE_SALE_OCC": "易灵通条包销售占比",
+        "POS_PACKAGE_PRICE": "POS机单包价格",
+    }
+    
     SHOPING_FEATURES_MAP = {
         # 商圈 字段映射
         "r_home_num": "常驻人口_居住人数",

+ 2 - 2
models/rank/data/preprocess.py

@@ -88,7 +88,7 @@ class DataProcess():
     def _clean_order_data(self):
         remaining_cols = self._order_data.columns.drop(OrderConfig.POSFEATURES) # 数据清洗时先不对pos数据做处理
         col_all_missing = remaining_cols[self._order_data[remaining_cols].isnull().all()].to_list()
-        self._order_data = self._order_data.drop(col_all_missing)
+        self._order_data = self._order_data.drop(columns=col_all_missing)
         
         # 去除重复值和填补缺失值
         self._order_data.drop_duplicates(inplace=True)
@@ -97,7 +97,7 @@ class DataProcess():
         
     def _clean_shopping_data(self):
         """处理商圈数据缺省值"""
-        self._shopping_data.drop(["cust_uuid", "longitude", "latitude", "range_radius"], axis=1, inplace=True)
+        self._shopping_data.drop(columns=["cust_uuid", "longitude", "latitude", "range_radius"], axis=1, inplace=True)
         remaining_cols = self._shopping_data.columns.drop(["city_uuid", "cust_code"])
         col_with_missing = remaining_cols[self._shopping_data[remaining_cols].isnull().any()].tolist() # 判断有缺失的字段
         col_all_missing = remaining_cols[self._shopping_data[remaining_cols].isnull().all()].to_list() # 全部缺失的字段

+ 16 - 4
models/rank/gbdt_lr_sort.py

@@ -5,6 +5,7 @@ from models.rank.data import ProductConfig, CustConfig, ImportanceFeaturesMap
 from models.rank.data.utils import one_hot_embedding, sample_data_clear
 import pandas as pd
 from sklearn.preprocessing import StandardScaler
+import os
 
 
 class GbdtLrModel:
@@ -14,6 +15,7 @@ class GbdtLrModel:
         self._mysql_dao = MySqlDao()
     
     def load_model(self, model_path):
+        self._modelname = os.path.basename(model_path).split(".")[0]
         models = joblib.load(model_path)
         self.gbdt_model, self.lr_model, self.onehot_encoder = models["gbdt_model"], models["lr_model"], models["onehot_encoder"]
         
@@ -104,17 +106,27 @@ class GbdtLrModel:
         # 输出特征重要性
         cust_features_importance = []
         product_features_importance = []
+        order_features_importance = []
+        
         for feat, importance in sorted_importance:
             if feat in list(ImportanceFeaturesMap.CUSTOM_FEATRUES_MAP.keys()):
                 cust_features_importance.append({ImportanceFeaturesMap.CUSTOM_FEATRUES_MAP[feat]: float(importance)})
             if feat in list(ImportanceFeaturesMap.PRODUCT_FEATRUES_MAP.keys()):
                 product_features_importance.append({ImportanceFeaturesMap.PRODUCT_FEATRUES_MAP[feat]: float(importance)})
-            if feat in list(ImportanceFeaturesMap.SHOPING_FEATURES_MAP.keys()):
-                product_features_importance.append({ImportanceFeaturesMap.SHOPING_FEATURES_MAP[feat]: float(importance)})
-        return cust_features_importance, product_features_importance
+            if feat in list(ImportanceFeaturesMap.ORDER_FEATURE_MAP.keys()):
+                order_features_importance.append({ImportanceFeaturesMap.ORDER_FEATURE_MAP[feat]: float(importance)})
+                
+            # 零消特征重要性
+            if self._modelname == 'pos_model' and feat in list(ImportanceFeaturesMap.POS_FEATURE_MAP.keys()):
+                order_features_importance.append({ImportanceFeaturesMap.POS_FEATURE_MAP[feat]: float(importance)})
+                
+            # 商圈特征重要性
+            if self._modelname == 'shopping_model' and feat in list(ImportanceFeaturesMap.SHOPING_FEATURES_MAP.keys()):
+                cust_features_importance.append({ImportanceFeaturesMap.SHOPING_FEATURES_MAP[feat]: float(importance)})
+        return cust_features_importance, product_features_importance, order_features_importance
     
 if __name__ == "__main__":
-    model_path = "./models/rank/weights/model.pkl"
+    model_path = "./models/rank/weights/00000000000000000000000011445301/shopping_model.pkl"
     city_uuid = "00000000000000000000000011445301"
     product_id = "110102"
     gbdt_sort = GbdtLrModel(model_path)