Sherlock 1 рік тому
батько
коміт
4d0edf2bb0

+ 2 - 2
database/dao/mysql_dao.py

@@ -21,7 +21,8 @@ class MySqlDao:
         self._cust_tablename = "tads_brandcul_cust_info"
         self._order_tablename = "tads_brandcul_cust_order"
         self._mock_order_tablename = "yunfu_mock_data"
-        self._shopping_tablename = "tads_brandcul_cust_info_lbs"
+        # self._shopping_tablename = "tads_brandcul_cust_info_lbs"
+        self._shopping_tablename = "yunfu_shopping_mock_data"
         
         self._initialized = True
         
@@ -47,7 +48,6 @@ class MySqlDao:
         params = {"city_uuid": city_uuid}
         
         data = self.db_helper.load_data_with_page(query, params)
-        
         data.drop('stat_month', axis=1, inplace=True)
         data.drop('city_uuid', axis=1, inplace=True)
         

+ 13 - 6
gbdt_lr.py

@@ -14,15 +14,22 @@ def train(args):
     
     # 准备数据集  
     print("正在整合训练数据...")
-    processor = DataProcess(args.city_uuid, args.train_data_path)
+    processor = DataProcess(args.city_uuid, args.train_data_dir)
     processor.data_process()
     print("训练数据整合完成!")
     
     # 进行训练
-    trainer(args, model_dir)
+    print("开始训练原始模型")
+    trainer(args, os.path.join(args.train_data_dir, "original_train_data.csv"), model_dir, "ori_model.pkl")
+    
+    print("开始训练pos模型")
+    trainer(args, os.path.join(args.train_data_dir, "pos_train_data.csv"), model_dir, "pos_model.pkl")
+    
+    print("开始训练shopping模型")
+    trainer(args, os.path.join(args.train_data_dir, "shopping_train_data.csv"), model_dir, "shopping_model.pkl")
 
-def trainer(args, model_dir):
-    trainer = Trainer(args.train_data_path)
+def trainer(args, train_data_path, model_dir, model_name):
+    trainer = Trainer(train_data_path)
     
     start_time = time.time()
     trainer.train()
@@ -39,7 +46,7 @@ def trainer(args, model_dir):
         print(f"{metric}: {value:.4f}")
         
     # 保存模型
-    trainer.save_model(os.path.join(model_dir, args.model_name))
+    trainer.save_model(os.path.join(model_dir, model_name))
 
 def recommend_by_product(args):
     model_dir = os.path.join(args.model_path, args.city_uuid)
@@ -86,7 +93,7 @@ def run():
     parser.add_argument("--recommend", action='store_true')
     parser.add_argument("--importance", action='store_true')
     
-    parser.add_argument("--train_data_path", type=str, default="./models/rank/data/gbdt_data.csv")
+    parser.add_argument("--train_data_dir", type=str, default="./data")
     parser.add_argument("--model_path", type=str, default="./models/rank/weights")
     parser.add_argument("--model_name", type=str, default='model.pkl')
     parser.add_argument("--last_n", type=int, default=200)

+ 173 - 1
models/rank/data/config.py

@@ -234,7 +234,8 @@ class OrderConfig:
         "ORDER_FULLORDR_RATE",                              # 订足率
         "FULL_FILLMENT_RATE",                               # 订单满足率
         "ORDER_FULLORDR_RATE_MOM",                          # 订足率环比
-        "CUSTOMER_REPURCHASE_RATE",                         # 会员重购率   
+        "CUSTOMER_REPURCHASE_RATE",                         # 会员重购率  
+        "NEW_PRODUCT_MEMBERS_QTY_SAMEPRICE_OCC",            # 新品订货量占同价类比重/decimal(18,6)
         "DEMAND_RATE",                                      # 需求量满足率
         "LISTING_RATE",                                     # 品规商上架率
         "PUT_MARKET_FINISH_RATE",                           # 投放完成率
@@ -252,6 +253,10 @@ class OrderConfig:
         "ORDER_FULLORDR_RATE_MOM":                          0.35,
     }
     
+    POSFEATURES = [
+        "YLT_TURNOVER_RATE","YLT_BAR_PACKAGE_SALE_OCC","POS_PACKAGE_PRICE"
+    ]
+    
 class ImportanceFeaturesMap:
     CUSTOM_FEATRUES_MAP = {
         "BB_RTL_CUST_GRADE_NAME":                           "零售户分档名称",
@@ -289,4 +294,171 @@ class ImportanceFeaturesMap:
         "tbc_total_length":                                 "烟支总长度",
         "tbc_length":                                       "烟支长度",
         "filter_length":                                    "滤嘴长度",
+    }
+    
+    SHOPING_FEATURES_MAP = {
+        # 商圈 字段映射
+        "r_home_num": "常驻人口_居住人数",
+        "r_work_num": "常驻人口_工作人数",
+        "r_resident_num": "常驻人口_工作或居住人数",
+        "r_urban_cons_middle": "常驻人口_城市消费水平_中",
+        "r_urban_cons_low": "常驻人口_城市消费水平_低",
+        "r_urban_cons_lower": "常驻人口_城市消费水平_次低",
+        "r_urban_cons_secondhigh": "常驻人口_城市消费水平_次高",
+        "r_urban_cons_high": "常驻人口_城市消费水平_高",
+        "r_edu_junior_middle": "常驻人口_学历_初中",
+        "r_edu_doctor": "常驻人口_学历_博士",
+        "r_edu_specialty": "常驻人口_学历_大专",
+        "r_edu_primary": "常驻人口_学历_小学",
+        "r_edu_college": "常驻人口_学历_本科",
+        "r_edu_postgraduate": "常驻人口_学历_硕士",
+        "r_edu_senior_middle": "常驻人口_学历_高中",
+        "r_house_price79999": "常驻人口_居住社区房价_60000_79999",
+        "r_house_price59999": "常驻人口_居住社区房价_40000_59999",
+        "r_house_price39999": "常驻人口_居住社区房价_20000_39999",
+        "r_house_price19999": "常驻人口_居住社区房价_10000_19999",
+        "r_house_price9999": "常驻人口_居住社区房价_8000_9999",
+        "r_house_price7999": "常驻人口_居住社区房价_5000_7999",
+        "r_house_price4999": "常驻人口_居住社区房价_2000_4999",
+        "r_age_17": "常驻人口_年龄_0_17",
+        "r_age_24": "常驻人口_年龄_18_24",
+        "r_age_30": "常驻人口_年龄_25_30",
+        "r_age_35": "常驻人口_年龄_31_35",
+        "r_age_40": "常驻人口_年龄_36_40",
+        "r_age_45": "常驻人口_年龄_41_45",
+        "r_age_60": "常驻人口_年龄_46_60",
+        "r_age_over_60": "常驻人口_年龄_61以上",
+        "r_sex_woman": "常驻人口_性别_女",
+        "r_sex_man": "常驻人口_性别_男",
+        "r_catering_50": "常驻人口_餐饮消费水平_50",
+        "r_catering_100": "常驻人口_餐饮消费水平_100",
+        "r_catering_150": "常驻人口_餐饮消费水平_150",
+        "r_catering_200": "常驻人口_餐饮消费水平_200",
+        "r_catering_500": "常驻人口_餐饮消费水平_500",
+        "r_catering_over_500": "常驻人口_餐饮消费水平_500以上",
+        "r_catering_times_2": "常驻人口_餐饮消费频次_1_2",
+        "r_catering_times_4": "常驻人口_餐饮消费频次_2_4",
+        "r_catering_times_6": "常驻人口_餐饮消费频次_4_6",
+        "r_catering_times_8": "常驻人口_餐饮消费频次_6_8",
+        "r_catering_times_10": "常驻人口_餐饮消费频次_8_10",
+        "r_catering_times_11": "常驻人口_餐饮消费频次_11以上",
+        "r_native_beijing": "常驻人口_家乡地_北京市",
+        "r_native_tianjing": "常驻人口_家乡地_天津市",
+        "r_native_hebei": "常驻人口_家乡地_河北省",
+        "r_native_shanxi": "常驻人口_家乡地_山西省",
+        "r_native_neimeng": "常驻人口_家乡地_内蒙古",
+        "r_native_liaoning": "常驻人口_家乡地_辽宁省",
+        "r_native_jilin": "常驻人口_家乡地_吉林省",
+        "r_native_heilongjiang": "常驻人口_家乡地_黑龙江省",
+        "r_native_shanghai": "常驻人口_家乡地_上海市",
+        "r_native_jiangsu": "常驻人口_家乡地_江苏省",
+        "r_native_zhejiang": "常驻人口_家乡地_浙江省",
+        "r_native_anhui": "常驻人口_家乡地_安徽省",
+        "r_native_fujian": "常驻人口_家乡地_福建省",
+        "r_native_jiangix": "常驻人口_家乡地_江西省",
+        "r_native_shandong": "常驻人口_家乡地_山东省",
+        "r_native_henan": "常驻人口_家乡地_河南省",
+        "r_native_hubei": "常驻人口_家乡地_湖北省",
+        "r_native_hunan": "常驻人口_家乡地_湖南省",
+        "r_native_guangdong": "常驻人口_家乡地_广东省",
+        "r_native_hainan": "常驻人口_家乡地_海南省",
+        "r_native_sichuan": "常驻人口_家乡地_四川省",
+        "r_native_guizhou": "常驻人口_家乡地_贵州省",
+        "r_native_yunnan": "常驻人口_家乡地_云南省",
+        "r_native_shan": "常驻人口_家乡地_陕西省",
+        "r_native_gansu": "常驻人口_家乡地_甘肃省",
+        "r_native_qinghai": "常驻人口_家乡地_青海省",
+        "r_native_guangxi": "常驻人口_家乡地_广西壮族自治区",
+        "r_native_ningxia": "常驻人口_家乡地_宁夏回族自治区",
+        "r_native_xinjiang": "常驻人口_家乡地_新疆维吾尔自治区",
+        "r_native_xizang": "常驻人口_家乡地_西藏自治区",
+        "r_native_chongqing": "常驻人口_家乡地_重庆市",
+        "r_native_hongkong": "常驻人口_家乡地_香港",
+        "r_native_macao": "常驻人口_家乡地_澳门",
+        "r_native_taiwan": "常驻人口_家乡地_台湾",
+        "r_native_other": "常驻人口_家乡地_其它",
+        "f_flow_num": "流动人口_日均流动人口数量",
+        "f_holiday_flow_num": "流动人口_节假日日均流动人口数量",
+        "f_workday_flow_num": "流动人口_工作日日均流动人口数量",
+        "f_flowurban_cons_middle": "日均流动_城市消费水平_中",
+        "f_flowurban_cons_low": "日均流动_城市消费水平_低",
+        "f_flowurban_cons_lower": "日均流动_城市消费水平_次低",
+        "f_flowurban_cons_second_high": "日均流动_城市消费水平_次高",
+        "f_flowurban_cons_high": "日均流动_城市消费水平_高",
+        "f_flowedu_junior_middle": "日均流动_学历_初中",
+        "f_flowedu_doctor": "日均流动_学历_博士",
+        "f_flowedu_specialty": "日均流动_学历_大专",
+        "f_flowedu_primary": "日均流动_学历_小学",
+        "f_flowedu_college": "日均流动_学历_本科",
+        "f_flowedu_postgraduate": "日均流动_学历_硕士",
+        "f_flowedu_senior_middle": "日均流动_学历_高中",
+        "f_flowhouse_middle": "日均流动_居住社区房价_中",
+        "f_flowhouse_low": "日均流动_居住社区房价_低",
+        "f_flowhouse_lower": "日均流动_居住社区房价_次低",
+        "f_flowhouse_second_high": "日均流动_居住社区房价_次高",
+        "f_flowhouse_high": "日均流动_居住社区房价_高",
+        "f_flowage_17": "日均流动_年龄_0_17",
+        "f_flowage_24": "日均流动_年龄_18_24",
+        "f_flowage_30": "日均流动_年龄_25_30",
+        "f_flowage_35": "日均流动_年龄_31_35",
+        "f_flowage_40": "日均流动_年龄_36_40",
+        "f_flowage_45": "日均流动_年龄_41_45",
+        "f_flowage_60": "日均流动_年龄_46_60",
+        "f_flowage_over_60": "日均流动_年龄_61以上",
+        "f_flowsex_woman": "日均流动_性别_女",
+        "f_flowsex_man": "日均流动_性别_男",
+        "f_holidayurban_cons_middle": "节假日流动_城市消费水平_中",
+        "f_holidayurban_cons_low": "节假日流动_城市消费水平_低",
+        "f_holidayurban_cons_lower": "节假日流动_城市消费水平_次低",
+        "f_holidayurban_cons_secondhigh": "节假日流动_城市消费水平_次高",
+        "f_holidayurban_cons_high": "节假日流动_城市消费水平_高",
+        "f_holidayedu_junior_middle": "节假日流动_学历_初中",
+        "f_holidayedu_doctor": "节假日流动_学历_博士",
+        "f_holidayedu_specialty": "节假日流动_学历_大专",
+        "f_holidayedu_primary": "节假日流动_学历_小学",
+        "f_holidayedu_college": "节假日流动_学历_本科",
+        "f_holidayedu_postgraduate": "节假日流动_学历_硕士",
+        "f_holidayedu_senior_middle": "节假日流动_学历_高中",
+        "f_holidayhouse_middle": "节假日流动_居住社区房价_中",
+        "f_holidayhouse_low": "节假日流动_居住社区房价_低",
+        "f_holidayhouse_lower": "节假日流动_居住社区房价_次低",
+        "f_holidayhouse_second_high": "节假日流动_居住社区房价_次高",
+        "f_holidayhouse_high": "节假日流动_居住社区房价_高",
+        "f_holidayage_17": "节假日流动_年龄_0_17",
+        "f_holidayage_24": "节假日流动_年龄_18_24",
+        "f_holidayage_30": "节假日流动_年龄_25_30",
+        "f_holidayage_35": "节假日流动_年龄_31_35",
+        "f_holidayage_40": "节假日流动_年龄_36_40",
+        "f_holidayage_45": "节假日流动_年龄_41_45",
+        "f_holidayage_60": "节假日流动_年龄_46_60",
+        "f_holidayage_over_60": "节假日流动_年龄_61以上",
+        "f_holidaysex_woman": "节假日流动_性别_女",
+        "f_holidaysex_man": "节假日流动_性别_男",
+        "f_workday_urban_cons_middle": "工作日流动_城市消费水平_中",
+        "f_workday_urban_cons_low": "工作日流动_城市消费水平_低",
+        "f_workday_urban_cons_lower": "工作日流动_城市消费水平_次低",
+        "f_workday_urban_cons_secondhigh": "工作日流动_城市消费水平_次高",
+        "f_workday_urban_cons_high": "工作日流动_城市消费水平_高",
+        "f_workday_edu_junior_middle": "工作日流动_学历_初中",
+        "f_workday_edu_doctor": "工作日流动_学历_博士",
+        "f_workday_edu_specialty": "工作日流动_学历_大专",
+        "f_workday_edu_primary": "工作日流动_学历_小学",
+        "f_workday_edu_college": "工作日流动_学历_本科",
+        "f_workday_edu_postgraduate": "工作日流动_学历_硕士",
+        "f_workday_edu_senior_middle": "工作日流动_学历_高中",
+        "f_workday_house_middle": "工作日流动_居住社区房价_中",
+        "f_workday_house_low": "工作日流动_居住社区房价_低",
+        "f_workday_house_lower": "工作日流动_居住社区房价_次低",
+        "f_workday_house_second_high": "工作日流动_居住社区房价_次高",
+        "f_workday_house_high": "工作日流动_居住社区房价_高",
+        "f_workday_age_17": "工作日流动_年龄_0_17",
+        "f_workday_age_24": "工作日流动_年龄_18_24",
+        "f_workday_age_30": "工作日流动_年龄_25_30",
+        "f_workday_age_35": "工作日流动_年龄_31_35",
+        "f_workday_age_40": "工作日流动_年龄_36_40",
+        "f_workday_age_45": "工作日流动_年龄_41_45",
+        "f_workday_age_60": "工作日流动_年龄_46_60",
+        "f_workday_age_over_60": "工作日流动_年龄_61以上",
+        "f_workday_sex_woman": "工作日流动_性别_女",
+        "f_workday_sex_man": "工作日流动_性别_男"
     }

+ 116 - 82
models/rank/data/preprocess.py

@@ -7,23 +7,30 @@ from sklearn.utils import shuffle
 import numpy as np
 
 class DataProcess():
-    def __init__(self, city_uuid, save_path):
+    def __init__(self, city_uuid, save_dir):
         self._mysql_dao = MySqlDao()
-        self._save_res_path = save_path
+        self.save_dir = save_dir
         print("正在加载cust_info...")
         self._cust_data = self._mysql_dao.load_cust_data(city_uuid)
         print("正在加载product_info...")
         self._product_data = self._mysql_dao.load_product_data(city_uuid)
         print("正在加载order_info...")
-        # self._order_data = self._mysql_dao.load_cust_data(city_uuid)
+        # self._order_data = self._mysql_dao.load_order_data(city_uuid)
         self._order_data = self._mysql_dao.load_mock_order_data()
         print("正在加载shopping_info...")
         self._shopping_data = self._mysql_dao.load_shopping_data(city_uuid)
         
     def data_process(self):
         """数据预处理"""
-        if os.path.exists(self._save_res_path):
-            os.remove(self._save_res_path)
+        ori_train_data_save_path = os.path.join(self.save_dir, "original_train_data.csv")
+        pos_train_data_save_path = os.path.join(self.save_dir, "pos_train_data.csv")
+        shopping_train_data_save_path = os.path.join(self.save_dir, "shopping_train_data.csv")
+        if os.path.exists(ori_train_data_save_path):
+            os.remove(ori_train_data_save_path)
+        if os.path.exists(pos_train_data_save_path):
+            os.remove(pos_train_data_save_path)
+        if os.path.exists(shopping_train_data_save_path):
+            os.remove(shopping_train_data_save_path)
         
         # 1. 获取指定的特征组合
         self._cust_data = self._cust_data[CustConfig.FEATURE_COLUMNS]
@@ -36,22 +43,16 @@ class DataProcess():
         self._clean_order_data()
         self._clean_shopping_data()
         
-        # # 3. 将零售户信息表与卷烟信息表进行笛卡尔积连接
-        # self._descartes()
+        # 3. 生成训练数据集
+        ori_train_data = self._generate_original_train_data(is_pos=False)
+        shopping_train_data = self._generate_shopping_train_data()
+        pos_train_data = self._generate_pos_train_data()
         
-        # # 4. 根据order表中的信息给数据打标签
-        # self._labeled_data()
+        ori_train_data.to_csv(ori_train_data_save_path, index=False)
+        shopping_train_data.to_csv(shopping_train_data_save_path, index=False)
+        pos_train_data.to_csv(pos_train_data_save_path, index=False)
         
-        # 3. 根据特征权重给order表中的记录打分
-        self._calculate_score()
-        
-        # 4. 根据中位数打标签
-        self.labeled_data()
-        
-        # 5. 选取训练样本
-        self._generate_train_data()
-        
-    
+
     def _clean_cust_data(self):
         """用户信息表数据清洗"""
         # 根据配置规则清洗数据
@@ -83,9 +84,13 @@ class DataProcess():
                 self._product_data[feature] = self._product_data[feature].infer_objects(copy=False)
                     
     def _clean_order_data(self):
+        remaining_cols = self._order_data.columns.drop(OrderConfig.POSFEATURES) # 数据清洗时先不对pos数据做处理
+        col_all_missing = remaining_cols[self._order_data[remaining_cols].isnull().all()].to_list()
+        self._order_data = self._order_data.drop(columns=col_all_missing)
+        
         # 去除重复值和填补缺失值
         self._order_data.drop_duplicates(inplace=True)
-        self._order_data.fillna(0, inplace=True)
+        self._order_data[remaining_cols.drop(col_all_missing)] = self._order_data[remaining_cols.drop(col_all_missing)].fillna(0)
         self._order_data = self._order_data.infer_objects(copy=False)
         
     def _clean_shopping_data(self):
@@ -102,84 +107,113 @@ class DataProcess():
         for col in col_all_missing:
             self._shopping_data[col] = self._shopping_data[col].fillna(0).infer_objects(copy=False)
     
-    def _calculate_score(self):
-        """计算order记录的fens"""
-        self._order_score = self._order_data.copy()
+    def _generate_original_train_data(self, is_pos):
+        union_data = self._union_order_cust_product(is_pos)
+        scored_data = self._calculate_score(union_data)
+        labeled_data = self._labeled_data(scored_data)
+        
+        # labeled_data.to_csv(save_path, index=False)
+        return labeled_data
+        
+        
+    
+    def _generate_pos_train_data(self):
+        pos_data = self._generate_original_train_data(is_pos=True)
+        pos_data = pos_data[pos_data['YLT_TURNOVER_RATE'] != 0]
+        return pos_data
+        
+    
+    def _generate_shopping_train_data(self):
+        original_data = self._generate_original_train_data(is_pos=False)
+        cust_feats = self._shopping_data.set_index("cust_code")
+        
+        shopping_train_data = original_data.join(cust_feats, on="BB_RETAIL_CUSTOMER_CODE", how="inner")
+        return shopping_train_data
+    
+    def _union_order_cust_product(self, is_pos):
+        """联合order表、商户表、卷烟表"""
+        union_data = self._order_data.copy()
+        if not is_pos:
+            union_data.drop(OrderConfig.POSFEATURES, axis=1, inplace=True)
+        union_data.rename(columns={"PRODUCT_CODE": "product_code"}, inplace=True)
+        # union_data = union_data.drop(OrderConfig.POSFEATURES) # 去除pos数据特征字段
+        cust_feats = self._cust_data.set_index("BB_RETAIL_CUSTOMER_CODE")
+        product_feats = self._product_data.set_index("product_code")
+        
+        union_data = union_data.join(cust_feats, on="BB_RETAIL_CUSTOMER_CODE", how="inner")
+        union_data = union_data.join(product_feats, on="product_code", how="inner")
+        
+        return union_data
+        # self._train_data = shuffle(self._train_data, random_state=42)
+        
+    def _calculate_score(self, union_data):
+        """计算联合数据记录的分数"""
         # 对参与算分的特征值进行归一化
         scaler = MinMaxScaler()
-        self._order_score[list(OrderConfig.WEIGHTS.keys())] = scaler.fit_transform(self._order_score[list(OrderConfig.WEIGHTS.keys())])
+        union_data[list(OrderConfig.WEIGHTS.keys())] = scaler.fit_transform(union_data[list(OrderConfig.WEIGHTS.keys())])
         # 计算加权分数
-        self._order_score["score"] = sum(self._order_score[feat] * weight 
+        union_data["score"] = sum(union_data[feat] * weight 
                           for feat, weight in OrderConfig.WEIGHTS.items())
+        
+        return union_data
     
-    def labeled_data(self):
+    def _labeled_data(self, scored_data):
         """通过计算分数打标签"""
         # 按品规分组计算中位数
-        product_medians = self._order_score.groupby("PRODUCT_CODE")["score"].median().reset_index()
-        product_medians.columns = ["PRODUCT_CODE", "median_score"]
+        product_medians = scored_data.groupby("product_code")["score"].median().reset_index()
+        product_medians.columns = ["product_code", "median_score"]
         
         # 合并中位数到原始订单数据
-        self._order_score = pd.merge(self._order_score, product_medians, on="PRODUCT_CODE")
+        scored_data = pd.merge(scored_data, product_medians, on="product_code")
         
         # 生成标签 (1: 大于等于中位数, 0: 小于中位数)
-        self._order_score["label"] = np.where(
-            self._order_score["score"] >= self._order_score["median_score"], 1, 0
+        scored_data["label"] = np.where(
+            scored_data["score"] >= scored_data["median_score"], 1, 0
         )
-        self._order_score = self._order_score.sort_values("score", ascending=False)
-        self._order_score = self._order_score[["BB_RETAIL_CUSTOMER_CODE", "PRODUCT_CODE", "label"]]
-        self._order_score.rename(columns={"PRODUCT_CODE": "product_code"}, inplace=True)
-    
-    def _generate_train_data(self):
-        cust_feats = self._cust_data.set_index("BB_RETAIL_CUSTOMER_CODE")
-        product_feats = self._product_data.set_index("product_code")
+        scored_data = scored_data.sort_values("score", ascending=False).drop(columns=["median_score"])
         
-        self._train_data = self._order_score.copy()
-        
-        self._train_data = self._train_data.join(cust_feats, on="BB_RETAIL_CUSTOMER_CODE", how="left")
-        self._train_data = self._train_data.join(product_feats, on="product_code", how="left")
-        
-        self._train_data = shuffle(self._train_data, random_state=42)
-
-        self._train_data.to_csv(self._save_res_path, index=False)
+        scored_data = shuffle(scored_data, random_state=42)
+        return scored_data
     
-    def _descartes(self):
-        """将零售户信息与卷烟信息进行笛卡尔积连接"""
-        self._cust_data["descartes"] = 1
-        self._product_data["descartes"] = 1
-        
-        self._descartes_data = pd.merge(self._cust_data, self._product_data, on="descartes").drop("descartes", axis=1)
-        
-    def _labeled_data_from_descartes(self):
-        """根据order表信息给descartes_data数据打标签"""
-        # 获取order表中的正样本组合
-        order_combinations = self._order_data[["BB_RETAIL_CUSTOMER_CODE", "PRODUCT_CODE"]].drop_duplicates()
-        order_set = set(zip(order_combinations["BB_RETAIL_CUSTOMER_CODE"], order_combinations["PRODUCT_CODE"]))
-        
-        # 在descartes_data中打标签:正样本为1,负样本为0
-        self._descartes_data['label'] = self._descartes_data.apply(
-            lambda row: 1 if (row['BB_RETAIL_CUSTOMER_CODE'], row['product_code']) in order_set else 0, axis=1)
+    # def _descartes(self):
+    #     """将零售户信息与卷烟信息进行笛卡尔积连接"""
+    #     self._cust_data["descartes"] = 1
+    #     self._product_data["descartes"] = 1
+        
+    #     self._descartes_data = pd.merge(self._cust_data, self._product_data, on="descartes").drop("descartes", axis=1)
+        
+    # def _labeled_data_from_descartes(self):
+    #     """根据order表信息给descartes_data数据打标签"""
+    #     # 获取order表中的正样本组合
+    #     order_combinations = self._order_data[["BB_RETAIL_CUSTOMER_CODE", "PRODUCT_CODE"]].drop_duplicates()
+    #     order_set = set(zip(order_combinations["BB_RETAIL_CUSTOMER_CODE"], order_combinations["PRODUCT_CODE"]))
+        
+    #     # 在descartes_data中打标签:正样本为1,负样本为0
+    #     self._descartes_data['label'] = self._descartes_data.apply(
+    #         lambda row: 1 if (row['BB_RETAIL_CUSTOMER_CODE'], row['product_code']) in order_set else 0, axis=1)
     
-    def _generate_train_data_from_descartes(self):
-        """从descartes_data中生成训练数据"""
-        positive_samples = self._descartes_data[self._descartes_data["label"] == 1]
-        negative_samples = self._descartes_data[self._descartes_data["label"] == 0]
-        
-        positive_count = len(positive_samples)
-        negative_count = min(1 * positive_count, len(negative_samples))
-        print(positive_count)
-        print(negative_count)
-        
-        # 随机抽取2倍正样本数量的负样本
-        negative_samples_sampled = negative_samples.sample(n=negative_count, random_state=42)
-        # 合并正负样本
-        self._train_data = pd.concat([positive_samples, negative_samples_sampled], axis=0)
-        self._train_data = self._train_data.sample(frac=1, random_state=42).reset_index(drop=True)
-        
-        # 保存训练数据
-        self._train_data.to_csv(self._save_res_path, index=False)
+    # def _generate_train_data_from_descartes(self):
+    #     """从descartes_data中生成训练数据"""
+    #     positive_samples = self._descartes_data[self._descartes_data["label"] == 1]
+    #     negative_samples = self._descartes_data[self._descartes_data["label"] == 0]
+        
+    #     positive_count = len(positive_samples)
+    #     negative_count = min(1 * positive_count, len(negative_samples))
+    #     print(positive_count)
+    #     print(negative_count)
+        
+    #     # 随机抽取2倍正样本数量的负样本
+    #     negative_samples_sampled = negative_samples.sample(n=negative_count, random_state=42)
+    #     # 合并正负样本
+    #     self._train_data = pd.concat([positive_samples, negative_samples_sampled], axis=0)
+    #     self._train_data = self._train_data.sample(frac=1, random_state=42).reset_index(drop=True)
+        
+    #     # 保存训练数据
+    #     self._train_data.to_csv(self._save_res_path, index=False)
     
 if __name__ == '__main__':
     city_uuid = "00000000000000000000000011445301"
-    save_path = "./models/rank/data/gbdt_data.csv"
-    processor = DataProcess(city_uuid, save_path)
+    # city_uuid = "00000000000000000000000011441801"
+    save_dir = "./data"
+    processor = DataProcess(city_uuid, save_dir)
     processor.data_process()

+ 0 - 199
models/rank/data/preprocess_test.py

@@ -1,199 +0,0 @@
-from database import MySqlDao
-from models.rank.data.config import CustConfig, ProductConfig, OrderConfig
-import os
-import pandas as pd
-from sklearn.preprocessing import MinMaxScaler
-from sklearn.utils import shuffle
-import numpy as np
-
-class DataProcess():
-    def __init__(self, city_uuid, save_path):
-        self._mysql_dao = MySqlDao()
-        self._save_res_path = save_path
-        print("正在加载cust_info...")
-        self._cust_data = self._mysql_dao.load_cust_data(city_uuid)
-        print("正在加载product_info...")
-        self._product_data = self._mysql_dao.load_product_data(city_uuid)
-        print("正在加载order_info...")
-        # self._order_data = self._mysql_dao.load_cust_data(city_uuid)
-        self._order_data = self._mysql_dao.load_mock_order_data()
-        print("正在加载shopping_info...")
-        self._shopping_data = self._mysql_dao.load_shopping_data(city_uuid)
-        
-    def data_process(self):
-        """数据预处理"""
-        if os.path.exists(self._save_res_path):
-            os.remove(self._save_res_path)
-        
-        # 1. 获取指定的特征组合
-        self._cust_data = self._cust_data[CustConfig.FEATURE_COLUMNS]
-        self._product_data = self._product_data[ProductConfig.FEATURE_COLUMNS]
-        self._order_data = self._order_data[OrderConfig.FEATURE_COLUMNS]
-        
-        # 2. 数据清洗
-        self._clean_cust_data()
-        self._clean_product_data()
-        self._clean_order_data()
-        self._clean_shopping_data()
-        
-        # # 3. 将零售户信息表与卷烟信息表进行笛卡尔积连接
-        # self._descartes()
-        
-        # # 4. 根据order表中的信息给数据打标签
-        # self._labeled_data()
-        
-        # 3. 根据特征权重给order表中的记录打分
-        self._calculate_score()
-        
-        # 4. 根据中位数打标签
-        self.labeled_data()
-        
-        # 5. 选取训练样本
-        self._generate_train_data()
-        
-    
-    def _clean_cust_data(self):
-        """用户信息表数据清洗"""
-        # 根据配置规则清洗数据
-        for feature, rules, in CustConfig.CLEANING_RULES.items():
-            if rules["type"] == "num":
-                # 先将数值型字符串转换为数值
-                self._cust_data[feature] = pd.to_numeric(self._cust_data[feature], errors="coerce")
-                
-            if rules["method"] == "fillna":
-                if rules["opt"] == "fill":
-                    self._cust_data[feature] = self._cust_data[feature].fillna(rules["value"])
-                elif rules["opt"] == "replace":
-                    self._cust_data[feature] = self._cust_data[feature].fillna(self._cust_data[rules["value"]])
-                elif rules["opt"] == "mean":
-                    self._cust_data[feature] = self._cust_data[feature].fillna(self._cust_data[feature].mean())
-                self._cust_data[feature] = self._cust_data[feature].infer_objects(copy=False)
-    
-    def _clean_product_data(self):
-        """卷烟信息表数据清洗"""
-        for feature, rules, in ProductConfig.CLEANING_RULES.items():
-            if rules["type"] == "num":
-                self._product_data[feature] = pd.to_numeric(self._product_data[feature], errors="coerce")
-            
-            if rules["method"] == "fillna":
-                if rules["opt"] == "fill":
-                    self._product_data[feature] = self._product_data[feature].fillna(rules["value"])
-                elif rules["opt"] == "mean":
-                    self._product_data[feature] = self._product_data[feature].fillna(self._product_data[feature].mean())
-                self._product_data[feature] = self._product_data[feature].infer_objects(copy=False)
-                    
-    def _clean_order_data(self):
-        # 去除重复值和填补缺失值
-        self._order_data.drop_duplicates(inplace=True)
-        self._order_data.fillna(0, inplace=True)
-        self._order_data = self._order_data.infer_objects(copy=False)
-        
-    def _clean_shopping_data(self):
-        """处理商圈数据缺省值"""
-        self._shopping_data.drop(["cust_uuid", "longitude", "latitude", "range_radius"], axis=1, inplace=True)
-        remaining_cols = self._shopping_data.columns.drop(["city_uuid", "cust_code"])
-        col_with_missing = remaining_cols[self._shopping_data[remaining_cols].isnull().any()].tolist() # 判断有缺失的字段
-        col_all_missing = remaining_cols[self._shopping_data[remaining_cols].isnull().all()].to_list() # 全部缺失的字段
-        col_partial_missing = list(set(col_with_missing) - set(col_all_missing)) # 部分缺失的字段
-        
-        for col in col_partial_missing:
-            self._shopping_data[col] = self._shopping_data[col].fillna(self._shopping_data[col].mean())
-        
-        for col in col_all_missing:
-            self._shopping_data[col] = self._shopping_data[col].fillna(0).infer_objects(copy=False)
-    
-    def _generate_original_train_data(self):
-        union_data = self._union_order_cust_product()
-        scored_data = self._calculate_score(union_data)
-        
-        
-    
-    def _generate_pos_train_data(self):
-        pass
-    
-    def _generate_shopping_train_data(self):
-        pass
-    
-    def _union_order_cust_product(self):
-        """联合order表、商户表、卷烟表"""
-        union_data = self._order_score.copy()
-        union_data.rename(columns={"PRODUCT_CODE": "product_code"}, inplace=True)
-        union_data = union_data.drop(["YLT_TURNOVER_RATE", "YLT_BAR_PACKAGE_SALE_OCC", "POS_PACKAGE_PRICE"])
-        cust_feats = self._cust_data.set_index("BB_RETAIL_CUSTOMER_CODE")
-        product_feats = self._product_data.set_index("product_code")
-        
-        union_data = union_data.join(cust_feats, on="BB_RETAIL_CUSTOMER_CODE", how="inner")
-        union_data = union_data.join(product_feats, on="product_code", how="inner")
-        
-        return union_data
-        # self._train_data = shuffle(self._train_data, random_state=42)
-        
-    def _calculate_score(self, union_data):
-        """计算联合数据记录的分数"""
-        # 对参与算分的特征值进行归一化
-        scaler = MinMaxScaler()
-        union_data[list(OrderConfig.WEIGHTS.keys())] = scaler.fit_transform(union_data[list(OrderConfig.WEIGHTS.keys())])
-        # 计算加权分数
-        union_data["score"] = sum(self._order_score[feat] * weight 
-                          for feat, weight in OrderConfig.WEIGHTS.items())
-        
-        return union_data
-    
-    def labeled_data(self, scored_data):
-        """通过计算分数打标签"""
-        # 按品规分组计算中位数
-        product_medians = scored_data("PRODUCT_CODE")["score"].median().reset_index()
-        product_medians.columns = ["PRODUCT_CODE", "median_score"]
-        
-        # 合并中位数到原始订单数据
-        temp_data = pd.merge(scored_data, product_medians, on="PRODUCT_CODE")
-        
-        # 生成标签 (1: 大于等于中位数, 0: 小于中位数)
-        self._order_score["label"] = np.where(
-            self._order_score["score"] >= self._order_score["median_score"], 1, 0
-        )
-        self._order_score = self._order_score.sort_values("score", ascending=False)
-        self._order_score = self._order_score[["BB_RETAIL_CUSTOMER_CODE", "PRODUCT_CODE", "label"]]
-        self._order_score.rename(columns={"PRODUCT_CODE": "product_code"}, inplace=True)
-    
-    # def _descartes(self):
-    #     """将零售户信息与卷烟信息进行笛卡尔积连接"""
-    #     self._cust_data["descartes"] = 1
-    #     self._product_data["descartes"] = 1
-        
-    #     self._descartes_data = pd.merge(self._cust_data, self._product_data, on="descartes").drop("descartes", axis=1)
-        
-    # def _labeled_data_from_descartes(self):
-    #     """根据order表信息给descartes_data数据打标签"""
-    #     # 获取order表中的正样本组合
-    #     order_combinations = self._order_data[["BB_RETAIL_CUSTOMER_CODE", "PRODUCT_CODE"]].drop_duplicates()
-    #     order_set = set(zip(order_combinations["BB_RETAIL_CUSTOMER_CODE"], order_combinations["PRODUCT_CODE"]))
-        
-    #     # 在descartes_data中打标签:正样本为1,负样本为0
-    #     self._descartes_data['label'] = self._descartes_data.apply(
-    #         lambda row: 1 if (row['BB_RETAIL_CUSTOMER_CODE'], row['product_code']) in order_set else 0, axis=1)
-    
-    # def _generate_train_data_from_descartes(self):
-    #     """从descartes_data中生成训练数据"""
-    #     positive_samples = self._descartes_data[self._descartes_data["label"] == 1]
-    #     negative_samples = self._descartes_data[self._descartes_data["label"] == 0]
-        
-    #     positive_count = len(positive_samples)
-    #     negative_count = min(1 * positive_count, len(negative_samples))
-    #     print(positive_count)
-    #     print(negative_count)
-        
-    #     # 随机抽取2倍正样本数量的负样本
-    #     negative_samples_sampled = negative_samples.sample(n=negative_count, random_state=42)
-    #     # 合并正负样本
-    #     self._train_data = pd.concat([positive_samples, negative_samples_sampled], axis=0)
-    #     self._train_data = self._train_data.sample(frac=1, random_state=42).reset_index(drop=True)
-        
-    #     # 保存训练数据
-    #     self._train_data.to_csv(self._save_res_path, index=False)
-    
-if __name__ == '__main__':
-    city_uuid = "00000000000000000000000011445301"
-    save_path = "./models/rank/data/gbdt_data.csv"
-    processor = DataProcess(city_uuid, save_path)
-    processor.data_process()

+ 3 - 1
models/rank/gbdt_lr_sort.py

@@ -109,6 +109,8 @@ class GbdtLrModel:
                 cust_features_importance.append({ImportanceFeaturesMap.CUSTOM_FEATRUES_MAP[feat]: float(importance)})
             if feat in list(ImportanceFeaturesMap.PRODUCT_FEATRUES_MAP.keys()):
                 product_features_importance.append({ImportanceFeaturesMap.PRODUCT_FEATRUES_MAP[feat]: float(importance)})
+            if feat in list(ImportanceFeaturesMap.SHOPING_FEATURES_MAP.keys()):
+                product_features_importance.append({ImportanceFeaturesMap.SHOPING_FEATURES_MAP[feat]: float(importance)})
         return cust_features_importance, product_features_importance
     
 if __name__ == "__main__":
@@ -116,7 +118,7 @@ if __name__ == "__main__":
     city_uuid = "00000000000000000000000011445301"
     product_id = "110102"
     gbdt_sort = GbdtLrModel(model_path)
-    gbdt_sort.sort(city_uuid, product_id)
+    # gbdt_sort.sort(city_uuid, product_id)
     
     importances = gbdt_sort.generate_feats_importance()
     for importance in importances: