1 tahun lalu · 47c8e4c5b5
--- a/dao/dao.py
+++ b/dao/dao.py
@@ -3,16 +3,16 @@ from dao import Mysql
 
				 def load_order_data_from_mysql(city_uuid):
			
 
				     """从数据库中读取订单数据"""
			
 
				     client = Mysql()
			
 
				-    tablename = "tads_brandcul_cust_order"
			
 
				+    tablename = "yunfu_mock_data"
			
 
				     query_text = "*"
			
 
				     
			
 
				-    df = client.load_data(tablename, query_text, "city_uuid", city_uuid)
			
 
				+    # df = client.load_data(tablename, query_text, "city_uuid", city_uuid)
			
 
				+    df = client.load_mock_data(tablename, query_text)
			
 
				     if len(df) == 0:
			
 
				         return None
			
 
				     
			
 
				-    df.drop('stat_month', axis=1, inplace=True)
			
 
				-    df.drop('city_uuid', axis=1, inplace=True)
			
 
				-    print(df.columns)
			
 
				+    # df.drop('stat_month', axis=1, inplace=True)
			
 
				+    # df.drop('city_uuid', axis=1, inplace=True)
			
 
				     
			
 
				      # 去除重复值和填补缺失值
			
 
				     df.drop_duplicates(inplace=True)
			
@@ -44,5 +44,5 @@ def load_product_data_from_mysql(city_uuid):
 
				     return df
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				-    data = load_product_data_from_mysql("00000000000000000000000011445301")
			
 
				+    data = load_order_data_from_mysql("00000000000000000000000011445301")
			
 
				     print(data)
			
--- a/dao/mysql_client.py
+++ b/dao/mysql_client.py
@@ -71,6 +71,33 @@ class Mysql(object):
 
				         finally:
			
 
				             self.closed()
			
 
				             return total_df
			
 
				+        
			
 
    def load_mock_data(self, tablename, query_text, page=1, page_size=1000):
        """Read a whole table page by page and return it as one DataFrame.

        Pages of ``page_size`` rows are fetched with ``LIMIT``/``OFFSET``,
        starting at ``page`` (1-based), until a page comes back empty.

        Args:
            tablename: Table to read. Interpolated directly into the SQL text,
                so it must come from trusted code, never from user input.
            query_text: Column list for the ``SELECT`` clause (e.g. ``"*"``).
                Also interpolated directly; same trust requirement.
            page: 1-based page number to start from.
            page_size: Number of rows requested per page.

        Returns:
            A DataFrame with all rows fetched. If an error occurs part-way,
            the error is printed and the rows fetched so far are returned
            (an empty DataFrame if nothing was fetched), so callers can keep
            using ``len(df) == 0`` as the "no data" check.
        """
        # Collect the pages and concatenate once at the end; concatenating
        # inside the loop re-copies every previously fetched row per page.
        frames = []
        total_rows = 0

        try:
            while True:
                offset = (page - 1) * page_size  # rows to skip for this page
                # NOTE(review): there is no ORDER BY, so the database is not
                # obliged to return pages in a stable order - confirm this is
                # acceptable for the mock table.
                query = text(f"SELECT {query_text} FROM {tablename} LIMIT :limit OFFSET :offset")

                with self.create_session() as session:
                    results = session.execute(query, {"limit": page_size, "offset": offset}).fetchall()
                    df = pd.DataFrame(results)

                if df.empty:
                    break

                frames.append(df)
                total_rows += len(df)
                print(f"Page {page}: Retrieved {len(df)} rows, Total rows so far: {total_rows}")
                page += 1  # move on to the next page

        except Exception as e:
            # Best effort: report the failure and fall through to return
            # whatever was fetched before it happened.
            print(f"Error: {e}")

        finally:
            # Only clean up here. A ``return`` inside ``finally`` would also
            # swallow KeyboardInterrupt/SystemExit and override other returns.
            self.closed()

        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, ignore_index=True)
			
 
				     
			
 
				     def closed(self):
			
 
				         """关闭连接，回收资源"""
			
--- a/models/rank/__init__.py
+++ b/models/rank/__init__.py
@@ -1,2 +1,7 @@
 
				 #!/usr/bin/env python3
			
 
				 # -*- coding:utf-8 -*-
			
 
				+from models.rank.data.preprocess import DataProcess
			
 
				+
			
 
				+__all__ = [
			
 
				+    "DataProcess"
			
 
				+]
			
--- a/models/rank/data/__init__.py
+++ b/models/rank/data/__init__.py
@@ -0,0 +1,5 @@
 
				+from models.rank.data.config import CustConfig, ProductConfig
			
 
				+__all__ = [
			
 
				+    "CustConfig",
			
 
				+    "ProductConfig"
			
 
				+]
			
--- a/models/rank/data/config.py
+++ b/models/rank/data/config.py
@@ -0,0 +1,109 @@
 
				+class CustConfig:
			
 
				+    FEATURE_COLUMNS = [
			
 
				+        "BB_RETAIL_CUSTOMER_CODE",                     # 零售户代码
			
 
				+        "BB_RTL_CUST_POSITION_TYPE_NAME",              # 零售户商圈类型名称
			
 
				+        "BB_RTL_CUST_MARKET_TYPE_NAME",                # 零售户市场类型名称
			
 
				+        # "BB_RTL_CUST_BUSINESS_TYPE_NAME",              # 零售户业态名称
			
 
				+        "BB_RTL_CUST_SUB_BUSI_PLACE_NAME",             # 零售户业态细分名称
			
 
				+        "BB_RTL_CUST_GRADE_NAME",                      # 零售户分档名称
			
 
				+        # "BB_RTL_CUST_TERMINAL_LEVEL_NAME",             # 零售户终端层级名称
			
 
				+        # "BB_RTL_CUST_TERMINALEVEL_NAME",               # 零售户终端层级细分名称
			
 
				+        # "MD04_MG_RTL_CUST_CREDITCLASS_NAME",           # 零售户信用等级名称
			
 
				+        # "MD04_MG_SAMPLE_CUST_FLAG",                    # 样本户标识
			
 
				+        # "MD07_RTL_CUST_IS_SALE_LARGE_FLAG",            # 零售户大户标识
			
 
				+        # "BB_CUSTOMER_MANAGER_SCOPE_NAME",              # 零售户经营范围名称
			
 
				+        # "BB_RTL_CUST_OPERATE_METHOD_NAME",             # 零售户经营方式名称
			
 
				+        # "BB_RTL_CUST_CGT_OPERATE_SCOPE_NAME",          # 零售户卷烟经营规模名称
			
 
				+        "BB_RTL_CUST_CHAIN_FLAG",                      # 零售户连锁标识
			
 
				+        "MD04_DIR_SAL_STORE_FLAG",                     # 直营店标识
			
 
				+        "STORE_AREA",                                  # 店铺经营面积
			
 
				+        "OPERATOR_AGE",                                # 经营者年龄
			
 
				+        "OPERATOR_EDU_LEVEL",                          # 零售客户经营者文化程
			
 
				+        # "AVERAGE_CONSUMER_FLOW",                       # 月均消费人流
			
 
				+        # "NEW_PRODUCT_MEMBERS_QTY",                     # 新品消费会员数量
			
 
				+    ]
			
 
				+    # 数据清洗规则
			
 
				+    CLEANING_RULES = {
			
 
				+        "BB_RTL_CUST_POSITION_TYPE_NAME":           {"method": "fillna", "opt": "fill", "value": "其它", "type": "str"},
			
 
				+        "BB_RTL_CUST_MARKET_TYPE_NAME":             {"method": "fillna", "opt": "fill", "value": "其它", "type": "str"},
			
 
				+        "BB_RTL_CUST_SUB_BUSI_PLACE_NAME":          {"method": "fillna", "opt": "fill", "value": "其它", "type": "str"},
			
 
				+        # "BB_RTL_CUST_TERMINALEVEL_NAME":          {"method": "fillna", "opt": "replace", "value": "BB_RTL_CUST_TERMINAL_LEVEL_NAME", "type": "str"},
			
 
				+        # "MD04_MG_RTL_CUST_CREDITCLASS_NAME":        {"method": "fillna", "opt": "fill", "value": "未评价", "type": "str"},
			
 
				+        # "MD04_MG_SAMPLE_CUST_FLAG":                 {"method": "fillna", "value": "N", "opt": "fill"},
			
 
				+        # "MD07_RTL_CUST_IS_SALE_LARGE_FLAG":         {"method": "fillna", "value": "N", "opt": "fill"},
			
 
				+        "BB_RTL_CUST_CHAIN_FLAG":                   {"type": "fillna", "opt": "fill", "value": "0", "type": "str"},
			
 
				+        # "BB_RTL_CUST_CGT_OPERATE_SCOPE_NAME":       {"method": "fillna", "value": "中", "opt": "fill"},
			
 
				+        "BB_RTL_CUST_CHAIN_FLAG":                   {"method": "fillna", "opt": "fill", "value": "0", "type": "str"},
			
 
				+        "MD04_DIR_SAL_STORE_FLAG":                  {"method": "fillna", "opt": "fill", "value": "0", "type": "str"},
			
 
				+        "STORE_AREA":                               {"method": "fillna", "opt": "mean", "type": "num"},
			
 
				+        "OPERATOR_AGE":                             {"method": "fillna", "opt": "mean", "type": "num"},
			
 
				+        "OPERATOR_EDU_LEVEL":                       {"method": "fillna", "opt": "fill", "value": "00", "type": "str"},
			
 
				+    }
			
 
				+    # one-hot编码
			
 
				+    
			
 
				+    
			
 
				+class ProductConfig:
			
 
				+    FEATURE_COLUMNS = [
			
 
				+        "product_code",                                # 商品编码
			
 
				+        # "factory_name",                                # 工业公司名称
			
 
				+        # "brand_code",                                  # 品牌编码
			
 
				+        "adjust_price",                                # 含税调拨价
			
 
				+        "notwithtax_adjust_price",                     # 不含税调拨价
			
 
				+        "whole_sale_price",                            # 统一批发价
			
 
				+        "direct_retail_price",                         # 建议零售价
			
 
				+        "allot_price",                                 # 调拨价
			
 
				+        "direct_whole_price",                          # 批发指导价
			
 
				+        "retail_price",                                # 零售价
			
 
				+        "price_type_name",                             # 卷烟价类名称
			
 
				+        "gear_type_name",                              # 卷烟档位名称
			
 
				+        "category_type_name",                          # 卷烟品类名称
			
 
				+        "is_key_brand",                                # 是否重点品牌
			
 
				+        "is_high_level",                               # 是否高端烟
			
 
				+        "is_upscale_level",                            # 是否高端烟不含高价
			
 
				+        "is_high_price",                               # 是否高价烟
			
 
				+        "is_low_price",                                # 是否低价烟
			
 
				+        "is_low_tar",                                  # 是否低焦油烟
			
 
				+        "is_encourage",                                # 是否全国鼓励品牌
			
 
				+        "is_abnormity",                                # 是否异形包装
			
 
				+        "is_intake",                                   # 是否进口烟
			
 
				+        "is_short",                                    # 是否紧俏品牌
			
 
				+        "is_medium",                                   # 是否中支烟
			
 
				+        "is_shortbranch",                              # 是否短支烟
			
 
				+        "is_ordinary_price_type",                      # 是否普一类烟
			
 
				+        "source_type",                                 # 来源类型
			
 
				+        "tar_qty",                                     # 焦油含量
			
 
				+        "product_style_code_name",                     # 包装类型名称
			
 
				+        "chinese_mix",                                 # 中式混合
			
 
				+        "sub_price_type_name",                         # 细分卷烟价类名称
			
 
				+    ]
			
 
				+    
			
 
				+    CLEANING_RULES = {
			
 
				+        "adjust_price":                                {"method": "fillna", "opt": "mean", "type": "num"},
			
 
				+        "notwithtax_adjust_price":                     {"method": "fillna", "opt": "mean", "type": "num"},
			
 
				+        "whole_sale_price":                            {"method": "fillna", "opt": "mean", "type": "num"},
			
 
				+        "direct_retail_price":                         {"method": "fillna", "opt": "mean", "type": "num"},
			
 
				+        "allot_price":                                 {"method": "fillna", "opt": "fill", "type": "num", "value": 0.0},
			
 
				+        "direct_whole_price":                          {"method": "fillna", "opt": "mean", "type": "num"},
			
 
				+        "retail_price":                                {"method": "fillna", "opt": "mean", "type": "num"},
			
 
				+        "price_type_name":                             {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
			
 
				+        "gear_type_name":                              {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
			
 
				+        "category_type_name":                          {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
			
 
				+        "is_key_brand":                                {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
			
 
				+        "is_high_level":                               {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
			
 
				+        "is_upscale_level":                            {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
			
 
				+        "is_high_price":                               {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
			
 
				+        "is_low_price":                                {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
			
 
				+        "is_low_tar":                                  {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
			
 
				+        "is_encourage":                                {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
			
 
				+        "is_abnormity":                                {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
			
 
				+        "is_intake":                                   {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
			
 
				+        "is_short":                                    {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
			
 
				+        "is_medium":                                   {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
			
 
				+        "is_shortbranch":                              {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
			
 
				+        "is_ordinary_price_type":                      {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
			
 
				+        "source_type":                                 {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
			
 
				+        "tar_qty":                                     {"method": "fillna", "opt": "mean", "type": "num"},
			
 
				+        "product_style_code_name":                     {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
			
 
				+        "chinese_mix":                                 {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
			
 
				+        "sub_price_type_name":                         {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
			
 
				+    }
			
--- a/models/rank/data/preprocess.py
+++ b/models/rank/data/preprocess.py
@@ -0,0 +1,101 @@
 
				+from dao.dao import load_cust_data_from_mysql, load_product_data_from_mysql, load_order_data_from_mysql
			
 
				+from models.rank.data.config import CustConfig, ProductConfig
			
 
				+import pandas as pd
			
 
				+
			
 
				+class DataProcess():
			
 
				+    def __init__(self, city_uuid):
			
 
				+        print("正在加载cust_info...")
			
 
				+        self._cust_data = load_cust_data_from_mysql(city_uuid)
			
 
				+        print("正在加载product_info...")
			
 
				+        self._product_data = load_product_data_from_mysql(city_uuid)
			
 
				+        print("正在加载order_info...")
			
 
				+        self._order_data = load_order_data_from_mysql(city_uuid)
			
 
				+        
			
 
				+    def data_process(self):
			
 
				+        """数据预处理"""
			
 
				+        
			
 
				+        # 1. 获取指定的特征组合
			
 
				+        self._cust_data = self._cust_data[CustConfig.FEATURE_COLUMNS]
			
 
				+        self._product_data = self._product_data[ProductConfig.FEATURE_COLUMNS]
			
 
				+        
			
 
				+        # 2. 数据清洗
			
 
				+        self._clean_cust_data()
			
 
				+        self._clean_product_data()
			
 
				+        
			
 
				+        # 3. 将零售户信息表与卷烟信息表进行笛卡尔积连接
			
 
				+        self._descartes()
			
 
				+        
			
 
				+        # 4. 根据order表中的信息给数据打标签
			
 
				+        self._labeled_data()
			
 
				+        
			
 
				+        # 5. 选取训练样本
			
 
				+        self._generate_train_data()
			
 
				+        
			
 
				+    
			
 
				+    def _clean_cust_data(self):
			
 
				+        """用户信息表数据清洗"""
			
 
				+        # 根据配置规则清洗数据
			
 
				+        for feature, rules, in CustConfig.CLEANING_RULES.items():
			
 
				+            if rules["type"] == "num":
			
 
				+                # 先将数值型字符串转换为数值
			
 
				+                self._cust_data[feature] = pd.to_numeric(self._cust_data[feature], errors="coerce")
			
 
				+                
			
 
				+            if rules["method"] == "fillna":
			
 
				+                if rules["opt"] == "fill":
			
 
				+                    self._cust_data[feature] = self._cust_data[feature].fillna(rules["value"])
			
 
				+                elif rules["opt"] == "replace":
			
 
				+                    self._cust_data[feature] = self._cust_data[feature].fillna(self._cust_data[rules["value"]])
			
 
				+                elif rules["opt"] == "mean":
			
 
				+                    self._cust_data[feature] = self._cust_data[feature].fillna(self._cust_data[feature].mean())
			
 
				+    
			
 
				+    def _clean_product_data(self):
			
 
				+        """卷烟信息表数据清洗"""
			
 
				+        for feature, rules, in ProductConfig.CLEANING_RULES.items():
			
 
				+            if rules["type"] == "num":
			
 
				+                self._product_data[feature] = pd.to_numeric(self._product_data[feature], errors="coerce")
			
 
				+            
			
 
				+            if rules["method"] == "fillna":
			
 
				+                if rules["opt"] == "fill":
			
 
				+                    self._product_data[feature] = self._product_data[feature].fillna(rules["value"])
			
 
				+                elif rules["opt"] == "mean":
			
 
				+                    self._product_data[feature] = self._product_data[feature].fillna(self._product_data[feature].mean())
			
 
				+    
			
 
				+    def _descartes(self):
			
 
				+        """将零售户信息与卷烟信息进行笛卡尔积连接"""
			
 
				+        self._cust_data["descartes"] = 1
			
 
				+        self._product_data["descartes"] = 1
			
 
				+        
			
 
				+        self._descartes_data = pd.merge(self._cust_data, self._product_data, on="descartes").drop("descartes", axis=1)
			
 
				+        
			
 
				+    def _labeled_data(self):
			
 
				+        """根据order表信息给descartes_data数据打标签"""
			
 
				+        # 获取order表中的正样本组合
			
 
				+        order_combinations = self._order_data[["BB_RETAIL_CUSTOMER_CODE", "PRODUCT_CODE"]].drop_duplicates()
			
 
				+        order_set = set(zip(order_combinations["BB_RETAIL_CUSTOMER_CODE"], order_combinations["PRODUCT_CODE"]))
			
 
				+        
			
 
				+        # 在descartes_data中打标签：正样本为1，负样本为2
			
 
				+        self._descartes_data['label'] = self._descartes_data.apply(
			
 
				+            lambda row: 1 if (row['BB_RETAIL_CUSTOMER_CODE'], row['product_code']) in order_set else 0, axis=1)
			
 
				+    
			
 
				+    def _generate_train_data(self):
			
 
				+        """从descartes_data中生成训练数据"""
			
 
				+        positive_samples = self._descartes_data[self._descartes_data["label"] == 1]
			
 
				+        negative_samples = self._descartes_data[self._descartes_data["label"] == 0]
			
 
				+        
			
 
				+        positive_count = len(positive_samples)
			
 
				+        negative_count = min(2 * positive_count, len(negative_samples))
			
 
				+        print(positive_count)
			
 
				+        print(negative_count)
			
 
				+        
			
 
				+        # 随机抽取2倍正样本数量的负样本
			
 
				+        negative_samples_sampled = negative_samples.sample(n=negative_count, random_state=42)
			
 
				+        # 合并正负样本
			
 
				+        self._train_data = pd.concat([positive_samples, negative_samples_sampled], axis=0)
			
 
				+        self._train_data = self._train_data.sample(frac=1, random_state=42).reset_index(drop=True)
			
 
				+        
			
 
				+        # 保存训练数据
			
 
				+        self._train_data.to_csv("./models/rank/data/gbdt_data.csv", index=False)
			
 
				+    
			
 
				+if __name__ == '__main__':
			
 
				+    processor = DataProcess("00000000000000000000000011445301")
			
 
				+    processor.data_process()