Jelajahi Sumber

gbdt-lr数据预处理

Sherlock 1 tahun lalu
induk
melakukan
47c8e4c5b5

+ 6 - 6
dao/dao.py

@@ -3,16 +3,16 @@ from dao import Mysql
 def load_order_data_from_mysql(city_uuid):
     """从数据库中读取订单数据"""
     client = Mysql()
-    tablename = "tads_brandcul_cust_order"
+    tablename = "yunfu_mock_data"
     query_text = "*"
     
-    df = client.load_data(tablename, query_text, "city_uuid", city_uuid)
+    # df = client.load_data(tablename, query_text, "city_uuid", city_uuid)
+    df = client.load_mock_data(tablename, query_text)
     if len(df) == 0:
         return None
     
-    df.drop('stat_month', axis=1, inplace=True)
-    df.drop('city_uuid', axis=1, inplace=True)
-    print(df.columns)
+    # df.drop('stat_month', axis=1, inplace=True)
+    # df.drop('city_uuid', axis=1, inplace=True)
     
      # 去除重复值和填补缺失值
     df.drop_duplicates(inplace=True)
@@ -44,5 +44,5 @@ def load_product_data_from_mysql(city_uuid):
     return df
 
 if __name__ == '__main__':
-    data = load_product_data_from_mysql("00000000000000000000000011445301")
+    data = load_order_data_from_mysql("00000000000000000000000011445301")
     print(data)

+ 27 - 0
dao/mysql_client.py

@@ -71,6 +71,33 @@ class Mysql(object):
         finally:
             self.closed()
             return total_df
+        
+    def load_mock_data(self, tablename, query_text, page=1, page_size=1000):
+        # 创建一个空的DataFrame用于存储所有数据
+        total_df = pd.DataFrame()
+    
+        try:
+            while True:
+                offset = (page - 1) * page_size  # 计算偏移量
+                query = text(f"SELECT {query_text} FROM {tablename} LIMIT :limit OFFSET :offset")
+    
+                with self.create_session() as session:
+                    results = session.execute(query, { "limit": page_size, "offset": offset}).fetchall()
+                    df = pd.DataFrame(results)
+                if df.empty:
+                    break
+            
+                total_df = pd.concat([total_df, df], ignore_index=True)
+                print(f"Page {page}: Retrieved {len(df)} rows, Total rows so far: {len(total_df)}")
+                page += 1  # 继续下一页
+                
+        except Exception as e:
+            print(f"Error: {e}")
+            return None
+        
+        finally:
+            self.closed()
+            return total_df
     
     def closed(self):
         """关闭连接,回收资源"""

+ 5 - 0
models/rank/__init__.py

@@ -1,2 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding:utf-8 -*-
+from models.rank.data.preprocess import DataProcess
+
+__all__ = [
+    "DataProcess"
+]

+ 5 - 0
models/rank/data/__init__.py

@@ -0,0 +1,5 @@
+from models.rank.data.config import CustConfig, ProductConfig
+__all__ = [
+    "CustConfig",
+    "ProductConfig"
+]

+ 109 - 0
models/rank/data/config.py

@@ -0,0 +1,109 @@
+class CustConfig:
+    FEATURE_COLUMNS = [
+        "BB_RETAIL_CUSTOMER_CODE",                     # 零售户代码
+        "BB_RTL_CUST_POSITION_TYPE_NAME",              # 零售户商圈类型名称
+        "BB_RTL_CUST_MARKET_TYPE_NAME",                # 零售户市场类型名称
+        # "BB_RTL_CUST_BUSINESS_TYPE_NAME",              # 零售户业态名称
+        "BB_RTL_CUST_SUB_BUSI_PLACE_NAME",             # 零售户业态细分名称
+        "BB_RTL_CUST_GRADE_NAME",                      # 零售户分档名称
+        # "BB_RTL_CUST_TERMINAL_LEVEL_NAME",             # 零售户终端层级名称
+        # "BB_RTL_CUST_TERMINALEVEL_NAME",               # 零售户终端层级细分名称
+        # "MD04_MG_RTL_CUST_CREDITCLASS_NAME",           # 零售户信用等级名称
+        # "MD04_MG_SAMPLE_CUST_FLAG",                    # 样本户标识
+        # "MD07_RTL_CUST_IS_SALE_LARGE_FLAG",            # 零售户大户标识
+        # "BB_CUSTOMER_MANAGER_SCOPE_NAME",              # 零售户经营范围名称
+        # "BB_RTL_CUST_OPERATE_METHOD_NAME",             # 零售户经营方式名称
+        # "BB_RTL_CUST_CGT_OPERATE_SCOPE_NAME",          # 零售户卷烟经营规模名称
+        "BB_RTL_CUST_CHAIN_FLAG",                      # 零售户连锁标识
+        "MD04_DIR_SAL_STORE_FLAG",                     # 直营店标识
+        "STORE_AREA",                                  # 店铺经营面积
+        "OPERATOR_AGE",                                # 经营者年龄
+        "OPERATOR_EDU_LEVEL",                          # 零售客户经营者文化程
+        # "AVERAGE_CONSUMER_FLOW",                       # 月均消费人流
+        # "NEW_PRODUCT_MEMBERS_QTY",                     # 新品消费会员数量
+    ]
+    # 数据清洗规则
+    CLEANING_RULES = {
+        "BB_RTL_CUST_POSITION_TYPE_NAME":           {"method": "fillna", "opt": "fill", "value": "其它", "type": "str"},
+        "BB_RTL_CUST_MARKET_TYPE_NAME":             {"method": "fillna", "opt": "fill", "value": "其它", "type": "str"},
+        "BB_RTL_CUST_SUB_BUSI_PLACE_NAME":          {"method": "fillna", "opt": "fill", "value": "其它", "type": "str"},
+        # "BB_RTL_CUST_TERMINALEVEL_NAME":          {"method": "fillna", "opt": "replace", "value": "BB_RTL_CUST_TERMINAL_LEVEL_NAME", "type": "str"},
+        # "MD04_MG_RTL_CUST_CREDITCLASS_NAME":        {"method": "fillna", "opt": "fill", "value": "未评价", "type": "str"},
+        # "MD04_MG_SAMPLE_CUST_FLAG":                 {"method": "fillna", "value": "N", "opt": "fill"},
+        # "MD07_RTL_CUST_IS_SALE_LARGE_FLAG":         {"method": "fillna", "value": "N", "opt": "fill"},
+        "BB_RTL_CUST_CHAIN_FLAG":                   {"type": "fillna", "opt": "fill", "value": "0", "type": "str"},
+        # "BB_RTL_CUST_CGT_OPERATE_SCOPE_NAME":       {"method": "fillna", "value": "中", "opt": "fill"},
+        "BB_RTL_CUST_CHAIN_FLAG":                   {"method": "fillna", "opt": "fill", "value": "0", "type": "str"},
+        "MD04_DIR_SAL_STORE_FLAG":                  {"method": "fillna", "opt": "fill", "value": "0", "type": "str"},
+        "STORE_AREA":                               {"method": "fillna", "opt": "mean", "type": "num"},
+        "OPERATOR_AGE":                             {"method": "fillna", "opt": "mean", "type": "num"},
+        "OPERATOR_EDU_LEVEL":                       {"method": "fillna", "opt": "fill", "value": "00", "type": "str"},
+    }
+    # one-hot编码
+    
+    
+class ProductConfig:
+    """Feature selection and missing-value rules for the product (cigarette) table.
+
+    FEATURE_COLUMNS lists the columns kept for modelling. CLEANING_RULES maps
+    a column name to one rule dict: "opt" is "fill" (fill missing values with
+    "value") or "mean" (fill with the column mean), and "type" "num" marks
+    columns that are converted to numeric before filling.
+    """
+    FEATURE_COLUMNS = [
+        "product_code",                                # product code
+        # "factory_name",                                # manufacturer (industrial company) name
+        # "brand_code",                                  # brand code
+        "adjust_price",                                # tax-inclusive transfer price
+        "notwithtax_adjust_price",                     # tax-exclusive transfer price
+        "whole_sale_price",                            # unified wholesale price
+        "direct_retail_price",                         # suggested retail price
+        "allot_price",                                 # transfer (allocation) price
+        "direct_whole_price",                          # guided wholesale price
+        "retail_price",                                # retail price
+        "price_type_name",                             # cigarette price class name
+        "gear_type_name",                              # cigarette tier name
+        "category_type_name",                          # cigarette category name
+        "is_key_brand",                                # whether it is a key brand
+        "is_high_level",                               # whether it is a premium cigarette
+        "is_upscale_level",                            # whether it is premium, excluding high-priced
+        "is_high_price",                               # whether it is a high-priced cigarette
+        "is_low_price",                                # whether it is a low-priced cigarette
+        "is_low_tar",                                  # whether it is a low-tar cigarette
+        "is_encourage",                                # whether it is a nationally encouraged brand
+        "is_abnormity",                                # whether it has irregular-shaped packaging
+        "is_intake",                                   # whether it is an imported cigarette
+        "is_short",                                    # whether it is a sought-after (short-supply) brand
+        "is_medium",                                   # whether it is a medium-size cigarette
+        "is_shortbranch",                              # whether it is a short cigarette
+        "is_ordinary_price_type",                      # whether it is an ordinary class-1 cigarette
+        "source_type",                                 # source type
+        "tar_qty",                                     # tar content
+        "product_style_code_name",                     # packaging type name
+        "chinese_mix",                                 # Chinese-style blend
+        "sub_price_type_name",                         # fine-grained cigarette price class name
+    ]
+    
+    # Missing-value rules, one per cleaned column ("product_code" has none).
+    CLEANING_RULES = {
+        "adjust_price":                                {"method": "fillna", "opt": "mean", "type": "num"},
+        "notwithtax_adjust_price":                     {"method": "fillna", "opt": "mean", "type": "num"},
+        "whole_sale_price":                            {"method": "fillna", "opt": "mean", "type": "num"},
+        "direct_retail_price":                         {"method": "fillna", "opt": "mean", "type": "num"},
+        "allot_price":                                 {"method": "fillna", "opt": "fill", "type": "num", "value": 0.0},
+        "direct_whole_price":                          {"method": "fillna", "opt": "mean", "type": "num"},
+        "retail_price":                                {"method": "fillna", "opt": "mean", "type": "num"},
+        "price_type_name":                             {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
+        "gear_type_name":                              {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
+        "category_type_name":                          {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
+        "is_key_brand":                                {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "is_high_level":                               {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "is_upscale_level":                            {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "is_high_price":                               {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "is_low_price":                                {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "is_low_tar":                                  {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "is_encourage":                                {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "is_abnormity":                                {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "is_intake":                                   {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "is_short":                                    {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "is_medium":                                   {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "is_shortbranch":                              {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "is_ordinary_price_type":                      {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "source_type":                                 {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
+        "tar_qty":                                     {"method": "fillna", "opt": "mean", "type": "num"},
+        "product_style_code_name":                     {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
+        "chinese_mix":                                 {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "sub_price_type_name":                         {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
+    }

+ 101 - 0
models/rank/data/preprocess.py

@@ -0,0 +1,101 @@
+from dao.dao import load_cust_data_from_mysql, load_product_data_from_mysql, load_order_data_from_mysql
+from models.rank.data.config import CustConfig, ProductConfig
+import pandas as pd
+
+class DataProcess():
+    def __init__(self, city_uuid):
+        print("正在加载cust_info...")
+        self._cust_data = load_cust_data_from_mysql(city_uuid)
+        print("正在加载product_info...")
+        self._product_data = load_product_data_from_mysql(city_uuid)
+        print("正在加载order_info...")
+        self._order_data = load_order_data_from_mysql(city_uuid)
+        
+    def data_process(self):
+        """数据预处理"""
+        
+        # 1. 获取指定的特征组合
+        self._cust_data = self._cust_data[CustConfig.FEATURE_COLUMNS]
+        self._product_data = self._product_data[ProductConfig.FEATURE_COLUMNS]
+        
+        # 2. 数据清洗
+        self._clean_cust_data()
+        self._clean_product_data()
+        
+        # 3. 将零售户信息表与卷烟信息表进行笛卡尔积连接
+        self._descartes()
+        
+        # 4. 根据order表中的信息给数据打标签
+        self._labeled_data()
+        
+        # 5. 选取训练样本
+        self._generate_train_data()
+        
+    
+    def _clean_cust_data(self):
+        """用户信息表数据清洗"""
+        # 根据配置规则清洗数据
+        for feature, rules, in CustConfig.CLEANING_RULES.items():
+            if rules["type"] == "num":
+                # 先将数值型字符串转换为数值
+                self._cust_data[feature] = pd.to_numeric(self._cust_data[feature], errors="coerce")
+                
+            if rules["method"] == "fillna":
+                if rules["opt"] == "fill":
+                    self._cust_data[feature] = self._cust_data[feature].fillna(rules["value"])
+                elif rules["opt"] == "replace":
+                    self._cust_data[feature] = self._cust_data[feature].fillna(self._cust_data[rules["value"]])
+                elif rules["opt"] == "mean":
+                    self._cust_data[feature] = self._cust_data[feature].fillna(self._cust_data[feature].mean())
+    
+    def _clean_product_data(self):
+        """卷烟信息表数据清洗"""
+        for feature, rules, in ProductConfig.CLEANING_RULES.items():
+            if rules["type"] == "num":
+                self._product_data[feature] = pd.to_numeric(self._product_data[feature], errors="coerce")
+            
+            if rules["method"] == "fillna":
+                if rules["opt"] == "fill":
+                    self._product_data[feature] = self._product_data[feature].fillna(rules["value"])
+                elif rules["opt"] == "mean":
+                    self._product_data[feature] = self._product_data[feature].fillna(self._product_data[feature].mean())
+    
+    def _descartes(self):
+        """将零售户信息与卷烟信息进行笛卡尔积连接"""
+        self._cust_data["descartes"] = 1
+        self._product_data["descartes"] = 1
+        
+        self._descartes_data = pd.merge(self._cust_data, self._product_data, on="descartes").drop("descartes", axis=1)
+        
+    def _labeled_data(self):
+        """根据order表信息给descartes_data数据打标签"""
+        # 获取order表中的正样本组合
+        order_combinations = self._order_data[["BB_RETAIL_CUSTOMER_CODE", "PRODUCT_CODE"]].drop_duplicates()
+        order_set = set(zip(order_combinations["BB_RETAIL_CUSTOMER_CODE"], order_combinations["PRODUCT_CODE"]))
+        
+        # 在descartes_data中打标签:正样本为1,负样本为2
+        self._descartes_data['label'] = self._descartes_data.apply(
+            lambda row: 1 if (row['BB_RETAIL_CUSTOMER_CODE'], row['product_code']) in order_set else 0, axis=1)
+    
+    def _generate_train_data(self):
+        """从descartes_data中生成训练数据"""
+        positive_samples = self._descartes_data[self._descartes_data["label"] == 1]
+        negative_samples = self._descartes_data[self._descartes_data["label"] == 0]
+        
+        positive_count = len(positive_samples)
+        negative_count = min(2 * positive_count, len(negative_samples))
+        print(positive_count)
+        print(negative_count)
+        
+        # 随机抽取2倍正样本数量的负样本
+        negative_samples_sampled = negative_samples.sample(n=negative_count, random_state=42)
+        # 合并正负样本
+        self._train_data = pd.concat([positive_samples, negative_samples_sampled], axis=0)
+        self._train_data = self._train_data.sample(frac=1, random_state=42).reset_index(drop=True)
+        
+        # 保存训练数据
+        self._train_data.to_csv("./models/rank/data/gbdt_data.csv", index=False)
+    
+if __name__ == '__main__':
+    processor = DataProcess("00000000000000000000000011445301")
+    processor.data_process()