Эх сурвалжийг харах

根据新数据修改gbdt-lr流程

yangzeyu 11 сар өмнө
parent
commit
920579945f

+ 3 - 3
.gitignore

@@ -1,4 +1,4 @@
-.idea/
-.vscode/
-__pycache__/
+.idea/
+.vscode/
+__pycache__/
 *.pyc

+ 23 - 23
Dockerfile

@@ -1,23 +1,23 @@
-FROM registry.cn-hangzhou.aliyuncs.com/hexiaoshi/python:3.10
-
-RUN apt-get update && apt-get -y install  tzdata cron vim && ln -fs /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
-
-ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
-
-WORKDIR /app
-
-COPY . /app/
-
-
-RUN mv /app/crontab /etc/cron.d/crontab && chmod 0644 /etc/cron.d/crontab \ 
-        && /usr/bin/crontab /etc/cron.d/crontab \ 
-        && pip install --upgrade pip setuptools -i https://mirrors.aliyun.com/pypi/simple  \ 
-        && pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple
-
-RUN find . | grep -E "(__pycache__|Dockerfile|\.md|\.pyc|\.pyo$)" | xargs rm -rf && python3 -m compileall -b . \ 
-        &&  find . -name "*.py" |xargs rm -rf && touch /var/log/cron.log
-
-VOLUME ["/etc/cron.d"]
-
-CMD /bin/bash -c "/usr/bin/crontab /etc/cron.d/crontab && cron && tail -f /var/log/cron.log"
-
+FROM registry.cn-hangzhou.aliyuncs.com/hexiaoshi/python:3.10
+
+RUN apt-get update && apt-get -y install  tzdata cron vim && ln -fs /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
+
+ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+
+WORKDIR /app
+
+COPY . /app/
+
+
+RUN mv /app/crontab /etc/cron.d/crontab && chmod 0644 /etc/cron.d/crontab \ 
+        && /usr/bin/crontab /etc/cron.d/crontab \ 
+        && pip install --upgrade pip setuptools -i https://mirrors.aliyun.com/pypi/simple  \ 
+        && pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple
+
+RUN find . | grep -E "(__pycache__|Dockerfile|\.md|\.pyc|\.pyo$)" | xargs rm -rf && python3 -m compileall -b . \ 
+        &&  find . -name "*.py" |xargs rm -rf && touch /var/log/cron.log
+
+VOLUME ["/etc/cron.d"]
+
+CMD /bin/bash -c "/usr/bin/crontab /etc/cron.d/crontab && cron && tail -f /var/log/cron.log"
+

+ 104 - 104
app.py

@@ -1,105 +1,105 @@
-import argparse
-from dao import load_order_data_from_mysql
-from dao.redis_db import Redis
-from models import HotRecallModel, UserItemScore, ItemCFModel, calculate_similarity_and_save_results
-import os
-
-def run_hot_recall(order_data, city_uuid):
-    """运行热度召回算法"""
-    hot_model = HotRecallModel(order_data)
-    hot_model.calculate_all_hot_score(city_uuid)
-    print("热度召回已完成!")
-
-def run_itemcf(order_data, args):
-    # """运行协同过滤算法"""
-    if os.path.exists(args.interst_score_path) and os.path.exists(args.similarity_matrix_path):
-        os.remove(args.interst_score_path)
-        os.remove(args.similarity_matrix_path)
-    
-    # 计算user-score-item数据
-    cal_interest_scores_model = UserItemScore()
-    scores = cal_interest_scores_model.score(order_data)
-    scores.to_csv(args.interst_score_path, index=False, encoding="utf-8")
-    print("Interest Scores cal done!")
-    
-    # 计算商户共现矩阵及相似度矩阵
-    calculate_similarity_and_save_results(order_data, args.similarity_matrix_path)
-    print("Shops similarity matrix cal done!")
-    
-    # 运行协同过滤召回
-    itemcf_model = ItemCFModel()
-    itemcf_model.train(args.interst_score_path, args.similarity_matrix_path, args.city_uuid, args.n, args.k, args.top_n, args.n_jobs)
-    print("协同过滤已完成!")
-
-def run_itemcf_inference(product_code):
-        """
-        从 Redis 中读取推荐结果,并返回 {shop_id: score} 的列表
-        """
-        redis_db = Redis()
-        redis_key = f"fc:{product_code}"
-        recommendations = redis_db.redis.zrange(redis_key, 0, -1, withscores=True, desc=True)
-        
-        # 将推荐结果转换为 {shop_id: score} 的字典列表
-        result = [{shop_id: float(score)} for shop_id, score in recommendations]
-        
-        return result
-
-def run():
-    parser = argparse.ArgumentParser()
-    
-    # 运行方式
-    parser.add_argument("--run_all", action='store_true')
-    parser.add_argument("--run_hot", action='store_true')
-    parser.add_argument("--run_itemcf", action='store_true')
-    parser.add_argument("--run_itemcf_inference", action='store_true')
-    
-    # 协同过滤相关配置
-    parser.add_argument("--matrix_path", type=str, default="./models/recall/itemCF/matrix")
-    # parser.add_argument("--interst_score_path", type=str, default="./models/recall/itemCF/matrix/score.csv")
-    # parser.add_argument("--similarity_matrix_path", type=str, default="./models/recall/itemCF/matrix/similarity.csv")
-    parser.add_argument("--n", type=int, default=100)
-    parser.add_argument("--k", type=int, default=20)
-    parser.add_argument("--top_n", type=int, default=2000, help='default n * k')
-    parser.add_argument("--n_jobs", type=int, default=4)
-    parser.add_argument("--city_uuid", type=str, default='00000000000000000000000011445301', help="City UUID for filtering data")
-    
-    # 协同过滤推理配置
-    parser.add_argument("--product_code", type=int, default=110111)
-    
-    args = parser.parse_args()
-    
-    # 初始化文件保存相关配置
-    if not os.path.exists(args.matrix_path):
-        os.makedirs(args.matrix_path)
-    args.interst_score_path = os.path.join(args.matrix_path, "score.csv")
-    args.similarity_matrix_path = os.path.join(args.matrix_path, "similarity.csv")
-    
-    
-    if args.run_all:
-        order_data = load_order_data_from_mysql(args.city_uuid)
-        if order_data is not None:
-            run_hot_recall(order_data, args.city_uuid)
-            run_itemcf(order_data, args)
-        else:
-            print("数据库中暂无数据")
-        
-    elif args.run_hot:
-        order_data = load_order_data_from_mysql(args.city_uuid)
-        if order_data is not None:
-            run_hot_recall(order_data, args.city_uuid)
-        else:
-            print("数据库中暂无数据")
-        
-    elif args.run_itemcf:
-        order_data = load_order_data_from_mysql(args.city_uuid)
-        if order_data is not None:
-            run_itemcf(order_data, args)
-        else:
-            print("数据库中暂无数据")  
-        
-    elif args.run_itemcf_inference:
-        recomments = run_itemcf_inference(args.product_code)
-        print(recomments)
-    
-if __name__ == "__main__":
+import argparse
+from dao import load_order_data_from_mysql
+from dao.redis_db import Redis
+from models import HotRecallModel, UserItemScore, ItemCFModel, calculate_similarity_and_save_results
+import os
+
+def run_hot_recall(order_data, city_uuid):
+    """运行热度召回算法"""
+    hot_model = HotRecallModel(order_data)
+    hot_model.calculate_all_hot_score(city_uuid)
+    print("热度召回已完成!")
+
+def run_itemcf(order_data, args):
+    # """运行协同过滤算法"""
+    if os.path.exists(args.interst_score_path) and os.path.exists(args.similarity_matrix_path):
+        os.remove(args.interst_score_path)
+        os.remove(args.similarity_matrix_path)
+    
+    # 计算user-score-item数据
+    cal_interest_scores_model = UserItemScore()
+    scores = cal_interest_scores_model.score(order_data)
+    scores.to_csv(args.interst_score_path, index=False, encoding="utf-8")
+    print("Interest Scores cal done!")
+    
+    # 计算商户共现矩阵及相似度矩阵
+    calculate_similarity_and_save_results(order_data, args.similarity_matrix_path)
+    print("Shops similarity matrix cal done!")
+    
+    # 运行协同过滤召回
+    itemcf_model = ItemCFModel()
+    itemcf_model.train(args.interst_score_path, args.similarity_matrix_path, args.city_uuid, args.n, args.k, args.top_n, args.n_jobs)
+    print("协同过滤已完成!")
+
+def run_itemcf_inference(product_code):
+        """
+        从 Redis 中读取推荐结果,并返回 {shop_id: score} 的列表
+        """
+        redis_db = Redis()
+        redis_key = f"fc:{product_code}"
+        recommendations = redis_db.redis.zrange(redis_key, 0, -1, withscores=True, desc=True)
+        
+        # 将推荐结果转换为 {shop_id: score} 的字典列表
+        result = [{shop_id: float(score)} for shop_id, score in recommendations]
+        
+        return result
+
+def run():
+    parser = argparse.ArgumentParser()
+    
+    # 运行方式
+    parser.add_argument("--run_all", action='store_true')
+    parser.add_argument("--run_hot", action='store_true')
+    parser.add_argument("--run_itemcf", action='store_true')
+    parser.add_argument("--run_itemcf_inference", action='store_true')
+    
+    # 协同过滤相关配置
+    parser.add_argument("--matrix_path", type=str, default="./models/recall/itemCF/matrix")
+    # parser.add_argument("--interst_score_path", type=str, default="./models/recall/itemCF/matrix/score.csv")
+    # parser.add_argument("--similarity_matrix_path", type=str, default="./models/recall/itemCF/matrix/similarity.csv")
+    parser.add_argument("--n", type=int, default=100)
+    parser.add_argument("--k", type=int, default=20)
+    parser.add_argument("--top_n", type=int, default=2000, help='default n * k')
+    parser.add_argument("--n_jobs", type=int, default=4)
+    parser.add_argument("--city_uuid", type=str, default='00000000000000000000000011445301', help="City UUID for filtering data")
+    
+    # 协同过滤推理配置
+    parser.add_argument("--product_code", type=int, default=110111)
+    
+    args = parser.parse_args()
+    
+    # 初始化文件保存相关配置
+    if not os.path.exists(args.matrix_path):
+        os.makedirs(args.matrix_path)
+    args.interst_score_path = os.path.join(args.matrix_path, "score.csv")
+    args.similarity_matrix_path = os.path.join(args.matrix_path, "similarity.csv")
+    
+    
+    if args.run_all:
+        order_data = load_order_data_from_mysql(args.city_uuid)
+        if order_data is not None:
+            run_hot_recall(order_data, args.city_uuid)
+            run_itemcf(order_data, args)
+        else:
+            print("数据库中暂无数据")
+        
+    elif args.run_hot:
+        order_data = load_order_data_from_mysql(args.city_uuid)
+        if order_data is not None:
+            run_hot_recall(order_data, args.city_uuid)
+        else:
+            print("数据库中暂无数据")
+        
+    elif args.run_itemcf:
+        order_data = load_order_data_from_mysql(args.city_uuid)
+        if order_data is not None:
+            run_itemcf(order_data, args)
+        else:
+            print("数据库中暂无数据")  
+        
+    elif args.run_itemcf_inference:
+        recomments = run_itemcf_inference(args.product_code)
+        print(recomments)
+    
+if __name__ == "__main__":
     run()

+ 6 - 6
config/__init__.py

@@ -1,7 +1,7 @@
-#!/usr/bin/env python3
-# -*- coding:utf-8 -*-
-from config.config import load_config 
-
-__all__ = [
-    "load_config"
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+from config.config import load_config 
+
+__all__ = [
+    "load_config"
 ]

+ 5 - 5
config/config.py

@@ -1,6 +1,6 @@
-import yaml
-
-def load_config():
-    with open('./config/database_config.yaml') as file:
-        config = yaml.safe_load(file)
+import yaml
+
+def load_config():
+    with open('./config/database_config.yaml') as file:
+        config = yaml.safe_load(file)
     return config

+ 12 - 12
config/database_config.yaml

@@ -1,12 +1,12 @@
-mysql:
-  host: 'rm-t4n6rz18y4t5x47y70o.mysql.singapore.rds.aliyuncs.com'
-  port: 3036
-  db: 'brand_cultivation'
-  user: 'BrandCultivation'
-  passwd: '8BfWBc18NBXl#CMd'
-
-redis:
-  host: 'r-t4nb4n9i8je7u6ogk1pd.redis.singapore.rds.aliyuncs.com'
-  port: 5000
-  db: 10
-  passwd: 'gHmNkVBd88sZybj'
+mysql:
+  host: 'rm-t4n6rz18y4t5x47y70o.mysql.singapore.rds.aliyuncs.com'
+  port: 3036
+  db: 'brand_cultivation'
+  user: 'BrandCultivation'
+  passwd: '8BfWBc18NBXl#CMd'
+
+redis:
+  host: 'r-t4nb4n9i8je7u6ogk1pd.redis.singapore.rds.aliyuncs.com'
+  port: 5000
+  db: 10
+  passwd: 'gHmNkVBd88sZybj'

+ 4 - 4
crontab

@@ -1,4 +1,4 @@
-# START CRON JOB
-11 13 * * * cd /app && /usr/local/bin/python app.pyc --run_all >> /var/log/app.log 2>&1
-#* * * * * echo "asdf" >> /var/log/test.log
-# END CRON JOB
+# START CRON JOB
+11 13 * * * cd /app && /usr/local/bin/python app.pyc --run_all >> /var/log/app.log 2>&1
+#* * * * * echo "asdf" >> /var/log/test.log
+# END CRON JOB

+ 8 - 8
database/__init__.py

@@ -1,9 +1,9 @@
-from database.db.mysql import MySqlDatabaseHelper
-from database.db.redis_db import RedisDatabaseHelper
-from database.dao.mysql_dao import MySqlDao
-
-__all__ = [
-    "MySqlDatabaseHelper",
-    "RedisDatabaseHelper",
-    "MySqlDao"
+from database.db.mysql import MySqlDatabaseHelper
+from database.db.redis_db import RedisDatabaseHelper
+from database.dao.mysql_dao import MySqlDao
+
+__all__ = [
+    "MySqlDatabaseHelper",
+    "RedisDatabaseHelper",
+    "MySqlDao"
 ]

+ 131 - 131
database/dao/mysql_dao.py

@@ -1,132 +1,132 @@
-from database import MySqlDatabaseHelper
-from sqlalchemy import text
-import pandas as pd
-
-class MySqlDao:
-    _instance = None
-    
-    def __new__(cls):
-        if not cls._instance:
-            cls._instance = super(MySqlDao, cls).__new__(cls)
-            cls._instance._initialized = False
-        return cls._instance
-    
-    
-    def __init__(self):
-        if self._initialized:
-            return
-        
-        self.db_helper = MySqlDatabaseHelper()
-        self._product_tablename = "tads_brandcul_product_info"
-        self._cust_tablename = "tads_brandcul_cust_info"
-        self._order_tablename = "tads_brandcul_cust_order"
-        self._mock_order_tablename = "yunfu_mock_data"
-        self._shopping_tablename = "tads_brandcul_cust_info_lbs"
-        # self._shopping_tablename = "yunfu_shopping_mock_data"
-        
-        self._initialized = True
-        
-    def load_product_data(self, city_uuid):
-        """从数据库中读取商品信息"""
-        query = f"SELECT * FROM {self._product_tablename} WHERE city_uuid = :city_uuid"
-        params = {"city_uuid": city_uuid}
-        
-        data = self.db_helper.load_data_with_page(query, params)
-        return data
-        
-    def load_cust_data(self, city_uuid):
-        """从数据库中读取商户信息"""
-        query = f"SELECT * FROM {self._cust_tablename} WHERE BA_CITY_ORG_CODE = :city_uuid"
-        params = {"city_uuid": city_uuid}
-        
-        data = self.db_helper.load_data_with_page(query, params)
-        return data
-    
-    def load_order_data(self, city_uuid):
-        """从数据库中读取订单信息"""
-        query = f"SELECT * FROM {self._order_tablename} WHERE city_uuid = :city_uuid"
-        params = {"city_uuid": city_uuid}
-        
-        data = self.db_helper.load_data_with_page(query, params)
-        data.drop('stat_month', axis=1, inplace=True)
-        data.drop('city_uuid', axis=1, inplace=True)
-        
-        return data
-    
-    def load_mock_order_data(self):
-        """从数据库中读取mock的订单信息"""
-        query = f"SELECT * FROM {self._mock_order_tablename}"
-        
-        data = self.db_helper.load_data_with_page(query, {})
-        
-        return data
-    
-    def load_shopping_data(self, city_uuid):
-        """从数据库中读取商圈数据"""
-        query = f"SELECT * FROM {self._shopping_tablename} WHERE city_uuid = :city_uuid"
-        params = {"city_uuid": city_uuid}
-        
-        data = self.db_helper.load_data_with_page(query, params)
-        
-        return data
-    
-    def get_cust_list(self, city_uuid):
-        """获取商户列表"""
-        data = self.load_cust_data(city_uuid)
-        cust_list = data["BB_RETAIL_CUSTOMER_CODE"].to_list()
-        if len(cust_list) == 0:
-            return []
-        
-        return cust_list
-    
-    def get_product_by_id(self, city_uuid, product_id):
-        """根据city_uuid 和 product_id 从表中获取拼柜信息"""
-        query = text(f"""
-            SELECT *
-            FROM {self._product_tablename}
-            WHERE city_uuid = :city_uuid
-            AND product_code = :product_id
-        """)
-        params = {"city_uuid": city_uuid, "product_id": product_id}
-        data = self.db_helper.fetch_one(query, params)
-        
-        return data
-    
-    def get_cust_by_ids(self, city_uuid, cust_id_list):
-        """根据零售户列表查询其信息"""
-        if not cust_id_list:
-            return None
-        
-        cust_id_str = ",".join([f"'{cust_id}'" for cust_id in cust_id_list])
-        query = text(f"""
-            SELECT *
-            FROM {self._cust_tablename}
-            WHERE BA_CITY_ORG_CODE = :city_uuid
-            AND BB_RETAIL_CUSTOMER_CODE IN ({cust_id_str})
-        """)
-        params = {"city_uuid": city_uuid}
-        data = self.db_helper.fetch_all(query, params)
-        
-        return data
-    
-    def data_preprocess(self, data: pd.DataFrame):
-        
-        data.drop(["cust_uuid", "longitude", "latitude", "range_radius"], axis=1, inplace=True)
-        remaining_cols = data.columns.drop(["city_uuid", "cust_code"])
-        col_with_missing = remaining_cols[data[remaining_cols].isnull().any()].tolist() # 判断有缺失的字段
-        col_all_missing = remaining_cols[data[remaining_cols].isnull().all()].to_list() # 全部缺失的字段
-        col_partial_missing = list(set(col_with_missing) - set(col_all_missing)) # 部分缺失的字段
-        
-        for col in col_partial_missing:
-            data[col] = data[col].fillna(data[col].mean())
-        
-        for col in col_all_missing:
-            data[col] = data[col].fillna(0).infer_objects(copy=False)
-        
-        
-if __name__ == "__main__":
-    dao = MySqlDao()
-    city_uuid = "00000000000000000000000011445301"
-    # city_uuid = "00000000000000000000000011441801"
-    cust_id_list = ["441800100006", "441800100051", "441800100811"]
+from database import MySqlDatabaseHelper
+from sqlalchemy import text
+import pandas as pd
+
+class MySqlDao:
+    _instance = None
+    
+    def __new__(cls):
+        if not cls._instance:
+            cls._instance = super(MySqlDao, cls).__new__(cls)
+            cls._instance._initialized = False
+        return cls._instance
+    
+    
+    def __init__(self):
+        if self._initialized:
+            return
+        
+        self.db_helper = MySqlDatabaseHelper()
+        self._product_tablename = "tads_brandcul_product_info_f"
+        self._cust_tablename = "tads_brandcul_cust_info_f"
+        self._order_tablename = "tads_brandcul_consumer_order"
+        self._mock_order_tablename = "yunfu_mock_data"
+        self._shopping_tablename = "tads_brandcul_cust_info_lbs_f"
+        # self._shopping_tablename = "yunfu_shopping_mock_data"
+        
+        self._initialized = True
+        
+    def load_product_data(self, city_uuid):
+        """从数据库中读取商品信息"""
+        query = f"SELECT * FROM {self._product_tablename} WHERE city_uuid = :city_uuid"
+        params = {"city_uuid": city_uuid}
+        
+        data = self.db_helper.load_data_with_page(query, params)
+        return data
+        
+    def load_cust_data(self, city_uuid):
+        """从数据库中读取商户信息"""
+        query = f"SELECT * FROM {self._cust_tablename} WHERE BA_CITY_ORG_CODE = :city_uuid"
+        params = {"city_uuid": city_uuid}
+        
+        data = self.db_helper.load_data_with_page(query, params)
+        return data
+    
+    def load_order_data(self, city_uuid):
+        """从数据库中读取订单信息"""
+        query = f"SELECT * FROM {self._order_tablename} WHERE city_uuid = :city_uuid"
+        params = {"city_uuid": city_uuid}
+        
+        data = self.db_helper.load_data_with_page(query, params)
+        data.drop('stat_month', axis=1, inplace=True)
+        data.drop('city_uuid', axis=1, inplace=True)
+        
+        return data
+    
+    def load_mock_order_data(self):
+        """从数据库中读取mock的订单信息"""
+        query = f"SELECT * FROM {self._mock_order_tablename}"
+        
+        data = self.db_helper.load_data_with_page(query, {})
+        
+        return data
+    
+    def load_shopping_data(self, city_uuid):
+        """从数据库中读取商圈数据"""
+        query = f"SELECT * FROM {self._shopping_tablename} WHERE city_uuid = :city_uuid"
+        params = {"city_uuid": city_uuid}
+        
+        data = self.db_helper.load_data_with_page(query, params)
+        
+        return data
+    
+    def get_cust_list(self, city_uuid):
+        """获取商户列表"""
+        data = self.load_cust_data(city_uuid)
+        cust_list = data["BB_RETAIL_CUSTOMER_CODE"].to_list()
+        if len(cust_list) == 0:
+            return []
+        
+        return cust_list
+    
+    def get_product_by_id(self, city_uuid, product_id):
+        """根据city_uuid 和 product_id 从表中获取拼柜信息"""
+        query = text(f"""
+            SELECT *
+            FROM {self._product_tablename}
+            WHERE city_uuid = :city_uuid
+            AND product_code = :product_id
+        """)
+        params = {"city_uuid": city_uuid, "product_id": product_id}
+        data = self.db_helper.fetch_one(query, params)
+        
+        return data
+    
+    def get_cust_by_ids(self, city_uuid, cust_id_list):
+        """根据零售户列表查询其信息"""
+        if not cust_id_list:
+            return None
+        
+        cust_id_str = ",".join([f"'{cust_id}'" for cust_id in cust_id_list])
+        query = text(f"""
+            SELECT *
+            FROM {self._cust_tablename}
+            WHERE BA_CITY_ORG_CODE = :city_uuid
+            AND BB_RETAIL_CUSTOMER_CODE IN ({cust_id_str})
+        """)
+        params = {"city_uuid": city_uuid}
+        data = self.db_helper.fetch_all(query, params)
+        
+        return data
+    
+    def data_preprocess(self, data: pd.DataFrame):
+        
+        data.drop(["cust_uuid", "longitude", "latitude", "range_radius"], axis=1, inplace=True)
+        remaining_cols = data.columns.drop(["city_uuid", "cust_code"])
+        col_with_missing = remaining_cols[data[remaining_cols].isnull().any()].tolist() # 判断有缺失的字段
+        col_all_missing = remaining_cols[data[remaining_cols].isnull().all()].to_list() # 全部缺失的字段
+        col_partial_missing = list(set(col_with_missing) - set(col_all_missing)) # 部分缺失的字段
+        
+        for col in col_partial_missing:
+            data[col] = data[col].fillna(data[col].mean())
+        
+        for col in col_all_missing:
+            data[col] = data[col].fillna(0).infer_objects(copy=False)
+        
+        
+if __name__ == "__main__":
+    dao = MySqlDao()
+    city_uuid = "00000000000000000000000011445301"
+    # city_uuid = "00000000000000000000000011441801"
+    cust_id_list = ["441800100006", "441800100051", "441800100811"]
     cust_list = dao.load_mock_order_data()

+ 113 - 113
database/db/mysql.py

@@ -1,113 +1,113 @@
-from config import load_config
-import pandas as pd
-from sqlalchemy import create_engine, text
-from sqlalchemy.orm import sessionmaker
-from sqlalchemy.exc import SQLAlchemyError
-from tqdm import tqdm
-
-cfgs = load_config()
-
-
-class MySqlDatabaseHelper:
-    _instance = None
-    
-    def __new__(cls):
-        if not cls._instance:
-            cls._instance = super(MySqlDatabaseHelper, cls).__new__(cls)
-            cls._instance._initialized = False
-        return cls._instance
-        
-    def __init__(self):
-        if self._initialized:
-            return
-        
-        self._host = cfgs['mysql']['host']
-        self._port = cfgs['mysql']['port']
-        self._user = cfgs['mysql']['user']
-        self._passwd = cfgs['mysql']['passwd']
-        self._dbname = cfgs['mysql']['db']
-        
-        self.connect_database()
-        self._initialized = True
-        
-    def connect_database(self):
-        # 创建数据库连接
-        try:
-            conn = "mysql+pymysql://" + self._user + ":" + self._passwd + "@" + self._host + ":" + str(self._port) + "/" + self._dbname
-        except Exception as e:
-            raise ConnectionAbortedError(f"failed to create connection string: {e}")
-        
-        # 通过连接池创建engine
-        self.engine = create_engine(
-            conn,
-            pool_size=10, # 设置连接池大小
-            max_overflow=20, # 超过连接池大小时的额外连接数
-            pool_recycle=3600 # 回收连接时间
-        )
-        
-        self._DBSession = sessionmaker(bind=self.engine)
-        
-    def load_data_with_page(self, query, params, page_size=1000):
-        """分页查询数据"""
-        data = pd.DataFrame()
-        count_query = text(query.replace("SELECT *", "SELECT COUNT(*)"))
-        query += " LIMIT :limit OFFSET :offset"
-        query = text(query)
-    
-        # 获取总行数
-        total_rows = self.fetch_one(count_query, params)[0]
-
-        page = 1
-        with tqdm(total=total_rows, desc="Loading data", unit="rows") as pbar:  # 初始化进度条
-            while True:
-                offset = (page - 1) * page_size  # 计算偏移量
-                params["limit"] = page_size
-                params["offset"] = offset
-
-                df = pd.DataFrame(self.fetch_all(query, params))
-                if df.empty:
-                    break
-                data = pd.concat([data, df], ignore_index=True)
-            
-                # 更新进度条
-                pbar.update(len(df))  # 更新进度条的行数
-            
-                page += 1
-        return data
-        
-        
-    def fetch_all(self, query, params=None):
-        """执行SQL查询并返回所有结果"""
-        session = self._DBSession()
-        try:
-            results = session.execute(query, params or {}).fetchall()
-            return results
-        except SQLAlchemyError as e:
-            session.rollback()
-            print(f"error: {e}")
-        finally:
-            session.close()
-            
-    def fetch_one(self, query, params=None):
-        """执行SQL查询并返回单条结果"""
-        session = self._DBSession()
-        try:
-            result = session.execute(query, params or {}).fetchone()
-            return result
-        except SQLAlchemyError as e:
-            session.rollback()
-            print(f"error: {e}")
-        finally:
-            session.close()
-            
-    def execute_query(self, query, params=None):
-        """执行SQL语句 (无返回值, 如INSERT, UPDATE, DELETE)"""
-        session = self._DBSession()
-        try:
-            session.execute(query, params or {})
-            session.commit()
-        except SQLAlchemyError as e:
-            session.rollback()
-            print(f"Error: {e}")
-        finally:
-            session.close()
+from config import load_config
+import pandas as pd
+from sqlalchemy import create_engine, text
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy.exc import SQLAlchemyError
+from tqdm import tqdm
+
+cfgs = load_config()
+
+
+class MySqlDatabaseHelper:
+    _instance = None
+    
+    def __new__(cls):
+        if not cls._instance:
+            cls._instance = super(MySqlDatabaseHelper, cls).__new__(cls)
+            cls._instance._initialized = False
+        return cls._instance
+        
+    def __init__(self):
+        if self._initialized:
+            return
+        
+        self._host = cfgs['mysql']['host']
+        self._port = cfgs['mysql']['port']
+        self._user = cfgs['mysql']['user']
+        self._passwd = cfgs['mysql']['passwd']
+        self._dbname = cfgs['mysql']['db']
+        
+        self.connect_database()
+        self._initialized = True
+        
+    def connect_database(self):
+        # 创建数据库连接
+        try:
+            conn = "mysql+pymysql://" + self._user + ":" + self._passwd + "@" + self._host + ":" + str(self._port) + "/" + self._dbname
+        except Exception as e:
+            raise ConnectionAbortedError(f"failed to create connection string: {e}")
+        
+        # 通过连接池创建engine
+        self.engine = create_engine(
+            conn,
+            pool_size=10, # 设置连接池大小
+            max_overflow=20, # 超过连接池大小时的额外连接数
+            pool_recycle=3600 # 回收连接时间
+        )
+        
+        self._DBSession = sessionmaker(bind=self.engine)
+        
+    def load_data_with_page(self, query, params, page_size=1000):
+        """分页查询数据"""
+        data = pd.DataFrame()
+        count_query = text(query.replace("SELECT *", "SELECT COUNT(*)"))
+        query += " LIMIT :limit OFFSET :offset"
+        query = text(query)
+    
+        # 获取总行数
+        total_rows = self.fetch_one(count_query, params)[0]
+
+        page = 1
+        with tqdm(total=total_rows, desc="Loading data", unit="rows") as pbar:  # 初始化进度条
+            while True:
+                offset = (page - 1) * page_size  # 计算偏移量
+                params["limit"] = page_size
+                params["offset"] = offset
+
+                df = pd.DataFrame(self.fetch_all(query, params))
+                if df.empty:
+                    break
+                data = pd.concat([data, df], ignore_index=True)
+            
+                # 更新进度条
+                pbar.update(len(df))  # 更新进度条的行数
+            
+                page += 1
+        return data
+        
+        
+    def fetch_all(self, query, params=None):
+        """执行SQL查询并返回所有结果"""
+        session = self._DBSession()
+        try:
+            results = session.execute(query, params or {}).fetchall()
+            return results
+        except SQLAlchemyError as e:
+            session.rollback()
+            print(f"error: {e}")
+        finally:
+            session.close()
+            
+    def fetch_one(self, query, params=None):
+        """执行SQL查询并返回单条结果"""
+        session = self._DBSession()
+        try:
+            result = session.execute(query, params or {}).fetchone()
+            return result
+        except SQLAlchemyError as e:
+            session.rollback()
+            print(f"error: {e}")
+        finally:
+            session.close()
+            
+    def execute_query(self, query, params=None):
+        """执行SQL语句 (无返回值, 如INSERT, UPDATE, DELETE)"""
+        session = self._DBSession()
+        try:
+            session.execute(query, params or {})
+            session.commit()
+        except SQLAlchemyError as e:
+            session.rollback()
+            print(f"Error: {e}")
+        finally:
+            session.close()

+ 51 - 51
database/db/redis_db.py

@@ -1,52 +1,52 @@
-#!/usr/bin/env python3
-# -*- coding:utf-8 -*-
-import redis
-from config import load_config
-
-cfgs = load_config()
-
-
-class RedisDatabaseHelper:
-    _instance = None
-    
-    def __new__(cls):
-        if not cls._instance:
-            cls._instance = super(RedisDatabaseHelper, cls).__new__(cls)
-            cls._instance._initialized = False
-        return cls._instance
-        
-    def __init__(self):
-        if self._initialized:
-            return
-        self.redis = redis.StrictRedis(host=cfgs['redis']['host'],
-                                       port=cfgs['redis']['port'],
-                                       password=cfgs['redis']['passwd'],
-                                       db=cfgs['redis']['db'],
-                                       decode_responses=True)
-        
-        self._initialized = True
-
-
-if __name__ == '__main__':
-    import random
-    # 连接到 Redis 服务器
-    r = RedisDatabaseHelper().redis
-
-    # 有序集合的键名
-    zset_key = 'configs:hotkeys'
-
-    data_list = ['ORDER_FULLORDR_RATE', 'MONTH6_SALE_QTY_YOY', 'MONTH6_SALE_QTY_MOM', 'MONTH6_SALE_QTY']
-
-    # 清空已有的有序集合(可选,若需要全新的集合可执行此操作)
-    r.delete(zset_key)
-    
-    for item in data_list:
-        # 生成 80 到 100 之间的随机数,小数点后保留 4 位
-        score = round(random.uniform(80, 100), 4)
-        # 将元素和对应的分数添加到有序集合中
-        r.zadd(zset_key, {item: score})
-
-    # # 从 Redis 中读取有序集合并打印
-    # result = r.zrange(zset_key, 0, -1, withscores=True)
-    # for item, score in result:
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+import redis
+from config import load_config
+
+cfgs = load_config()
+
+
+class RedisDatabaseHelper:
+    _instance = None
+    
+    def __new__(cls):
+        if not cls._instance:
+            cls._instance = super(RedisDatabaseHelper, cls).__new__(cls)
+            cls._instance._initialized = False
+        return cls._instance
+        
+    def __init__(self):
+        if self._initialized:
+            return
+        self.redis = redis.StrictRedis(host=cfgs['redis']['host'],
+                                       port=cfgs['redis']['port'],
+                                       password=cfgs['redis']['passwd'],
+                                       db=cfgs['redis']['db'],
+                                       decode_responses=True)
+        
+        self._initialized = True
+
+
+if __name__ == '__main__':
+    import random
+    # 连接到 Redis 服务器
+    r = RedisDatabaseHelper().redis
+
+    # 有序集合的键名
+    zset_key = 'configs:hotkeys'
+
+    data_list = ['ORDER_FULLORDR_RATE', 'MONTH6_SALE_QTY_YOY', 'MONTH6_SALE_QTY_MOM', 'MONTH6_SALE_QTY']
+
+    # 清空已有的有序集合(可选,若需要全新的集合可执行此操作)
+    r.delete(zset_key)
+    
+    for item in data_list:
+        # 生成 80 到 100 之间的随机数,小数点后保留 4 位
+        score = round(random.uniform(80, 100), 4)
+        # 将元素和对应的分数添加到有序集合中
+        r.zadd(zset_key, {item: score})
+
+    # # 从 Redis 中读取有序集合并打印
+    # result = r.zrange(zset_key, 0, -1, withscores=True)
+    # for item, score in result:
     #     print(f"元素: {item}, 分数: {score}")

+ 163 - 163
gbdt_lr.py

@@ -1,164 +1,164 @@
-import argparse
-import os
-from models.rank import DataProcess, Trainer, GbdtLrModel
-import time
-import pandas as pd
-
-# train_data_path = "./moldes/rank/data/gbdt_data.csv"
-# model_path = "./models/rank/weights"
-
-def train(args):
-    model_dir = os.path.join(args.model_path, args.city_uuid)
-    train_data_dir = args.train_data_dir
-    if not os.path.exists(model_dir):
-        os.makedirs(model_dir)
-    
-    if not os.path.exists(train_data_dir):
-        os.makedirs(train_data_dir)
-    
-    # 准备数据集  
-    print("正在整合训练数据...")
-    processor = DataProcess(args.city_uuid, args.train_data_dir)
-    processor.data_process()
-    print("训练数据整合完成!")
-    
-    # 进行训练
-    print("开始训练原始模型")
-    trainer(args, os.path.join(args.train_data_dir, "original_train_data.csv"), model_dir, "ori_model.pkl")
-    
-    print("开始训练pos模型")
-    trainer(args, os.path.join(args.train_data_dir, "pos_train_data.csv"), model_dir, "pos_model.pkl")
-    
-    print("开始训练shopping模型")
-    trainer(args, os.path.join(args.train_data_dir, "shopping_train_data.csv"), model_dir, "shopping_model.pkl")
-
-def trainer(args, train_data_path, model_dir, model_name):
-    trainer = Trainer(train_data_path)
-    
-    start_time = time.time()
-    trainer.train()
-    end_time = time.time()
-    
-    training_time_hours = (end_time - start_time) / 3600
-    print(f"训练时间: {training_time_hours:.4f} 小时")
-    
-    eval_metrics = trainer.evaluate()
-    
-    # 输出评估结果
-    print("GBDT-LR Evaluation Metrics:")
-    for metric, value in eval_metrics.items():
-        print(f"{metric}: {value:.4f}")
-        
-    # 保存模型
-    trainer.save_model(os.path.join(model_dir, model_name))
-
-def recommend_by_product(args):
-    model_dir = os.path.join(args.model_path, args.city_uuid)
-    if not os.path.exists(model_dir):
-        print("暂无该城市的模型,请先进行模型训练")
-        return
-    
-    # 加载模型
-    model = GbdtLrModel(os.path.join(model_dir, args.model_name))
-    recommend_list = model.sort(args.city_uuid, args.product_id)
-    for item in recommend_list[:min(args.last_n, len(recommend_list))]:
-        print(item)
-
-def get_features_importance(args):
-    model_dir = os.path.join(args.model_path, args.city_uuid)
-    if not os.path.exists(model_dir):
-        print("暂无该城市的模型,请先进行模型训练")
-        return
-    
-    # # 加载模型
-    # model = GbdtLrModel(os.path.join(model_dir, args.model_name))
-    # cust_features_importance, product_features_importance = model.generate_feats_importance()
-    
-    # # 将字典列表转换为 DataFrame
-    # cust_df = pd.DataFrame([
-    #     {"Features": list(item.keys())[0], "Importance": list(item.values())[0]}
-    #     for item in cust_features_importance
-    # ])
-    
-    # product_df = pd.DataFrame([
-    #     {"Features": list(item.keys())[0], "Importance": list(item.values())[0]}
-    #     for item in product_features_importance
-    # ])
-    
-    # cust_file_path = os.path.join(model_dir, "cust_features_importance.csv")
-    # product_file_path = os.path.join(model_dir, "product_features_importance.csv")
-    # cust_df.to_csv(cust_file_path, index=False, encoding='utf-8')
-    # product_df.to_csv(product_file_path, index=False, encoding='utf-8')
-    
-    get_features_importance_by_model(model_dir, "ori_model")
-    get_features_importance_by_model(model_dir, "pos_model")
-    get_features_importance_by_model(model_dir, "shopping_model")
-    
-def get_features_importance_by_model(model_dir, modelname):
-    model = GbdtLrModel(os.path.join(model_dir, f"{modelname}.pkl"))
-    cust_features_importance, product_features_importance, order_features_importance = model.generate_feats_importance()
-    
-    # 将字典列表转换为 DataFrame
-    cust_df = pd.DataFrame([
-        {"Features": list(item.keys())[0], "Importance": list(item.values())[0]}
-        for item in cust_features_importance
-    ])
-    
-    product_df = pd.DataFrame([
-        {"Features": list(item.keys())[0], "Importance": list(item.values())[0]}
-        for item in product_features_importance
-    ])
-    
-    order_df = pd.DataFrame([
-        {"Features": list(item.keys())[0], "Importance": list(item.values())[0]}
-        for item in order_features_importance
-    ])
-    
-    importance_dir = os.path.join(model_dir, "importance")
-    if modelname == 'ori_model':
-        importance_dir = os.path.join(importance_dir, "ori")
-    elif modelname == 'pos_model':
-        importance_dir = os.path.join(importance_dir, "pos")
-    elif modelname == 'shopping_model':
-        importance_dir = os.path.join(importance_dir, "shopping")
-    
-    if not os.path.exists(importance_dir):
-        os.makedirs(importance_dir)
-        
-    cust_file_path = os.path.join(importance_dir, "cust_features_importance.csv")
-    product_file_path = os.path.join(importance_dir, "product_features_importance.csv")
-    order_file_path = os.path.join(importance_dir, "order_features_importance.csv")
-    
-    cust_df.to_csv(cust_file_path, index=False, encoding='utf-8')
-    product_df.to_csv(product_file_path, index=False, encoding='utf-8')
-    order_df.to_csv(order_file_path, index=False, encoding='utf-8')
-        
-def run():
-    parser = argparse.ArgumentParser()
-    
-    parser.add_argument("--run_train", action='store_true')
-    parser.add_argument("--recommend", action='store_true')
-    parser.add_argument("--importance", action='store_true')
-    
-    parser.add_argument("--train_data_dir", type=str, default="./data")
-    parser.add_argument("--model_path", type=str, default="./models/rank/weights")
-    parser.add_argument("--model_name", type=str, default='model.pkl')
-    parser.add_argument("--last_n", type=int, default=200)
-    
-    parser.add_argument("--city_uuid", type=str, default='00000000000000000000000011445301')
-    parser.add_argument("--product_id", type=str, default='110102')
-    
-    
-    args = parser.parse_args()
-    
-    if args.run_train:
-        train(args)
-        
-    if args.recommend:
-        recommend_by_product(args)
-        
-    if args.importance:
-        get_features_importance(args)
-        
-if __name__ == "__main__":
+import argparse
+import os
+from models.rank import DataProcess, Trainer, GbdtLrModel
+import time
+import pandas as pd
+
+# train_data_path = "./moldes/rank/data/gbdt_data.csv"
+# model_path = "./models/rank/weights"
+
+def train(args):
+    model_dir = os.path.join(args.model_path, args.city_uuid)
+    train_data_dir = args.train_data_dir
+    if not os.path.exists(model_dir):
+        os.makedirs(model_dir)
+    
+    if not os.path.exists(train_data_dir):
+        os.makedirs(train_data_dir)
+    
+    # 准备数据集  
+    print("正在整合训练数据...")
+    processor = DataProcess(args.city_uuid, args.train_data_dir)
+    processor.data_process()
+    print("训练数据整合完成!")
+    
+    # 进行训练
+    print("开始训练原始模型")
+    trainer(args, os.path.join(args.train_data_dir, "original_train_data.csv"), model_dir, "ori_model.pkl")
+    
+    print("开始训练pos模型")
+    trainer(args, os.path.join(args.train_data_dir, "pos_train_data.csv"), model_dir, "pos_model.pkl")
+    
+    print("开始训练shopping模型")
+    trainer(args, os.path.join(args.train_data_dir, "shopping_train_data.csv"), model_dir, "shopping_model.pkl")
+
+def trainer(args, train_data_path, model_dir, model_name):
+    trainer = Trainer(train_data_path)
+    
+    start_time = time.time()
+    trainer.train()
+    end_time = time.time()
+    
+    training_time_hours = (end_time - start_time) / 3600
+    print(f"训练时间: {training_time_hours:.4f} 小时")
+    
+    eval_metrics = trainer.evaluate()
+    
+    # 输出评估结果
+    print("GBDT-LR Evaluation Metrics:")
+    for metric, value in eval_metrics.items():
+        print(f"{metric}: {value:.4f}")
+        
+    # 保存模型
+    trainer.save_model(os.path.join(model_dir, model_name))
+
+def recommend_by_product(args):
+    model_dir = os.path.join(args.model_path, args.city_uuid)
+    if not os.path.exists(model_dir):
+        print("暂无该城市的模型,请先进行模型训练")
+        return
+    
+    # 加载模型
+    model = GbdtLrModel(os.path.join(model_dir, args.model_name))
+    recommend_list = model.sort(args.city_uuid, args.product_id)
+    for item in recommend_list[:min(args.last_n, len(recommend_list))]:
+        print(item)
+
+def get_features_importance(args):
+    model_dir = os.path.join(args.model_path, args.city_uuid)
+    if not os.path.exists(model_dir):
+        print("暂无该城市的模型,请先进行模型训练")
+        return
+    
+    # # 加载模型
+    # model = GbdtLrModel(os.path.join(model_dir, args.model_name))
+    # cust_features_importance, product_features_importance = model.generate_feats_importance()
+    
+    # # 将字典列表转换为 DataFrame
+    # cust_df = pd.DataFrame([
+    #     {"Features": list(item.keys())[0], "Importance": list(item.values())[0]}
+    #     for item in cust_features_importance
+    # ])
+    
+    # product_df = pd.DataFrame([
+    #     {"Features": list(item.keys())[0], "Importance": list(item.values())[0]}
+    #     for item in product_features_importance
+    # ])
+    
+    # cust_file_path = os.path.join(model_dir, "cust_features_importance.csv")
+    # product_file_path = os.path.join(model_dir, "product_features_importance.csv")
+    # cust_df.to_csv(cust_file_path, index=False, encoding='utf-8')
+    # product_df.to_csv(product_file_path, index=False, encoding='utf-8')
+    
+    get_features_importance_by_model(model_dir, "ori_model")
+    get_features_importance_by_model(model_dir, "pos_model")
+    get_features_importance_by_model(model_dir, "shopping_model")
+    
+def get_features_importance_by_model(model_dir, modelname):
+    model = GbdtLrModel(os.path.join(model_dir, f"{modelname}.pkl"))
+    cust_features_importance, product_features_importance, order_features_importance = model.generate_feats_importance()
+    
+    # 将字典列表转换为 DataFrame
+    cust_df = pd.DataFrame([
+        {"Features": list(item.keys())[0], "Importance": list(item.values())[0]}
+        for item in cust_features_importance
+    ])
+    
+    product_df = pd.DataFrame([
+        {"Features": list(item.keys())[0], "Importance": list(item.values())[0]}
+        for item in product_features_importance
+    ])
+    
+    order_df = pd.DataFrame([
+        {"Features": list(item.keys())[0], "Importance": list(item.values())[0]}
+        for item in order_features_importance
+    ])
+    
+    importance_dir = os.path.join(model_dir, "importance")
+    if modelname == 'ori_model':
+        importance_dir = os.path.join(importance_dir, "ori")
+    elif modelname == 'pos_model':
+        importance_dir = os.path.join(importance_dir, "pos")
+    elif modelname == 'shopping_model':
+        importance_dir = os.path.join(importance_dir, "shopping")
+    
+    if not os.path.exists(importance_dir):
+        os.makedirs(importance_dir)
+        
+    cust_file_path = os.path.join(importance_dir, "cust_features_importance.csv")
+    product_file_path = os.path.join(importance_dir, "product_features_importance.csv")
+    order_file_path = os.path.join(importance_dir, "order_features_importance.csv")
+    
+    cust_df.to_csv(cust_file_path, index=False, encoding='utf-8')
+    product_df.to_csv(product_file_path, index=False, encoding='utf-8')
+    order_df.to_csv(order_file_path, index=False, encoding='utf-8')
+        
+def run():
+    parser = argparse.ArgumentParser()
+    
+    parser.add_argument("--run_train", action='store_true')
+    parser.add_argument("--recommend", action='store_true')
+    parser.add_argument("--importance", action='store_true')
+    
+    parser.add_argument("--train_data_dir", type=str, default="./data")
+    parser.add_argument("--model_path", type=str, default="./models/rank/weights")
+    parser.add_argument("--model_name", type=str, default='model.pkl')
+    parser.add_argument("--last_n", type=int, default=200)
+    
+    parser.add_argument("--city_uuid", type=str, default='00000000000000000000000011445301')
+    parser.add_argument("--product_id", type=str, default='110102')
+    
+    
+    args = parser.parse_args()
+    
+    if args.run_train:
+        train(args)
+        
+    if args.recommend:
+        recommend_by_product(args)
+        
+    if args.importance:
+        get_features_importance(args)
+        
+if __name__ == "__main__":
     run()

+ 96 - 96
gbdt_lr_api.py

@@ -1,97 +1,97 @@
-import argparse
-import os
-from models.rank import DataProcess, Trainer, GbdtLrModel
-import time
-import pandas as pd
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
-
-app = FastAPI()
-
-model_path = "./models/rank/weights"
-model_name = "model.pkl"
-
-# 定义请求体
-class TrainRequest(BaseModel):
-    city_uuid: str
-    train_data_path: str = "./models/rank/train_data/gbdt_data.csv"
-    model_path: str = model_path
-    model_name: str = model_name
-    
-class RecommendRequest(BaseModel):
-    city_uuid: str
-    product_id: str
-    last_n: int = 200
-    model_path: str = model_path
-    model_name: str = model_name
-    
-class ImportanceRequest(BaseModel):
-    city_uuid: str
-    model_path: str = model_path
-    model_name: str = model_name
-    
-@app.post("/train")
-def train(request: TrainRequest):
-    model_dir = os.path.join(request.model_path, request.city_uuid)
-    train_data_dir = os.path.dirname(request.train_data_path)
-    if not os.path.exists(model_dir):
-        os.makedirs(model_dir)
-    
-    if not os.path.exists(train_data_dir):
-        os.makedirs(train_data_dir)
-        
-    # 准备数据集  
-    print("正在整合训练数据...")
-    processor = DataProcess(request.city_uuid, request.train_data_path)
-    processor.data_process()
-    print("训练数据整合完成!")
-    
-    # 进行训练
-    trainer = Trainer(request.train_data_path)
-    
-    start_time = time.time()
-    trainer.train()
-    end_time = time.time()
-    
-    training_time_hours = (end_time - start_time) / 3600
-    print(f"训练时间: {training_time_hours:.4f} 小时")
-    
-    eval_metrics = trainer.evaluate()
-    
-    # 保存模型
-    trainer.save_model(os.path.join(model_dir, request.model_name))
-    
-    # 输出评估结果
-    print("GBDT-LR Evaluation Metrics:")
-    for metric, value in eval_metrics.items():
-        print(f"{metric}: {value:.4f}")
-    
-    return {"message": "训练完成!"}
-
-@app.post("/recommend")
-def recommend(request: RecommendRequest):
-    model_dir = os.path.join(request.model_path, request.city_uuid)
-    if not os.path.exists(model_dir):
-        raise HTTPException(status_code=404, detail="暂无该城市的模型,请先进行模型训练")
-    
-    # 加载模型
-    model = GbdtLrModel(os.path.join(model_dir, request.model_name))
-    recommend_list = model.sort(request.city_uuid, request.product_id)
-    
-    return {"recommendations": recommend_list[:min(request.last_n, len(recommend_list))]}
-
-@app.post("/importance")
-def importance(request: ImportanceRequest):
-    model_dir = os.path.join(request.model_path, request.city_uuid)
-    if not os.path.exists(model_dir):
-        raise HTTPException(status_code=404, detail="暂无该城市的模型,请先进行模型训练")
-    
-    # 加载模型
-    model = GbdtLrModel(os.path.join(model_dir, request.model_name))
-    cust_features_importance, product_features_importance = model.generate_feats_importance()
-    
-    return {"cust_features_importance": cust_features_importance, "product_features_importance": product_features_importance}
-
-if __name__ == "__main__":
-    import uvicorn
+import argparse
+import os
+from models.rank import DataProcess, Trainer, GbdtLrModel
+import time
+import pandas as pd
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+
+app = FastAPI()
+
+model_path = "./models/rank/weights"
+model_name = "model.pkl"
+
+# 定义请求体
+class TrainRequest(BaseModel):
+    city_uuid: str
+    train_data_path: str = "./models/rank/train_data/gbdt_data.csv"
+    model_path: str = model_path
+    model_name: str = model_name
+    
+class RecommendRequest(BaseModel):
+    city_uuid: str
+    product_id: str
+    last_n: int = 200
+    model_path: str = model_path
+    model_name: str = model_name
+    
+class ImportanceRequest(BaseModel):
+    city_uuid: str
+    model_path: str = model_path
+    model_name: str = model_name
+    
+@app.post("/train")
+def train(request: TrainRequest):
+    model_dir = os.path.join(request.model_path, request.city_uuid)
+    train_data_dir = os.path.dirname(request.train_data_path)
+    if not os.path.exists(model_dir):
+        os.makedirs(model_dir)
+    
+    if not os.path.exists(train_data_dir):
+        os.makedirs(train_data_dir)
+        
+    # 准备数据集  
+    print("正在整合训练数据...")
+    processor = DataProcess(request.city_uuid, request.train_data_path)
+    processor.data_process()
+    print("训练数据整合完成!")
+    
+    # 进行训练
+    trainer = Trainer(request.train_data_path)
+    
+    start_time = time.time()
+    trainer.train()
+    end_time = time.time()
+    
+    training_time_hours = (end_time - start_time) / 3600
+    print(f"训练时间: {training_time_hours:.4f} 小时")
+    
+    eval_metrics = trainer.evaluate()
+    
+    # 保存模型
+    trainer.save_model(os.path.join(model_dir, request.model_name))
+    
+    # 输出评估结果
+    print("GBDT-LR Evaluation Metrics:")
+    for metric, value in eval_metrics.items():
+        print(f"{metric}: {value:.4f}")
+    
+    return {"message": "训练完成!"}
+
+@app.post("/recommend")
+def recommend(request: RecommendRequest):
+    model_dir = os.path.join(request.model_path, request.city_uuid)
+    if not os.path.exists(model_dir):
+        raise HTTPException(status_code=404, detail="暂无该城市的模型,请先进行模型训练")
+    
+    # 加载模型
+    model = GbdtLrModel(os.path.join(model_dir, request.model_name))
+    recommend_list = model.sort(request.city_uuid, request.product_id)
+    
+    return {"recommendations": recommend_list[:min(request.last_n, len(recommend_list))]}
+
+@app.post("/importance")
+def importance(request: ImportanceRequest):
+    model_dir = os.path.join(request.model_path, request.city_uuid)
+    if not os.path.exists(model_dir):
+        raise HTTPException(status_code=404, detail="暂无该城市的模型,请先进行模型训练")
+    
+    # 加载模型
+    model = GbdtLrModel(os.path.join(model_dir, request.model_name))
+    cust_features_importance, product_features_importance = model.generate_feats_importance()
+    
+    return {"cust_features_importance": cust_features_importance, "product_features_importance": product_features_importance}
+
+if __name__ == "__main__":
+    import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=8000)

+ 11 - 11
models/__init__.py

@@ -1,12 +1,12 @@
-#!/usr/bin/env python3
-# -*- coding:utf-8 -*-
-from models.recall.hot_recall import HotRecallModel
-from models.recall.itemCF.calculate_similarity_matrix import calculate_similarity_and_save_results
-from models.recall.itemCF.user_item_score import UserItemScore
-from models.recall.itemCF.ItemCF import ItemCFModel
-__all__ = [
-    "HotRecallModel",
-    "UserItemScore",
-    "calculate_similarity_and_save_results",
-    "ItemCFModel"
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+from models.recall.hot_recall import HotRecallModel
+from models.recall.itemCF.calculate_similarity_matrix import calculate_similarity_and_save_results
+from models.recall.itemCF.user_item_score import UserItemScore
+from models.recall.itemCF.ItemCF import ItemCFModel
+__all__ = [
+    "HotRecallModel",
+    "UserItemScore",
+    "calculate_similarity_and_save_results",
+    "ItemCFModel"
 ]

+ 10 - 10
models/rank/__init__.py

@@ -1,11 +1,11 @@
-#!/usr/bin/env python3
-# -*- coding:utf-8 -*-
-from models.rank.data.preprocess import DataProcess
-from models.rank.gbdt_lr import Trainer
-from models.rank.gbdt_lr_sort import GbdtLrModel
-
-__all__ = [
-    "DataProcess",
-    "Trainer",
-    "GbdtLrModel"
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+from models.rank.data.preprocess import DataProcess
+from models.rank.gbdt_lr import Trainer
+from models.rank.gbdt_lr_sort import GbdtLrModel
+
+__all__ = [
+    "DataProcess",
+    "Trainer",
+    "GbdtLrModel"
 ]

+ 12 - 11
models/rank/data/__init__.py

@@ -1,12 +1,13 @@
-from models.rank.data.config import CustConfig, ProductConfig, OrderConfig, ImportanceFeaturesMap
-from models.rank.data.dataloader import DataLoader
-from models.rank.data.utils import one_hot_embedding, sample_data_clear
-__all__ = [
-    "CustConfig",
-    "ProductConfig",
-    "OrderConfig",
-    "DataLoader",
-    "one_hot_embedding",
-    "sample_data_clear",
-    "ImportanceFeaturesMap"
+from models.rank.data.config import CustConfig, ProductConfig, OrderConfig, ShopConfig, ImportanceFeaturesMap
+from models.rank.data.dataloader import DataLoader
+from models.rank.data.utils import one_hot_embedding, sample_data_clear
+__all__ = [
+    "CustConfig",
+    "ProductConfig",
+    "OrderConfig",
+    "ShopConfig",
+    "DataLoader",
+    "one_hot_embedding",
+    "sample_data_clear",
+    "ImportanceFeaturesMap"
 ]

+ 867 - 447
models/rank/data/config.py

@@ -1,447 +1,867 @@
-class CustConfig:
-    FEATURE_COLUMNS = [
-        "BB_RETAIL_CUSTOMER_CODE",              # 零售户代码
-        "BB_RTL_CUST_MARKET_TYPE_NAME",         # 零售户市场类型名称
-        "BB_RTL_CUST_BUSINESS_TYPE_NAME",       # 零售客户业态名称
-        "BB_RTL_CUST_CHAIN_FLAG",               # 零售户连锁标识
-        "MD04_MG_RTL_CUST_CREDITCLASS_NAME",    # 零售户信用等级名称
-        "MD04_DIR_SAL_STORE_FLAG",              # 直营店标识
-        "BB_CUSTOMER_MANAGER_SCOPE_NAME",       # 零售户经营范围名称
-        "BB_RTL_CUST_TERMINAL_LEVEL_NAME",      # 零售户终端层级名称
-        "OPERATOR_EDU",                         # 零售客户经营者文化程度
-        "STORE_AREA",                           # 店铺经营面积
-        "OPERATOR_AGE",                         # 经营者年龄
-        "PRODUCT_INSALE_QTY",                   # 在销品规数
-    ]
-    
-    ONEHOT_CAT = {
-        "BB_RTL_CUST_MARKET_TYPE_NAME":           ["城网", "农网"],
-        "BB_RTL_CUST_BUSINESS_TYPE_NAME":         ["便利店", "超市", "烟草专业店", "娱乐服务类", "其他"],
-        "BB_RTL_CUST_CHAIN_FLAG":                 ["是", "否"],
-        "MD04_MG_RTL_CUST_CREDITCLASS_NAME":      ["AAA", "AA", "A", "B", "C", "D"],
-        "MD04_DIR_SAL_STORE_FLAG":                ["是", "否"],
-        "BB_CUSTOMER_MANAGER_SCOPE_NAME":         ["是", "否"],
-        "BB_RTL_CUST_TERMINAL_LEVEL_NAME":        ["普通终端", "一般现代终端", "合作终端", "加盟终端", "直营终端"],
-        "OPERATOR_EDU":                           [1, 2, 3, 4, 5, 6, 7, "无数据"],
-        "STORE_AREA":                             ["0-20", "21-50", "51-100", "101-150", "151-200", "201-300", "301-400", "401-600", "601-1000", "1001-2000", "2000以上"],
-        "OPERATOR_AGE":                           ["19-30", "31-40", "41-50", "51-65", "66-80", "80以上"],
-        "PRODUCT_INSALE_QTY":                     ["0-10", "11-20", "21-30", "31-40", "41-50", "51-60", 
-                                                   "61-70", "71-80", "81-90", "91-100", "101-110", "111-120", 
-                                                   "121-130", "131-140", "141-150", "151-160", "161-170", "171-180", 
-                                                   "181-190", "191-200", "201-210", "211-220", "221-230", "231-240", 
-                                                   "241-250", "251-260", "261-270", "271-280", "281-290", "291-350"],
-    }
-
-class ProductConfig:
-    FEATURE_COLUMNS = [
-        "city_uuid",                     # 地市id
-        "product_code",                  # 商品编码
-        "factory_name",                  # 产地(工业公司名称)
-        "brand_name",                    # 品牌名称
-        "is_low_tar",                    # 低焦油卷烟
-        "is_medium",                     # 中支烟
-        "is_tiny",                       # 细支烟
-        "is_coarse",                     # 粗支烟(同时非中非细)
-        "is_exploding_beads",            # 爆珠烟
-        "no_is_exploding_beads",         # 非爆珠烟
-        "is_abnormity",                  # 异形包装
-        "no_is_abnormity",               # 非异形包装
-        "is_cig",                        # 雪茄烟
-        "no_is_cig",                     # 非雪茄烟
-        "is_chuangxin",                  # 创新品类
-        "no_is_chuangxin",               # 非创新品类
-        "direct_retail_price",           # 卷烟建议零售价
-        "tbc_total_length",              # 烟支总长度
-        "product_style",                 # 包装类型
-    ]
-    
-    ONEHOT_CAT = {
-        "factory_name":                    ["安徽中烟", "澳门云福卷烟厂", "北欧烟草集团", "博格集团", "重庆中烟", "川渝中烟", "菲利普莫里斯亚洲", 
-                                            "福建中烟", "甘肃工业", "广东中烟", "广西中烟", "贵州中烟", "海南红塔", "河北中烟", "河南中烟", 
-                                            "黑龙江工业", "红塔辽宁烟草", "湖北中烟", "湖南中烟", "吉林工业", "家源开发股份有限公司", 
-                                            "嘉莱赫国际有限公司", "江苏中烟", "江西中烟", "凯德控股有限公司", "力量雪茄烟草有限公司", 
-                                            "南洋兄弟烟草股份", "内蒙古昆明卷烟", "日本烟草(香港)有限公司", "三宝麟国际集团", "厦门调拨站", 
-                                            "山东中烟", "山西昆明烟草", "陕西中烟", "上海烟草(集团)公司", "上海烟草公司", "深圳工业", "四川中烟", 
-                                            "特富意烟草(国际)", "雪茄客烟草国际贸易有限公司", "耀莱雪茄控股有限公司", "引领国际有限公司", 
-                                            "英飞烽香港有限公司", "英美烟草中国有限公司", "云南中烟", "浙江中烟", "中茄国际贸易有限公司", 
-                                            "中烟英美烟草国际有限公司", "株式会社 KT&G"],
-        "brand_name":                      ["万宝路", "555", "骆驼(国外)", "大华", "娇子", "大青山", "龙凤呈祥", "黄鹤楼", "真龙", "七匹狼", 
-                                            "芙蓉王", "双喜(广)", "贵烟", "钓鱼台", "红双喜(南洋)", "云烟", "蒙特", "富恩特", "拉·加莱拉", "苏烟", 
-                                            "丹纳曼", "黄山", "南京", "利群", "金桥", "泰山", "好日子", "石林", "美登", "红河", "嘉辉", "七星", 
-                                            "都彭", "天下秀", "长城", "高希霸", "钻石", "金圣", "王冠雪茄", "黄金叶", "中南海", "长白山", "红旗渠", 
-                                            "建牌", "大卫杜夫", "罗密欧", "茂大", "红金龙", "天子", "熊猫", "双喜(深)", "大前门", "兰州", 
-                                            "红双喜(沪)", "雄狮", "广州", "红玫王", "黄果树", "红塔山", "福", "小熊猫", "爱喜", "蒙特利", "玉溪", 
-                                            "都宝", "麦克纽杜", "卡里罗", "中华", "牡丹(沪)", "阿里山", "顺百利", "白沙", "羊城", "白云", 
-                                            "特美思", "国宾", "帕特加", "比德奥", "冬虫夏草", "威龙(湛江)", "香格里拉", "红梅", "延安", 
-                                            "特富意", "石狮", "金香港", "好猫", "登喜路", "乐迪", "林海灵芝", "椰树", "北京", "大红鹰", "大丰收", 
-                                            "红双喜(武汉)", "五叶神", "狮", "优民", "将军", "遵义", "恒大", "飞马", "红三环", "芙蓉", "工字", 
-                                            "古田", "狮牌", "君力", "哈尔滨", "梦都", "香梅(阜阳)", "哈德门", "梅州", "红山茶", "猴王", "沙龙", 
-                                            "潘趣", "狮子牌", "上海", "红玫", "醒宝", "广州湾", "百乐门", "关塔那摩", "威斯", "五一", "寿百年", 
-                                            "人民大会堂", "土楼", "三沙", "西湖", "光明", "阿诗玛", "宝亨", "恭贺新禧", "长寿", "茶花", "迎客松", 
-                                            "龙烟", "金澳门", "宝岛", "多米尼加之花", "国喜", "金驼", "君特欧", "上游", "幸福", "春城", "吉庆", 
-                                            "黄山松", "黄金龙", "紫气东来", "彼亚赛", "银辉", "潮牌", "庐山", "三峡", "壹支笔", "双叶"],
-        "is_low_tar":                      ["是", "否"],
-        "is_medium":                       ["是", "否"],
-        "is_tiny":                         ["是", "否"],
-        "is_coarse":                       ["是", "否"],
-        "is_exploding_beads":              ["是", "否"],
-        "no_is_exploding_beads":           ["是", "否"],
-        "is_abnormity":                    ["是", "否"],
-        "no_is_abnormity":                 ["是", "否"],
-        "is_cig":                          ["是", "否"],
-        "no_is_cig":                       ["是", "否"],
-        "is_chuangxin":                    ["是", "否"],
-        "no_is_chuangxin":                 ["是", "否"],
-        "direct_retail_price":             ["0-10", "10-30", "31-50", "51-100", "10-19.9", "250-499.9", "200-249.9", 
-                                            "5-9.9", "0-5", "100-109.9", "150-199.9", "101-150", "120-129.9", "大于500", 
-                                            "20-29.9", "30-39.9", "140-149.9", "50-59.9", "40-49.9", "80-89.9", "60-69.9", 
-                                            "70-79.9", "大于150", "130-139.9", "90-99.9", "110-119.9"],
-        "tbc_total_length":                ["小于79", "80-89", "90-100", "大于120"],
-        "product_style":                   ["包装类型(条盒硬盒)", "包装类型(条包硬盒)", "包装类型(条盒软盒)", "包装类型(条包软盒)", "包装类型(铁盒)", "包装类型(其它)"],
-    }
-
-class OrderConfig:
-    FEATURE_COLUMNS = [
-        "cust_uuid",                      # 零售户uuid
-        "cust_code",                      # 零售户编码
-        "product_code",                   # 品牌规格编码
-        "sale_qty",                       # 销量包
-        "sale_qty_l",                     # 销量上期
-        "sale_qty_hb",                    # 销量环比
-        "sale_amt",                       # 销售额包
-    ]
-    
-
-class ShopConfig:
-    FEATURE_COLUMNS = [
-        "cust_code",                      # 客户编码
-        "r_home_num",                     # 常驻人口_居住人数
-        "r_work_num",                     # 常驻人口_工作人数
-        "r_resident_num",                 # 常驻人口_工作或居住人数
-        "r_urban_cons_middle",            # 常驻人口_城市消费水平_中
-        "r_urban_cons_low",               # 常驻人口_城市消费水平_低
-        "r_urban_cons_lower",             # 常驻人口_城市消费水平_次低
-        "r_urban_cons_secondhigh",        # 常驻人口_城市消费水平_次高
-        "r_urban_cons_high",              # 常驻人口_城市消费水平_高
-        "r_edu_junior_middle",            # 常驻人口_学历_初中
-        "r_edu_doctor",                   # 常驻人口_学历_博士
-        "r_edu_specialty",                # 常驻人口_学历_大专
-        "r_edu_primary",                  # 常驻人口_学历_小学
-        "r_edu_college",                  # 常驻人口_学历_本科
-        "r_edu_postgraduate",             # 常驻人口_学历_硕士
-        "r_edu_senior_middle",            # 常驻人口_学历_高中
-        "r_house_price79999",             # 常驻人口_居住社区房价_60000_79999
-        "r_house_price59999",             # 常驻人口_居住社区房价_40000_59999
-        "r_house_price39999",             # 常驻人口_居住社区房价_20000_39999
-        "r_house_price19999",             # 常驻人口_居住社区房价_10000_19999
-        "r_house_price9999",              # 常驻人口_居住社区房价_8000_9999
-        "r_house_price7999",              # 常驻人口_居住社区房价_5000_7999
-        "r_house_price4999",              # 常驻人口_居住社区房价_2000_4999
-        "r_age_17",                       # 常驻人口_年龄_0_17
-        "r_age_24",                       # 常驻人口_年龄_18_24
-        "r_age_30",                       # 常驻人口_年龄_25_30
-        "r_age_35",                       # 常驻人口_年龄_31_35
-        "r_age_40",                       # 常驻人口_年龄_36_40
-        "r_age_45",                       # 常驻人口_年龄_41_45
-        "r_age_60",                       # 常驻人口_年龄_46_60
-        "r_age_over_60",                  # 常驻人口_年龄_61以上
-        "r_sex_woman",                    # 常驻人口_性别_女
-        "r_sex_man",                      # 常驻人口_性别_男
-        "r_catering_50",                  # 常驻人口_餐饮消费水平_50
-        "r_catering_100",                 # 常驻人口_餐饮消费水平_100
-        "r_catering_150",                 # 常驻人口_餐饮消费水平_150
-        "r_catering_200",                 # 常驻人口_餐饮消费水平_200
-        "r_catering_500",                 # 常驻人口_餐饮消费水平_500
-        "r_catering_over_500",            # 常驻人口_餐饮消费水平_500以上
-        "r_catering_times_2",             # 常驻人口_餐饮消费频次_1_2
-        "r_catering_times_4",             # 常驻人口_餐饮消费频次_2_4
-        "r_catering_times_6",             # 常驻人口_餐饮消费频次_4_6
-        "r_catering_times_8",             # 常驻人口_餐饮消费频次_6_8
-        "r_catering_times_10",            # 常驻人口_餐饮消费频次_8_10
-        "r_catering_times_11",            # 常驻人口_餐饮消费频次_11以上
-        "r_native_beijing",               # 常驻人口_家乡地_北京市
-        "r_native_tianjing",              # 常驻人口_家乡地_天津市
-        "r_native_hebei",                 # 常驻人口_家乡地_河北省
-        "r_native_shanxi",                # 常驻人口_家乡地_山西省
-        "r_native_neimeng",               # 常驻人口_家乡地_内蒙古
-        "r_native_liaoning",              # 常驻人口_家乡地_辽宁省
-        "r_native_jilin",                 # 常驻人口_家乡地_吉林省
-        "r_native_heilongjiang",          # 常驻人口_家乡地_黑龙江省
-        "r_native_shanghai",              # 常驻人口_家乡地_上海市
-        "r_native_jiangsu",               # 常驻人口_家乡地_江苏省
-        "r_native_zhejiang",              # 常驻人口_家乡地_浙江省
-        "r_native_anhui",                 # 常驻人口_家乡地_安徽省
-        "r_native_fujian",                # 常驻人口_家乡地_福建省
-        "r_native_jiangix",               # 常驻人口_家乡地_江西省
-        "r_native_shandong",              # 常驻人口_家乡地_山东省
-        "r_native_henan",                 # 常驻人口_家乡地_河南省
-        "r_native_hubei",                 # 常驻人口_家乡地_湖北省
-        "r_native_hunan",                 # 常驻人口_家乡地_湖南省
-        "r_native_guangdong",             # 常驻人口_家乡地_广东省
-        "r_native_hainan",                # 常驻人口_家乡地_海南省
-        "r_native_sichuan",               # 常驻人口_家乡地_四川省
-        "r_native_guizhou",               # 常驻人口_家乡地_贵州省
-        "r_native_yunnan",                # 常驻人口_家乡地_云南省
-        "r_native_shan",                  # 常驻人口_家乡地_陕西省
-        "r_native_gansu",                 # 常驻人口_家乡地_甘肃省
-        "r_native_qinghai",               # 常驻人口_家乡地_青海省
-        "r_native_guangxi",               # 常驻人口_家乡地_广西壮族自治区
-        "r_native_ningxia",               # 常驻人口_家乡地_宁夏回族自治区
-        "r_native_xinjiang",              # 常驻人口_家乡地_新疆维吾尔自治区
-        "r_native_xizang",                # 常驻人口_家乡地_西藏自治区
-        "r_native_chongqing",             # 常驻人口_家乡地_重庆市
-        "r_native_hongkong",              # 常驻人口_家乡地_香港
-        "r_native_macao",                 # 常驻人口_家乡地_澳门
-        "r_native_taiwan",                # 常驻人口_家乡地_台湾
-        "r_native_other",                 # 常驻人口_家乡地_其它
-        "f_flow_num",                     # 流动人口_工作日_日均流动人口数量
-        "f_holiday_flow_num",             # 流动人口_节假日_日均流动人口数量
-        "f_workday_flow_num",             # 流动人口_日均流动人口数量
-        "f_flowurban_cons_middle",        # 日均流动_城市消费水平_中
-        "f_flowurban_cons_low",           # 日均流动_城市消费水平_低
-        "f_flowurban_cons_lower",         # 日均流动_城市消费水平_次低
-        "f_flowurban_cons_second_high",   # 日均流动_城市消费水平_次高
-        "f_flowurban_cons_high",          # 日均流动_城市消费水平_高
-        "f_flowedu_junior_middle",        # 日均流动_学历_初中
-        "f_flowedu_doctor",               # 日均流动_学历_博士
-        "f_flowedu_specialty",            # 日均流动_学历_大专
-        "f_flowedu_primary",              # 日均流动_学历_小学
-        "f_flowedu_college",              # 日均流动_学历_本科
-        "f_flowedu_postgraduate",         # 日均流动_学历_硕士
-        "f_flowedu_senior_middle",        # 日均流动_学历_高中
-        "f_flowhouse_middle",             # 日均流动_居住社区房价_中
-        "f_flowhouse_low",                # 日均流动_居住社区房价_低
-        "f_flowhouse_lower",              # 日均流动_居住社区房价_次低
-        "f_flowhouse_second_high",        # 日均流动_居住社区房价_次高
-        "f_flowhouse_high",               # 日均流动_居住社区房价_高
-        "f_flowage_17",                   # 日均流动_年龄_0_17
-        "f_flowage_24",                   # 日均流动_年龄_18_24
-        "f_flowage_30",                   # 日均流动_年龄_25_30
-        "f_flowage_35",                   # 日均流动_年龄_31_35
-        "f_flowage_40",                   # 日均流动_年龄_36_40
-        "f_flowage_45",                   # 日均流动_年龄_41_45
-        "f_flowage_60",                   # 日均流动_年龄_46_60
-        "f_flowage_over_60",              # 日均流动_年龄_61以上
-        "f_flowsex_woman",                # 日均流动_性别_女
-        "f_flowsex_man",                  # 日均流动_性别_男
-        "f_holidayurban_cons_middle",     # 节假日流动_城市消费水平_中
-        "f_holidayurban_cons_low",        # 节假日流动_城市消费水平_低
-        "f_holidayurban_cons_lower",      # 节假日流动_城市消费水平_次低
-        "f_holidayurban_cons_secondhigh", # 节假日流动_城市消费水平_次高
-        "f_holidayurban_cons_high",       # 节假日流动_城市消费水平_高
-        "f_holidayedu_junior_middle",     # 节假日流动_学历_初中
-        "f_holidayedu_doctor",            # 节假日流动_学历_博士
-        "f_holidayedu_specialty",         # 节假日流动_学历_大专
-        "f_holidayedu_primary",           # 节假日流动_学历_小学
-        "f_holidayedu_college",           # 节假日流动_学历_本科
-        "f_holidayedu_postgraduate",      # 节假日流动_学历_硕士
-        "f_holidayedu_senior_middle",     # 节假日流动_学历_高中
-        "f_holidayhouse_middle",          # 节假日流动_居住社区房价_中
-        "f_holidayhouse_low",             # 节假日流动_居住社区房价_低
-        "f_holidayhouse_lower",           # 节假日流动_居住社区房价_次低
-        "f_holidayhouse_second_high",     # 节假日流动_居住社区房价_次高
-        "f_holidayhouse_high",            # 节假日流动_居住社区房价_高
-        "f_holidayage_17",                # 节假日流动_年龄_0_17
-        "f_holidayage_24",                # 节假日流动_年龄_18_24
-        "f_holidayage_30",                # 节假日流动_年龄_25_30
-        "f_holidayage_35",                # 节假日流动_年龄_31_35
-        "f_holidayage_40",                # 节假日流动_年龄_36_40
-        "f_holidayage_45",                # 节假日流动_年龄_41_45
-        "f_holidayage_60",                # 节假日流动_年龄_46_60
-        "f_holidayage_over_60",           # 节假日流动_年龄_61以上
-        "f_holidaysex_woman",             # 节假日流动_性别_女
-        "f_holidaysex_man",               # 节假日流动_性别_男
-        "f_workday_urban_cons_middle",    # 工作日流动_城市消费水平_中
-        "f_workday_urban_cons_low",       # 工作日流动_城市消费水平_低
-        "f_workday_urban_cons_lower",     # 工作日流动_城市消费水平_次低
-        "f_workday_urban_cons_secondhigh",# 工作日流动_城市消费水平_次高
-        "f_workday_urban_cons_high",      # 工作日流动_城市消费水平_高
-        "f_workday_edu_junior_middle",    # 工作日流动_学历_初中
-        "f_workday_edu_doctor",           # 工作日流动_学历_博士
-        "f_workday_edu_specialty",        # 工作日流动_学历_大专
-        "f_workday_edu_primary",          # 工作日流动_学历_小学
-        "f_workday_edu_college",          # 工作日流动_学历_本科
-        "f_workday_edu_postgraduate",     # 工作日流动_学历_硕士
-        "f_workday_edu_senior_middle",    # 工作日流动_学历_高中
-        "f_workday_house_middle",         # 工作日流动_居住社区房价_中
-        "f_workday_house_low",            # 工作日流动_居住社区房价_低
-        "f_workday_house_lower",          # 工作日流动_居住社区房价_次低
-        "f_workday_house_second_high",    # 工作日流动_居住社区房价_次高
-        "f_workday_house_high",           # 工作日流动_居住社区房价_高
-        "f_workday_age_17",               # 工作日流动_年龄_0_17
-        "f_workday_age_24",               # 工作日流动_年龄_18_24
-        "f_workday_age_30",               # 工作日流动_年龄_25_30
-        "f_workday_age_35",               # 工作日流动_年龄_31_35
-        "f_workday_age_40",               # 工作日流动_年龄_36_40
-        "f_workday_age_45",               # 工作日流动_年龄_41_45
-        "f_workday_age_60",               # 工作日流动_年龄_46_60
-        "f_workday_age_over_60",          # 工作日流动_年龄_61以上
-        "f_workday_sex_woman",            # 工作日流动_性别_女
-        "f_workday_sex_man",              # 工作日流动_性别_男
-    ]
-    
-    ONEHOT_CAT = {
-        "r_home_num":                        ["0-100", "101-500", "501-2000", "2001-5000", "5001-10000", "10000以上"],
-        "r_work_num":                        ["0-100", "101-500", "501-2000", "2001-5000", "5001-10000", "10000以上"],
-        "r_resident_num":                    ["0-100", "101-500", "501-2000", "2001-5000", "5001-10000", "10001-20000", "20000以上"],
-        "r_urban_cons_middle":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_urban_cons_low":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_urban_cons_lower":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_urban_cons_secondhigh":           ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_urban_cons_high":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_edu_junior_middle":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_edu_doctor":                      ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_edu_specialty":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_edu_primary":                     ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_edu_college":                     ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_edu_postgraduate":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_edu_senior_middle":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_house_price79999":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_house_price59999":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_house_price39999":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_house_price19999":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_house_price9999":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_house_price7999":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_house_price4999":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_age_17":                          ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_age_24":                          ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_age_30":                          ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_age_35":                          ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_age_40":                          ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_age_45":                          ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_age_60":                          ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_age_over_60":                     ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_sex_woman":                       ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_sex_man":                         ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_catering_50":                     ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_catering_100":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_catering_150":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_catering_200":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_catering_500":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_catering_over_500":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_catering_times_2":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_catering_times_4":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_catering_times_6":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_catering_times_8":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_catering_times_10":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_catering_times_11":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_beijing":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_tianjing":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_hebei":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_shanxi":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_neimeng":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_liaoning":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_jilin":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_heilongjiang":             ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_shanghai":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_jiangsu":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_zhejiang":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_anhui":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_fujian":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_jiangix":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_shandong":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_henan":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_hubei":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_hunan":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_guangdong":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_hainan":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_sichuan":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_guizhou":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_yunnan":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_shan":                     ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_gansu":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_qinghai":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_guangxi":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_ningxia":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_xinjiang":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_xizang":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_chongqing":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_hongkong":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_macao":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_taiwan":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "r_native_other":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flow_num":                        ["0-100", "101-500", "501-2000", "2001-5000", "5001-10000", "10001-50000", "50001-100000", "100000以上"],
-        "f_holiday_flow_num":                ["0-100", "101-500", "501-2000", "2001-5000", "5001-10000", "10001-50000", "50001-100000", "100000以上"],
-        "f_workday_flow_num":                ["0-100", "101-500", "501-2000", "2001-5000", "5001-10000", "10001-50000", "50001-100000", "100000以上"],
-        "f_flowurban_cons_middle":           ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowurban_cons_low":              ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowurban_cons_lower":            ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowurban_cons_second_high":      ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowurban_cons_high":             ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowedu_junior_middle":           ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowedu_doctor":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowedu_specialty":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowedu_primary":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowedu_college":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowedu_postgraduate":            ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowedu_senior_middle":           ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowhouse_middle":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowhouse_low":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowhouse_lower":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowhouse_second_high":           ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowhouse_high":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowage_17":                      ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowage_24":                      ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowage_30":                      ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowage_35":                      ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowage_40":                      ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowage_45":                      ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowage_60":                      ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowage_over_60":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowsex_woman":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_flowsex_man":                     ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayurban_cons_middle":        ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayurban_cons_low":           ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayurban_cons_lower":         ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayurban_cons_secondhigh":    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayurban_cons_high":          ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayedu_junior_middle":        ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayedu_doctor":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayedu_specialty":            ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayedu_primary":              ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayedu_college":              ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayedu_postgraduate":         ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayedu_senior_middle":        ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayhouse_middle":             ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayhouse_low":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayhouse_lower":              ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayhouse_second_high":        ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayhouse_high":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayage_17":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayage_24":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayage_30":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayage_35":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayage_40":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayage_45":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayage_60":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidayage_over_60":              ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidaysex_woman":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_holidaysex_man":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_urban_cons_middle":       ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_urban_cons_low":          ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_urban_cons_lower":        ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_urban_cons_secondhigh":   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_urban_cons_high":         ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_edu_junior_middle":       ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_edu_doctor":              ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_edu_specialty":           ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_edu_primary":             ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_edu_college":             ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_edu_postgraduate":        ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_edu_senior_middle":       ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_house_middle":            ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_house_low":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_house_lower":             ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_house_second_high":       ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_house_high":              ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_age_17":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_age_24":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_age_30":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_age_35":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_age_40":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_age_45":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_age_60":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_age_over_60":             ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_sex_woman":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-        "f_workday_sex_man":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
-    }
+class CustConfig:  # retailer (customer)-side feature configuration for the GBDT-LR pipeline
+    FEATURE_COLUMNS = [  # raw input columns consumed by the pipeline
+        "BB_RETAIL_CUSTOMER_CODE",              # retailer code (identifier; absent from ONEHOT_CAT/CLEANING_RULES, so presumably not encoded — confirm)
+        "BB_RTL_CUST_MARKET_TYPE_NAME",         # retailer market type name (urban/rural network)
+        "BB_RTL_CUST_BUSINESS_TYPE_NAME",       # retail customer business-format name
+        "BB_RTL_CUST_CHAIN_FLAG",               # retailer chain-store flag (yes/no)
+        "MD04_MG_RTL_CUST_CREDITCLASS_NAME",    # retailer credit-rating name (AAA..D)
+        "MD04_DIR_SAL_STORE_FLAG",              # directly-operated store flag (yes/no)
+        "BB_CUSTOMER_MANAGER_SCOPE_NAME",       # retailer business-scope name (vocabulary below is yes/no)
+        "BB_RTL_CUST_TERMINAL_LEVEL_NAME",      # retailer terminal tier name
+        "OPERATOR_EDU",                         # retail customer operator's education level
+        "STORE_AREA",                           # store business floor area (binned range strings)
+        "OPERATOR_AGE",                         # operator age (binned range strings)
+        "PRODUCT_INSALE_QTY",                   # number of SKUs currently on sale (binned range strings)
+    ]
+    # Per-column categorical vocabularies for one-hot encoding; numeric columns are pre-binned into range strings.
+    ONEHOT_CAT = {
+        "BB_RTL_CUST_MARKET_TYPE_NAME":           ["城网", "农网"],
+        "BB_RTL_CUST_BUSINESS_TYPE_NAME":         ["便利店", "超市", "烟草专业店", "娱乐服务类", "其他"],
+        "BB_RTL_CUST_CHAIN_FLAG":                 ["是", "否"],
+        "MD04_MG_RTL_CUST_CREDITCLASS_NAME":      ["AAA", "AA", "A", "B", "C", "D"],
+        "MD04_DIR_SAL_STORE_FLAG":                ["是", "否"],
+        "BB_CUSTOMER_MANAGER_SCOPE_NAME":         ["是", "否"],
+        "BB_RTL_CUST_TERMINAL_LEVEL_NAME":        ["普通终端", "一般现代终端", "合作终端", "加盟终端", "直营终端"],
+        "OPERATOR_EDU":                           [1, 2, 3, 4, 5, 6, 7, "无数据"],  # NOTE(review): int levels mixed with a string sentinel, yet CLEANING_RULES fills this column as type "str" — confirm the encoder does not str-cast and break matching of 1..7
+        "STORE_AREA":                             ["0-20", "21-50", "51-100", "101-150", "151-200", "201-300", "301-400", "401-600", "601-1000", "1001-2000", "2000以上"],
+        "OPERATOR_AGE":                           ["19-30", "31-40", "41-50", "51-65", "66-80", "80以上"],
+        "PRODUCT_INSALE_QTY":                     ["0-10", "11-20", "21-30", "31-40", "41-50", "51-60", 
+                                                   "61-70", "71-80", "81-90", "91-100", "101-110", "111-120", 
+                                                   "121-130", "131-140", "141-150", "151-160", "161-170", "171-180", 
+                                                   "181-190", "191-200", "201-210", "211-220", "221-230", "231-240", 
+                                                   "241-250", "251-260", "261-270", "271-280", "281-290", "291-350"],
+    }
+    # Missing-value handling: each rule fills NaN ("fillna") with a fixed per-column default before encoding.
+    CLEANING_RULES = {
+        "BB_RTL_CUST_MARKET_TYPE_NAME":         {"method": "fillna", "opt": "fill", "value": "城网", "type": "str"},
+        "BB_RTL_CUST_BUSINESS_TYPE_NAME":       {"method": "fillna", "opt": "fill", "value": "其他", "type": "str"},
+        "BB_RTL_CUST_CHAIN_FLAG":               {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
+        "MD04_MG_RTL_CUST_CREDITCLASS_NAME":    {"method": "fillna", "opt": "fill", "value": "B", "type": "str"},
+        "MD04_DIR_SAL_STORE_FLAG":              {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
+        "BB_CUSTOMER_MANAGER_SCOPE_NAME":       {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
+        "BB_RTL_CUST_TERMINAL_LEVEL_NAME":      {"method": "fillna", "opt": "fill", "value": "普通终端", "type": "str"},
+        "OPERATOR_EDU":                         {"method": "fillna", "opt": "fill", "value": "无数据", "type": "str"},
+        "STORE_AREA":                           {"method": "fillna", "opt": "fill", "value": "0-20", "type": "str"},
+        "OPERATOR_AGE":                         {"method": "fillna", "opt": "fill", "value": "31-40", "type": "str"},
+        "PRODUCT_INSALE_QTY":                   {"method": "fillna", "opt": "fill", "value": "0-10", "type": "str"},
+    }
+
+class ProductConfig:
+    """Feature configuration for the product (cigarette SKU) table.
+
+    FEATURE_COLUMNS lists the raw columns consumed from the product data;
+    ONEHOT_CAT fixes the closed category vocabulary used for one-hot
+    encoding; CLEANING_RULES imputes missing values (and casts to str)
+    before encoding so every value falls inside the vocabulary.
+    """
+
+    FEATURE_COLUMNS = [
+        "product_code",                  # product code
+        "factory_name",                  # manufacturer (industrial company name)
+        "brand_name",                    # brand name
+        "is_low_tar",                    # low-tar cigarette flag
+        "is_medium",                     # medium-diameter ("zhongzhi") cigarette flag
+        "is_tiny",                       # slim ("xizhi") cigarette flag
+        "is_coarse",                     # regular diameter (neither medium nor slim)
+        "is_exploding_beads",            # capsule ("exploding bead") cigarette flag
+        "is_abnormity",                  # irregular packaging flag
+        "is_cig",                        # cigar flag
+        "is_chuangxin",                  # innovative-category flag
+        "direct_retail_price",           # suggested retail price (binned, yuan)
+        "tbc_total_length",              # total stick length (binned)
+        "product_style",                 # packaging type
+    ]
+    
+    # Closed category vocabularies for one-hot encoding; unseen/missing values
+    # must be mapped into these (via CLEANING_RULES below) before encoding.
+    ONEHOT_CAT = {
+        "factory_name":                    ["安徽中烟", "澳门云福卷烟厂", "北欧烟草集团", "博格集团", "重庆中烟", "川渝中烟", "菲利普莫里斯亚洲", 
+                                            "福建中烟", "甘肃工业", "广东中烟", "广西中烟", "贵州中烟", "海南红塔", "河北中烟", "河南中烟", 
+                                            "黑龙江工业", "红塔辽宁烟草", "湖北中烟", "湖南中烟", "吉林工业", "家源开发股份有限公司", 
+                                            "嘉莱赫国际有限公司", "江苏中烟", "江西中烟", "凯德控股有限公司", "力量雪茄烟草有限公司", 
+                                            "南洋兄弟烟草股份", "内蒙古昆明卷烟", "日本烟草(香港)有限公司", "三宝麟国际集团", "厦门调拨站", 
+                                            "山东中烟", "山西昆明烟草", "陕西中烟", "上海烟草(集团)公司", "上海烟草公司", "深圳工业", "四川中烟", 
+                                            "特富意烟草(国际)", "雪茄客烟草国际贸易有限公司", "耀莱雪茄控股有限公司", "引领国际有限公司", 
+                                            "英飞烽香港有限公司", "英美烟草中国有限公司", "云南中烟", "浙江中烟", "中茄国际贸易有限公司", 
+                                            "中烟英美烟草国际有限公司", "株式会社 KT&G", "无"],
+        "brand_name":                      ["万宝路", "555", "骆驼(国外)", "大华", "娇子", "大青山", "龙凤呈祥", "黄鹤楼", "真龙", "七匹狼", 
+                                            "芙蓉王", "双喜(广)", "贵烟", "钓鱼台", "红双喜(南洋)", "云烟", "蒙特", "富恩特", "拉·加莱拉", "苏烟", 
+                                            "丹纳曼", "黄山", "南京", "利群", "金桥", "泰山", "好日子", "石林", "美登", "红河", "嘉辉", "七星", 
+                                            "都彭", "天下秀", "长城", "高希霸", "钻石", "金圣", "王冠雪茄", "黄金叶", "中南海", "长白山", "红旗渠", 
+                                            "建牌", "大卫杜夫", "罗密欧", "茂大", "红金龙", "天子", "熊猫", "双喜(深)", "大前门", "兰州", 
+                                            "红双喜(沪)", "雄狮", "广州", "红玫王", "黄果树", "红塔山", "福", "小熊猫", "爱喜", "蒙特利", "玉溪", 
+                                            "都宝", "麦克纽杜", "卡里罗", "中华", "牡丹(沪)", "阿里山", "顺百利", "白沙", "羊城", "白云", 
+                                            "特美思", "国宾", "帕特加", "比德奥", "冬虫夏草", "威龙(湛江)", "香格里拉", "红梅", "延安", 
+                                            "特富意", "石狮", "金香港", "好猫", "登喜路", "乐迪", "林海灵芝", "椰树", "北京", "大红鹰", "大丰收", 
+                                            "红双喜(武汉)", "五叶神", "狮", "优民", "将军", "遵义", "恒大", "飞马", "红三环", "芙蓉", "工字", 
+                                            "古田", "狮牌", "君力", "哈尔滨", "梦都", "香梅(阜阳)", "哈德门", "梅州", "红山茶", "猴王", "沙龙", 
+                                            "潘趣", "狮子牌", "上海", "红玫", "醒宝", "广州湾", "百乐门", "关塔那摩", "威斯", "五一", "寿百年", 
+                                            "人民大会堂", "土楼", "三沙", "西湖", "光明", "阿诗玛", "宝亨", "恭贺新禧", "长寿", "茶花", "迎客松", 
+                                            "龙烟", "金澳门", "宝岛", "多米尼加之花", "国喜", "金驼", "君特欧", "上游", "幸福", "春城", "吉庆", 
+                                            "黄山松", "黄金龙", "紫气东来", "彼亚赛", "银辉", "潮牌", "庐山", "三峡", "壹支笔", "双叶", "无"],
+        "is_low_tar":                      ["是", "否"],
+        "is_medium":                       ["是", "否"],
+        "is_tiny":                         ["是", "否"],
+        "is_coarse":                       ["是", "否"],
+        "is_exploding_beads":              ["是", "否"],
+        "is_abnormity":                    ["是", "否"],
+        "is_cig":                          ["是", "否"],
+        "is_chuangxin":                    ["是", "否"],
+        # NOTE(review): these price bins overlap (e.g. "0-10" vs "0-5",
+        # "10-30" vs "10-19.9") — presumably a coarse and a fine binning
+        # scheme were merged from the new data; confirm which scheme the
+        # upstream data actually emits before encoding.
+        "direct_retail_price":             ["0-10", "10-30", "31-50", "51-100", "10-19.9", "250-499.9", "200-249.9", 
+                                            "5-9.9", "0-5", "100-109.9", "150-199.9", "101-150", "120-129.9", "大于500", 
+                                            "20-29.9", "30-39.9", "140-149.9", "50-59.9", "40-49.9", "80-89.9", "60-69.9", 
+                                            "70-79.9", "大于150", "130-139.9", "90-99.9", "110-119.9"],
+        # NOTE(review): the 101-120 range has no bin — verify no product falls there.
+        "tbc_total_length":                ["小于79", "80-89", "90-100", "大于120"],
+        "product_style":                   ["条盒硬盒", "条包硬盒", "条盒软盒", "条包软盒", "铁盒", "其他"],
+    }
+    
+    # Missing-value imputation: fill NaN with a default category, cast to str.
+    CLEANING_RULES = {
+        "factory_name":          {"method": "fillna", "opt": "fill", "value": "无", "type": "str"},
+        "brand_name":            {"method": "fillna", "opt": "fill", "value": "无", "type": "str"},
+        "is_low_tar":            {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
+        "is_medium":             {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
+        "is_tiny":               {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
+        "is_coarse":             {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
+        "is_exploding_beads":    {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
+        "is_abnormity":          {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
+        "is_cig":                {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
+        "is_chuangxin":          {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
+        "direct_retail_price":   {"method": "fillna", "opt": "fill", "value": "0-5", "type": "str"},
+        "tbc_total_length":      {"method": "fillna", "opt": "fill", "value": "小于79", "type": "str"},
+        "product_style":         {"method": "fillna", "opt": "fill", "value": "其他", "type": "str"},
+    }
+
+class OrderConfig:
+    """Feature configuration for the order (sales) table.
+
+    Only the join keys (customer, product) and the sales quantity are used;
+    the derived period-over-period features below are currently disabled.
+    """
+
+    FEATURE_COLUMNS = [
+        "cust_code",                      # retailer (customer) code
+        "product_code",                   # brand/spec (product) code
+        "sale_qty",                       # sales quantity, packs
+        # "sale_qty_l",                     # sales quantity, previous period (disabled)
+        # "sale_qty_hb",                    # period-over-period sales ratio (disabled)
+        # "sale_amt",                       # sales amount, packs (disabled)
+    ]
+    
+
+class ShopConfig:
+    """Feature configuration for the shop-surroundings demographics table.
+
+    FEATURE_COLUMNS lists, per customer ("cust_code"), population statistics
+    around the store in four groups (marked by the r_/f_ prefixes below):
+    resident population (r_*), average daily flow (f_flow*), holiday flow
+    (f_holiday*), and workday flow (f_workday*). Apart from the raw counts,
+    columns are share-style metrics bucketed by the ONEHOT_CAT ranges below.
+    """
+
+    FEATURE_COLUMNS = [
+        "cust_code",                      # 客户编码
+        "r_home_num",                     # 常驻人口_居住人数
+        "r_work_num",                     # 常驻人口_工作人数
+        "r_resident_num",                 # 常驻人口_工作或居住人数
+        "r_urban_cons_middle",            # 常驻人口_城市消费水平_中
+        "r_urban_cons_low",               # 常驻人口_城市消费水平_低
+        "r_urban_cons_lower",             # 常驻人口_城市消费水平_次低
+        "r_urban_cons_secondhigh",        # 常驻人口_城市消费水平_次高
+        "r_urban_cons_high",              # 常驻人口_城市消费水平_高
+        "r_edu_junior_middle",            # 常驻人口_学历_初中
+        "r_edu_doctor",                   # 常驻人口_学历_博士
+        "r_edu_specialty",                # 常驻人口_学历_大专
+        "r_edu_primary",                  # 常驻人口_学历_小学
+        "r_edu_college",                  # 常驻人口_学历_本科
+        "r_edu_postgraduate",             # 常驻人口_学历_硕士
+        "r_edu_senior_middle",            # 常驻人口_学历_高中
+        "r_house_price79999",             # 常驻人口_居住社区房价_60000_79999
+        "r_house_price59999",             # 常驻人口_居住社区房价_40000_59999
+        "r_house_price39999",             # 常驻人口_居住社区房价_20000_39999
+        "r_house_price19999",             # 常驻人口_居住社区房价_10000_19999
+        "r_house_price9999",              # 常驻人口_居住社区房价_8000_9999
+        "r_house_price7999",              # 常驻人口_居住社区房价_5000_7999
+        "r_house_price4999",              # 常驻人口_居住社区房价_2000_4999
+        "r_age_17",                       # 常驻人口_年龄_0_17
+        "r_age_24",                       # 常驻人口_年龄_18_24
+        "r_age_30",                       # 常驻人口_年龄_25_30
+        "r_age_35",                       # 常驻人口_年龄_31_35
+        "r_age_40",                       # 常驻人口_年龄_36_40
+        "r_age_45",                       # 常驻人口_年龄_41_45
+        "r_age_60",                       # 常驻人口_年龄_46_60
+        "r_age_over_60",                  # 常驻人口_年龄_61以上
+        "r_sex_woman",                    # 常驻人口_性别_女
+        "r_sex_man",                      # 常驻人口_性别_男
+        "r_catering_50",                  # 常驻人口_餐饮消费水平_50
+        "r_catering_100",                 # 常驻人口_餐饮消费水平_100
+        "r_catering_150",                 # 常驻人口_餐饮消费水平_150
+        "r_catering_200",                 # 常驻人口_餐饮消费水平_200
+        "r_catering_500",                 # 常驻人口_餐饮消费水平_500
+        "r_catering_over_500",            # 常驻人口_餐饮消费水平_500以上
+        "r_catering_times_2",             # 常驻人口_餐饮消费频次_1_2
+        "r_catering_times_4",             # 常驻人口_餐饮消费频次_2_4
+        "r_catering_times_6",             # 常驻人口_餐饮消费频次_4_6
+        "r_catering_times_8",             # 常驻人口_餐饮消费频次_6_8
+        "r_catering_times_10",            # 常驻人口_餐饮消费频次_8_10
+        "r_catering_times_11",            # 常驻人口_餐饮消费频次_11以上
+        # NOTE(review): "r_native_tianjing" / "r_native_jiangix" look like
+        # misspellings of Tianjin / Jiangxi, but they are used consistently
+        # here and in ONEHOT_CAT — they must match the upstream column names,
+        # so do not "fix" them without checking the source table.
+        "r_native_beijing",               # 常驻人口_家乡地_北京市
+        "r_native_tianjing",              # 常驻人口_家乡地_天津市
+        "r_native_hebei",                 # 常驻人口_家乡地_河北省
+        "r_native_shanxi",                # 常驻人口_家乡地_山西省
+        "r_native_neimeng",               # 常驻人口_家乡地_内蒙古
+        "r_native_liaoning",              # 常驻人口_家乡地_辽宁省
+        "r_native_jilin",                 # 常驻人口_家乡地_吉林省
+        "r_native_heilongjiang",          # 常驻人口_家乡地_黑龙江省
+        "r_native_shanghai",              # 常驻人口_家乡地_上海市
+        "r_native_jiangsu",               # 常驻人口_家乡地_江苏省
+        "r_native_zhejiang",              # 常驻人口_家乡地_浙江省
+        "r_native_anhui",                 # 常驻人口_家乡地_安徽省
+        "r_native_fujian",                # 常驻人口_家乡地_福建省
+        "r_native_jiangix",               # 常驻人口_家乡地_江西省
+        "r_native_shandong",              # 常驻人口_家乡地_山东省
+        "r_native_henan",                 # 常驻人口_家乡地_河南省
+        "r_native_hubei",                 # 常驻人口_家乡地_湖北省
+        "r_native_hunan",                 # 常驻人口_家乡地_湖南省
+        "r_native_guangdong",             # 常驻人口_家乡地_广东省
+        "r_native_hainan",                # 常驻人口_家乡地_海南省
+        "r_native_sichuan",               # 常驻人口_家乡地_四川省
+        "r_native_guizhou",               # 常驻人口_家乡地_贵州省
+        "r_native_yunnan",                # 常驻人口_家乡地_云南省
+        "r_native_shan",                  # 常驻人口_家乡地_陕西省
+        "r_native_gansu",                 # 常驻人口_家乡地_甘肃省
+        "r_native_qinghai",               # 常驻人口_家乡地_青海省
+        "r_native_guangxi",               # 常驻人口_家乡地_广西壮族自治区
+        "r_native_ningxia",               # 常驻人口_家乡地_宁夏回族自治区
+        "r_native_xinjiang",              # 常驻人口_家乡地_新疆维吾尔自治区
+        "r_native_xizang",                # 常驻人口_家乡地_西藏自治区
+        "r_native_chongqing",             # 常驻人口_家乡地_重庆市
+        "r_native_hongkong",              # 常驻人口_家乡地_香港
+        "r_native_macao",                 # 常驻人口_家乡地_澳门
+        "r_native_taiwan",                # 常驻人口_家乡地_台湾
+        "r_native_other",                 # 常驻人口_家乡地_其它
+        "f_flow_num",                     # 流动人口_工作日_日均流动人口数量
+        "f_holiday_flow_num",             # 流动人口_节假日_日均流动人口数量
+        "f_workday_flow_num",             # 流动人口_日均流动人口数量
+        "f_flowurban_cons_middle",        # 日均流动_城市消费水平_中
+        "f_flowurban_cons_low",           # 日均流动_城市消费水平_低
+        "f_flowurban_cons_lower",         # 日均流动_城市消费水平_次低
+        "f_flowurban_cons_second_high",   # 日均流动_城市消费水平_次高
+        "f_flowurban_cons_high",          # 日均流动_城市消费水平_高
+        "f_flowedu_junior_middle",        # 日均流动_学历_初中
+        "f_flowedu_doctor",               # 日均流动_学历_博士
+        "f_flowedu_specialty",            # 日均流动_学历_大专
+        "f_flowedu_primary",              # 日均流动_学历_小学
+        "f_flowedu_college",              # 日均流动_学历_本科
+        "f_flowedu_postgraduate",         # 日均流动_学历_硕士
+        "f_flowedu_senior_middle",        # 日均流动_学历_高中
+        "f_flowhouse_middle",             # 日均流动_居住社区房价_中
+        "f_flowhouse_low",                # 日均流动_居住社区房价_低
+        "f_flowhouse_lower",              # 日均流动_居住社区房价_次低
+        "f_flowhouse_second_high",        # 日均流动_居住社区房价_次高
+        "f_flowhouse_high",               # 日均流动_居住社区房价_高
+        "f_flowage_17",                   # 日均流动_年龄_0_17
+        "f_flowage_24",                   # 日均流动_年龄_18_24
+        "f_flowage_30",                   # 日均流动_年龄_25_30
+        "f_flowage_35",                   # 日均流动_年龄_31_35
+        "f_flowage_40",                   # 日均流动_年龄_36_40
+        "f_flowage_45",                   # 日均流动_年龄_41_45
+        "f_flowage_60",                   # 日均流动_年龄_46_60
+        "f_flowage_over_60",              # 日均流动_年龄_61以上
+        "f_flowsex_woman",                # 日均流动_性别_女
+        "f_flowsex_man",                  # 日均流动_性别_男
+        "f_holidayurban_cons_middle",     # 节假日流动_城市消费水平_中
+        "f_holidayurban_cons_low",        # 节假日流动_城市消费水平_低
+        "f_holidayurban_cons_lower",      # 节假日流动_城市消费水平_次低
+        "f_holidayurban_cons_secondhigh", # 节假日流动_城市消费水平_次高
+        "f_holidayurban_cons_high",       # 节假日流动_城市消费水平_高
+        "f_holidayedu_junior_middle",     # 节假日流动_学历_初中
+        "f_holidayedu_doctor",            # 节假日流动_学历_博士
+        "f_holidayedu_specialty",         # 节假日流动_学历_大专
+        "f_holidayedu_primary",           # 节假日流动_学历_小学
+        "f_holidayedu_college",           # 节假日流动_学历_本科
+        "f_holidayedu_postgraduate",      # 节假日流动_学历_硕士
+        "f_holidayedu_senior_middle",     # 节假日流动_学历_高中
+        "f_holidayhouse_middle",          # 节假日流动_居住社区房价_中
+        "f_holidayhouse_low",             # 节假日流动_居住社区房价_低
+        "f_holidayhouse_lower",           # 节假日流动_居住社区房价_次低
+        "f_holidayhouse_second_high",     # 节假日流动_居住社区房价_次高
+        "f_holidayhouse_high",            # 节假日流动_居住社区房价_高
+        "f_holidayage_17",                # 节假日流动_年龄_0_17
+        "f_holidayage_24",                # 节假日流动_年龄_18_24
+        "f_holidayage_30",                # 节假日流动_年龄_25_30
+        "f_holidayage_35",                # 节假日流动_年龄_31_35
+        "f_holidayage_40",                # 节假日流动_年龄_36_40
+        "f_holidayage_45",                # 节假日流动_年龄_41_45
+        "f_holidayage_60",                # 节假日流动_年龄_46_60
+        "f_holidayage_over_60",           # 节假日流动_年龄_61以上
+        "f_holidaysex_woman",             # 节假日流动_性别_女
+        "f_holidaysex_man",               # 节假日流动_性别_男
+        "f_workday_urban_cons_middle",    # 工作日流动_城市消费水平_中
+        "f_workday_urban_cons_low",       # 工作日流动_城市消费水平_低
+        "f_workday_urban_cons_lower",     # 工作日流动_城市消费水平_次低
+        "f_workday_urban_cons_secondhigh",# 工作日流动_城市消费水平_次高
+        "f_workday_urban_cons_high",      # 工作日流动_城市消费水平_高
+        "f_workday_edu_junior_middle",    # 工作日流动_学历_初中
+        "f_workday_edu_doctor",           # 工作日流动_学历_博士
+        "f_workday_edu_specialty",        # 工作日流动_学历_大专
+        "f_workday_edu_primary",          # 工作日流动_学历_小学
+        "f_workday_edu_college",          # 工作日流动_学历_本科
+        "f_workday_edu_postgraduate",     # 工作日流动_学历_硕士
+        "f_workday_edu_senior_middle",    # 工作日流动_学历_高中
+        "f_workday_house_middle",         # 工作日流动_居住社区房价_中
+        "f_workday_house_low",            # 工作日流动_居住社区房价_低
+        "f_workday_house_lower",          # 工作日流动_居住社区房价_次低
+        "f_workday_house_second_high",    # 工作日流动_居住社区房价_次高
+        "f_workday_house_high",           # 工作日流动_居住社区房价_高
+        "f_workday_age_17",               # 工作日流动_年龄_0_17
+        "f_workday_age_24",               # 工作日流动_年龄_18_24
+        "f_workday_age_30",               # 工作日流动_年龄_25_30
+        "f_workday_age_35",               # 工作日流动_年龄_31_35
+        "f_workday_age_40",               # 工作日流动_年龄_36_40
+        "f_workday_age_45",               # 工作日流动_年龄_41_45
+        "f_workday_age_60",               # 工作日流动_年龄_46_60
+        "f_workday_age_over_60",          # 工作日流动_年龄_61以上
+        "f_workday_sex_woman",            # 工作日流动_性别_女
+        "f_workday_sex_man",              # 工作日流动_性别_男
+    ]
+    
+    ONEHOT_CAT = {
+        "r_home_num":                        ["0-100", "101-500", "501-2000", "2001-5000", "5001-10000", "10000以上"],
+        "r_work_num":                        ["0-100", "101-500", "501-2000", "2001-5000", "5001-10000", "10000以上"],
+        "r_resident_num":                    ["0-100", "101-500", "501-2000", "2001-5000", "5001-10000", "10001-20000", "20000以上"],
+        "r_urban_cons_middle":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_urban_cons_low":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_urban_cons_lower":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_urban_cons_secondhigh":           ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_urban_cons_high":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_edu_junior_middle":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_edu_doctor":                      ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_edu_specialty":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_edu_primary":                     ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_edu_college":                     ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_edu_postgraduate":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_edu_senior_middle":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_house_price79999":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_house_price59999":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_house_price39999":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_house_price19999":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_house_price9999":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_house_price7999":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_house_price4999":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_age_17":                          ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_age_24":                          ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_age_30":                          ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_age_35":                          ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_age_40":                          ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_age_45":                          ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_age_60":                          ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_age_over_60":                     ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_sex_woman":                       ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_sex_man":                         ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_catering_50":                     ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_catering_100":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_catering_150":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_catering_200":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_catering_500":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_catering_over_500":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_catering_times_2":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_catering_times_4":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_catering_times_6":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_catering_times_8":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_catering_times_10":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_catering_times_11":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_beijing":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_tianjing":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_hebei":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_shanxi":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_neimeng":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_liaoning":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_jilin":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_heilongjiang":             ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_shanghai":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_jiangsu":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_zhejiang":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_anhui":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_fujian":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_jiangix":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_shandong":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_henan":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_hubei":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_hunan":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_guangdong":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_hainan":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_sichuan":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_guizhou":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_yunnan":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_shan":                     ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_gansu":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_qinghai":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_guangxi":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_ningxia":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_xinjiang":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_xizang":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_chongqing":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_hongkong":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_macao":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_taiwan":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "r_native_other":                    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flow_num":                        ["0-100", "101-500", "501-2000", "2001-5000", "5001-10000", "10001-50000", "50001-100000", "100000以上"],
+        "f_holiday_flow_num":                ["0-100", "101-500", "501-2000", "2001-5000", "5001-10000", "10001-50000", "50001-100000", "100000以上"],
+        "f_workday_flow_num":                ["0-100", "101-500", "501-2000", "2001-5000", "5001-10000", "10001-50000", "50001-100000", "100000以上"],
+        "f_flowurban_cons_middle":           ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowurban_cons_low":              ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowurban_cons_lower":            ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowurban_cons_second_high":      ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowurban_cons_high":             ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowedu_junior_middle":           ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowedu_doctor":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowedu_specialty":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowedu_primary":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowedu_college":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowedu_postgraduate":            ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowedu_senior_middle":           ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowhouse_middle":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowhouse_low":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowhouse_lower":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowhouse_second_high":           ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowhouse_high":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowage_17":                      ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowage_24":                      ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowage_30":                      ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowage_35":                      ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowage_40":                      ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowage_45":                      ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowage_60":                      ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowage_over_60":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowsex_woman":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_flowsex_man":                     ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayurban_cons_middle":        ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayurban_cons_low":           ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayurban_cons_lower":         ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayurban_cons_secondhigh":    ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayurban_cons_high":          ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayedu_junior_middle":        ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayedu_doctor":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayedu_specialty":            ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayedu_primary":              ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayedu_college":              ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayedu_postgraduate":         ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayedu_senior_middle":        ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayhouse_middle":             ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayhouse_low":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayhouse_lower":              ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayhouse_second_high":        ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayhouse_high":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayage_17":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayage_24":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayage_30":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayage_35":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayage_40":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayage_45":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayage_60":                   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidayage_over_60":              ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidaysex_woman":                ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_holidaysex_man":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_urban_cons_middle":       ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_urban_cons_low":          ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_urban_cons_lower":        ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_urban_cons_secondhigh":   ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_urban_cons_high":         ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_edu_junior_middle":       ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_edu_doctor":              ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_edu_specialty":           ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_edu_primary":             ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_edu_college":             ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_edu_postgraduate":        ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_edu_senior_middle":       ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_house_middle":            ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_house_low":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_house_lower":             ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_house_second_high":       ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_house_high":              ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_age_17":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_age_24":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_age_30":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_age_35":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_age_40":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_age_45":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_age_60":                  ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_age_over_60":             ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_sex_woman":               ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+        "f_workday_sex_man":                 ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"],
+    }
+    
+    CLEANING_RULES = {
+        "r_home_num":                    {"method": "fillna", "opt": "fill", "value": "501-2000", "type": "str"},
+        "r_work_num":                    {"method": "fillna", "opt": "fill", "value": "501-2000", "type": "str"},
+        "r_resident_num":                {"method": "fillna", "opt": "fill", "value": "501-2000", "type": "str"},
+        "r_urban_cons_middle":           {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_urban_cons_low":              {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_urban_cons_lower":            {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_urban_cons_secondhigh":       {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_urban_cons_high":             {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_edu_junior_middle":           {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_edu_doctor":                  {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_edu_specialty":               {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_edu_primary":                 {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_edu_college":                 {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_edu_postgraduate":            {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_edu_senior_middle":           {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_house_price79999":            {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_house_price59999":            {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_house_price39999":            {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_house_price19999":            {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_house_price9999":             {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_house_price7999":             {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_house_price4999":             {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_age_17":                      {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_age_24":                      {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_age_30":                      {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_age_35":                      {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_age_40":                      {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_age_45":                      {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_age_60":                      {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_age_over_60":                 {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_sex_woman":                   {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_sex_man":                     {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_catering_50":                 {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_catering_100":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_catering_150":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_catering_200":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_catering_500":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_catering_over_500":           {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_catering_times_2":            {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_catering_times_4":            {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_catering_times_6":            {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_catering_times_8":            {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_catering_times_10":           {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_catering_times_11":           {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_beijing":              {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_tianjing":             {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_hebei":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_shanxi":               {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_neimeng":              {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_liaoning":             {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_jilin":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_heilongjiang":         {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_shanghai":             {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_jiangsu":              {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_zhejiang":             {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_anhui":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_fujian":               {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_jiangix":              {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_shandong":             {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_henan":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_hubei":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_hunan":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_guangdong":            {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_hainan":               {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_sichuan":              {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_guizhou":              {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_yunnan":               {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_shan":                 {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_gansu":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_qinghai":              {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_guangxi":              {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_ningxia":              {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_xinjiang":             {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_xizang":               {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_chongqing":            {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_hongkong":             {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_macao":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_taiwan":               {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "r_native_other":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flow_num":                    {"method": "fillna", "opt": "fill", "value": "2001-5000", "type": "str"},
+        "f_holiday_flow_num":            {"method": "fillna", "opt": "fill", "value": "2001-5000", "type": "str"},
+        "f_workday_flow_num":            {"method": "fillna", "opt": "fill", "value": "2001-5000", "type": "str"},
+        "f_flowurban_cons_middle":       {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowurban_cons_low":          {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowurban_cons_lower":        {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowurban_cons_second_high":  {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowurban_cons_high":         {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowedu_junior_middle":       {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowedu_doctor":              {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowedu_specialty":           {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowedu_primary":             {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowedu_college":             {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowedu_postgraduate":        {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowedu_senior_middle":       {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowhouse_middle":            {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowhouse_low":               {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowhouse_lower":             {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowhouse_second_high":       {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowhouse_high":              {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowage_17":                  {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowage_24":                  {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowage_30":                  {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowage_35":                  {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowage_40":                  {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowage_45":                  {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowage_60":                  {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowage_over_60":             {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowsex_woman":               {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_flowsex_man":                 {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayurban_cons_middle":    {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayurban_cons_low":       {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayurban_cons_lower":     {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayurban_cons_secondhigh": {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayurban_cons_high":      {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayedu_junior_middle":    {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayedu_doctor":           {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayedu_specialty":        {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayedu_primary":          {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayedu_college":          {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayedu_postgraduate":     {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayedu_senior_middle":    {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayhouse_middle":         {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayhouse_low":            {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayhouse_lower":          {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayhouse_second_high":    {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayhouse_high":           {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayage_17":               {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayage_24":               {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayage_30":               {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayage_35":               {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayage_40":               {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayage_45":               {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayage_60":               {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidayage_over_60":          {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidaysex_woman":            {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_holidaysex_man":              {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_urban_cons_middle":     {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_urban_cons_low":        {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_urban_cons_lower":      {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_urban_cons_secondhigh": {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_urban_cons_high":       {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_edu_junior_middle":     {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_edu_doctor":            {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_edu_specialty":         {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_edu_primary":           {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_edu_college":           {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_edu_postgraduate":      {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_edu_senior_middle":     {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_house_middle":          {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_house_low":             {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_house_lower":           {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_house_second_high":     {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_house_high":            {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_age_17":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_age_24":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_age_30":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_age_35":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_age_40":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_age_45":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_age_60":                {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_age_over_60":           {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_sex_woman":             {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+        "f_workday_sex_man":               {"method": "fillna", "opt": "fill", "value": "40-50", "type": "str"},
+    }
+
+class ImportanceFeaturesMap:
+    CUSTOM_FEATRUES_MAP = {
+        "BB_RTL_CUST_GRADE_NAME":                           "零售户分档名称",
+        "BB_RTL_CUST_MARKET_TYPE_NAME":                     "零售户市场类型名称",
+        "STORE_AREA":                                       "店铺经营面积",
+        "BB_RTL_CUST_BUSINESS_TYPE_NAME":                   "零售户业态名称",
+        "OPERATOR_EDU_LEVEL":                               "零售客户经营者文化程度",
+        "OPERATOR_AGE":                                     "经营者年龄",
+        "BB_RTL_CUST_CHAIN_FLAG":                           "零售户连锁标识",
+        "PRESENT_STAR_TERMINAL":                            "终端星级",
+        "MD04_MG_RTL_CUST_CREDITCLASS_NAME":                "零售户信用等级名称",
+        "MD04_DIR_SAL_STORE_FLAG":                          "直营店标识",
+        "BB_CUSTOMER_MANAGER_SCOPE_NAME":                   "零售户经营范围名称",
+        "PRODUCT_INSALE_QTY":                               "在销品规数",
+        # "CUST_INVESTMENT":                                  "店铺资源投入建设",
+    }
+    
+    PRODUCT_FEATRUES_MAP = {
+        # ProductConfig 字段映射
+        "direct_retail_price":                              "建议零售价",
+        "is_low_tar":                                       "是否低焦油烟",
+        "tar_qty":                                          "焦油含量",
+        "is_exploding_beads":                               "是否爆珠",
+        "is_shortbranch":                                   "是否短支烟",
+        "is_medium":                                        "是否中支烟",
+        "is_tiny":                                          "是否细支",
+        "product_style_code_name":                          "包装类型名称",
+        "org_is_abnormity":                                 "是否异形包装",
+        "is_chuangxin":                                     "是否创新品类",
+        "is_key_brand":                                     "是否重点品牌",
+        "foster_level_hy":                                  "是否行业共育品规",
+        "foster_level_sj":                                  "是否省级共育品规",
+        "is_cigar":                                         "是否雪茄型卷烟",
+        "co_qty":                                           "一氧化碳含量",
+        "tbc_total_length":                                 "烟支总长度",
+        "tbc_length":                                       "烟支长度",
+        "filter_length":                                    "滤嘴长度",
+    }
+    
+    ORDER_FEATURE_MAP = {
+        "MONTH6_SALE_QTY": "近半年销量(箱)",
+        "MONTH6_SALE_AMT": "近半年销售额(万元)",
+        "MONTH6_GROSS_PROFIT_RATE": "近半年毛利率",
+        "MONTH6_SALE_QTY_YOY": "销量同比",
+        "MONTH6_SALE_QTY_MOM": "销量环比",
+        "MONTH6_SALE_AMT_YOY": "销售额(购进额)同比",
+        "MONTH6_SALE_AMT_MOM": "销售额(购进额)环比",
+        "STOCK_QTY": "库存",
+        "ORDER_FULLORDR_RATE": "订足率",
+        "ORDER_FULLORDR_RATE_MOM": "订足率环比",
+        "FULL_FILLMENT_RATE": "订单满足率",
+        "CUSTOMER_REPURCHASE_RATE": "会员重购率(部分有会员)",
+        "NEW_PRODUCT_MEMBERS_QTY_SAMEPRICE_OCC": "新品订货量占同价类比重",
+        "DEMAND_RATE": "需求量满足率",
+        "LISTING_RATE": "品规上架率",
+        "PUT_MARKET_FINISH_RATE": "投放完成率",
+        "OUT_STOCK_DAYS": "断货天数(部分零售商有)",
+        "UNPACKING_RATE": "拆包率",
+        "city_uuid": "城市UUID"
+    }
+    
+    POS_FEATURE_MAP = {
+        "YLT_TURNOVER_RATE": "易灵通动销率",
+        "YLT_BAR_PACKAGE_SALE_OCC": "易灵通条包销售占比",
+        "POS_PACKAGE_PRICE": "POS机单包价格",
+    }
+    
+    SHOPING_FEATURES_MAP = {
+        # 商圈 字段映射
+        "r_home_num": "常驻人口_居住人数",
+        "r_work_num": "常驻人口_工作人数",
+        "r_resident_num": "常驻人口_工作或居住人数",
+        "r_urban_cons_middle": "常驻人口_城市消费水平_中",
+        "r_urban_cons_low": "常驻人口_城市消费水平_低",
+        "r_urban_cons_lower": "常驻人口_城市消费水平_次低",
+        "r_urban_cons_secondhigh": "常驻人口_城市消费水平_次高",
+        "r_urban_cons_high": "常驻人口_城市消费水平_高",
+        "r_edu_junior_middle": "常驻人口_学历_初中",
+        "r_edu_doctor": "常驻人口_学历_博士",
+        "r_edu_specialty": "常驻人口_学历_大专",
+        "r_edu_primary": "常驻人口_学历_小学",
+        "r_edu_college": "常驻人口_学历_本科",
+        "r_edu_postgraduate": "常驻人口_学历_硕士",
+        "r_edu_senior_middle": "常驻人口_学历_高中",
+        "r_house_price79999": "常驻人口_居住社区房价_60000_79999",
+        "r_house_price59999": "常驻人口_居住社区房价_40000_59999",
+        "r_house_price39999": "常驻人口_居住社区房价_20000_39999",
+        "r_house_price19999": "常驻人口_居住社区房价_10000_19999",
+        "r_house_price9999": "常驻人口_居住社区房价_8000_9999",
+        "r_house_price7999": "常驻人口_居住社区房价_5000_7999",
+        "r_house_price4999": "常驻人口_居住社区房价_2000_4999",
+        "r_age_17": "常驻人口_年龄_0_17",
+        "r_age_24": "常驻人口_年龄_18_24",
+        "r_age_30": "常驻人口_年龄_25_30",
+        "r_age_35": "常驻人口_年龄_31_35",
+        "r_age_40": "常驻人口_年龄_36_40",
+        "r_age_45": "常驻人口_年龄_41_45",
+        "r_age_60": "常驻人口_年龄_46_60",
+        "r_age_over_60": "常驻人口_年龄_61以上",
+        "r_sex_woman": "常驻人口_性别_女",
+        "r_sex_man": "常驻人口_性别_男",
+        "r_catering_50": "常驻人口_餐饮消费水平_50",
+        "r_catering_100": "常驻人口_餐饮消费水平_100",
+        "r_catering_150": "常驻人口_餐饮消费水平_150",
+        "r_catering_200": "常驻人口_餐饮消费水平_200",
+        "r_catering_500": "常驻人口_餐饮消费水平_500",
+        "r_catering_over_500": "常驻人口_餐饮消费水平_500以上",
+        "r_catering_times_2": "常驻人口_餐饮消费频次_1_2",
+        "r_catering_times_4": "常驻人口_餐饮消费频次_2_4",
+        "r_catering_times_6": "常驻人口_餐饮消费频次_4_6",
+        "r_catering_times_8": "常驻人口_餐饮消费频次_6_8",
+        "r_catering_times_10": "常驻人口_餐饮消费频次_8_10",
+        "r_catering_times_11": "常驻人口_餐饮消费频次_11以上",
+        "r_native_beijing": "常驻人口_家乡地_北京市",
+        "r_native_tianjing": "常驻人口_家乡地_天津市",
+        "r_native_hebei": "常驻人口_家乡地_河北省",
+        "r_native_shanxi": "常驻人口_家乡地_山西省",
+        "r_native_neimeng": "常驻人口_家乡地_内蒙古",
+        "r_native_liaoning": "常驻人口_家乡地_辽宁省",
+        "r_native_jilin": "常驻人口_家乡地_吉林省",
+        "r_native_heilongjiang": "常驻人口_家乡地_黑龙江省",
+        "r_native_shanghai": "常驻人口_家乡地_上海市",
+        "r_native_jiangsu": "常驻人口_家乡地_江苏省",
+        "r_native_zhejiang": "常驻人口_家乡地_浙江省",
+        "r_native_anhui": "常驻人口_家乡地_安徽省",
+        "r_native_fujian": "常驻人口_家乡地_福建省",
+        "r_native_jiangix": "常驻人口_家乡地_江西省",
+        "r_native_shandong": "常驻人口_家乡地_山东省",
+        "r_native_henan": "常驻人口_家乡地_河南省",
+        "r_native_hubei": "常驻人口_家乡地_湖北省",
+        "r_native_hunan": "常驻人口_家乡地_湖南省",
+        "r_native_guangdong": "常驻人口_家乡地_广东省",
+        "r_native_hainan": "常驻人口_家乡地_海南省",
+        "r_native_sichuan": "常驻人口_家乡地_四川省",
+        "r_native_guizhou": "常驻人口_家乡地_贵州省",
+        "r_native_yunnan": "常驻人口_家乡地_云南省",
+        "r_native_shan": "常驻人口_家乡地_陕西省",
+        "r_native_gansu": "常驻人口_家乡地_甘肃省",
+        "r_native_qinghai": "常驻人口_家乡地_青海省",
+        "r_native_guangxi": "常驻人口_家乡地_广西壮族自治区",
+        "r_native_ningxia": "常驻人口_家乡地_宁夏回族自治区",
+        "r_native_xinjiang": "常驻人口_家乡地_新疆维吾尔自治区",
+        "r_native_xizang": "常驻人口_家乡地_西藏自治区",
+        "r_native_chongqing": "常驻人口_家乡地_重庆市",
+        "r_native_hongkong": "常驻人口_家乡地_香港",
+        "r_native_macao": "常驻人口_家乡地_澳门",
+        "r_native_taiwan": "常驻人口_家乡地_台湾",
+        "r_native_other": "常驻人口_家乡地_其它",
+        "f_flow_num": "流动人口_日均流动人口数量",
+        "f_holiday_flow_num": "流动人口_节假日日均流动人口数量",
+        "f_workday_flow_num": "流动人口_工作日日均流动人口数量",
+        "f_flowurban_cons_middle": "日均流动_城市消费水平_中",
+        "f_flowurban_cons_low": "日均流动_城市消费水平_低",
+        "f_flowurban_cons_lower": "日均流动_城市消费水平_次低",
+        "f_flowurban_cons_second_high": "日均流动_城市消费水平_次高",
+        "f_flowurban_cons_high": "日均流动_城市消费水平_高",
+        "f_flowedu_junior_middle": "日均流动_学历_初中",
+        "f_flowedu_doctor": "日均流动_学历_博士",
+        "f_flowedu_specialty": "日均流动_学历_大专",
+        "f_flowedu_primary": "日均流动_学历_小学",
+        "f_flowedu_college": "日均流动_学历_本科",
+        "f_flowedu_postgraduate": "日均流动_学历_硕士",
+        "f_flowedu_senior_middle": "日均流动_学历_高中",
+        "f_flowhouse_middle": "日均流动_居住社区房价_中",
+        "f_flowhouse_low": "日均流动_居住社区房价_低",
+        "f_flowhouse_lower": "日均流动_居住社区房价_次低",
+        "f_flowhouse_second_high": "日均流动_居住社区房价_次高",
+        "f_flowhouse_high": "日均流动_居住社区房价_高",
+        "f_flowage_17": "日均流动_年龄_0_17",
+        "f_flowage_24": "日均流动_年龄_18_24",
+        "f_flowage_30": "日均流动_年龄_25_30",
+        "f_flowage_35": "日均流动_年龄_31_35",
+        "f_flowage_40": "日均流动_年龄_36_40",
+        "f_flowage_45": "日均流动_年龄_41_45",
+        "f_flowage_60": "日均流动_年龄_46_60",
+        "f_flowage_over_60": "日均流动_年龄_61以上",
+        "f_flowsex_woman": "日均流动_性别_女",
+        "f_flowsex_man": "日均流动_性别_男",
+        "f_holidayurban_cons_middle": "节假日流动_城市消费水平_中",
+        "f_holidayurban_cons_low": "节假日流动_城市消费水平_低",
+        "f_holidayurban_cons_lower": "节假日流动_城市消费水平_次低",
+        "f_holidayurban_cons_secondhigh": "节假日流动_城市消费水平_次高",
+        "f_holidayurban_cons_high": "节假日流动_城市消费水平_高",
+        "f_holidayedu_junior_middle": "节假日流动_学历_初中",
+        "f_holidayedu_doctor": "节假日流动_学历_博士",
+        "f_holidayedu_specialty": "节假日流动_学历_大专",
+        "f_holidayedu_primary": "节假日流动_学历_小学",
+        "f_holidayedu_college": "节假日流动_学历_本科",
+        "f_holidayedu_postgraduate": "节假日流动_学历_硕士",
+        "f_holidayedu_senior_middle": "节假日流动_学历_高中",
+        "f_holidayhouse_middle": "节假日流动_居住社区房价_中",
+        "f_holidayhouse_low": "节假日流动_居住社区房价_低",
+        "f_holidayhouse_lower": "节假日流动_居住社区房价_次低",
+        "f_holidayhouse_second_high": "节假日流动_居住社区房价_次高",
+        "f_holidayhouse_high": "节假日流动_居住社区房价_高",
+        "f_holidayage_17": "节假日流动_年龄_0_17",
+        "f_holidayage_24": "节假日流动_年龄_18_24",
+        "f_holidayage_30": "节假日流动_年龄_25_30",
+        "f_holidayage_35": "节假日流动_年龄_31_35",
+        "f_holidayage_40": "节假日流动_年龄_36_40",
+        "f_holidayage_45": "节假日流动_年龄_41_45",
+        "f_holidayage_60": "节假日流动_年龄_46_60",
+        "f_holidayage_over_60": "节假日流动_年龄_61以上",
+        "f_holidaysex_woman": "节假日流动_性别_女",
+        "f_holidaysex_man": "节假日流动_性别_男",
+        "f_workday_urban_cons_middle": "工作日流动_城市消费水平_中",
+        "f_workday_urban_cons_low": "工作日流动_城市消费水平_低",
+        "f_workday_urban_cons_lower": "工作日流动_城市消费水平_次低",
+        "f_workday_urban_cons_secondhigh": "工作日流动_城市消费水平_次高",
+        "f_workday_urban_cons_high": "工作日流动_城市消费水平_高",
+        "f_workday_edu_junior_middle": "工作日流动_学历_初中",
+        "f_workday_edu_doctor": "工作日流动_学历_博士",
+        "f_workday_edu_specialty": "工作日流动_学历_大专",
+        "f_workday_edu_primary": "工作日流动_学历_小学",
+        "f_workday_edu_college": "工作日流动_学历_本科",
+        "f_workday_edu_postgraduate": "工作日流动_学历_硕士",
+        "f_workday_edu_senior_middle": "工作日流动_学历_高中",
+        "f_workday_house_middle": "工作日流动_居住社区房价_中",
+        "f_workday_house_low": "工作日流动_居住社区房价_低",
+        "f_workday_house_lower": "工作日流动_居住社区房价_次低",
+        "f_workday_house_second_high": "工作日流动_居住社区房价_次高",
+        "f_workday_house_high": "工作日流动_居住社区房价_高",
+        "f_workday_age_17": "工作日流动_年龄_0_17",
+        "f_workday_age_24": "工作日流动_年龄_18_24",
+        "f_workday_age_30": "工作日流动_年龄_25_30",
+        "f_workday_age_35": "工作日流动_年龄_31_35",
+        "f_workday_age_40": "工作日流动_年龄_36_40",
+        "f_workday_age_45": "工作日流动_年龄_41_45",
+        "f_workday_age_60": "工作日流动_年龄_46_60",
+        "f_workday_age_over_60": "工作日流动_年龄_61以上",
+        "f_workday_sex_woman": "工作日流动_性别_女",
+        "f_workday_sex_man": "工作日流动_性别_男"
+    }

+ 510 - 510
models/rank/data/config_ori.py

@@ -1,511 +1,511 @@
-class CustConfig:
-    FEATURE_COLUMNS = [
-        "BB_RETAIL_CUSTOMER_CODE",                     # 零售户代码
-        "BB_RTL_CUST_GRADE_NAME",                      # 零售户分档名称
-        "BB_RTL_CUST_MARKET_TYPE_NAME",                # 零售户市场类型名称
-        "STORE_AREA",                                  # 店铺经营面积
-        "BB_RTL_CUST_BUSINESS_TYPE_NAME",              # 零售户业态名称
-        "OPERATOR_EDU_LEVEL",                          # 零售客户经营者文化程
-        "OPERATOR_AGE",                                # 经营者年龄
-        "BB_RTL_CUST_CHAIN_FLAG",                      # 零售户连锁标识
-        "PRESENT_STAR_TERMINAL",                       # 终端星级
-        "MD04_MG_RTL_CUST_CREDITCLASS_NAME",           # 零售户信用等级名称
-        "MD04_DIR_SAL_STORE_FLAG",                     # 直营店标识
-        "BB_CUSTOMER_MANAGER_SCOPE_NAME",              # 零售户经营范围名称
-        "PRODUCT_INSALE_QTY",                          # 在销品规数
-        # "CUST_INVESTMENT",                             # 店铺资源投入建设
-        
-        # "NEW_PRODUCT_MEMBERS_QTY_SAMEPRICE_OCC",       # 新品订货量占同价类比重
-        # "PRODUCT_LISTING_RATE",                        # 品规上架率
-        # "STOCKOUT_DAYS",                              # 断货天数
-        # "YLT_TURNOVER_RATE",                           # 易灵通动销率
-        # "YLT_BAR_PACKAGE_SALE_OCC",                    # 易灵通条包销售占比
-        # "UNPACKING_RATE",                              # 拆包率
-        
-        
-        # "BB_RTL_CUST_POSITION_TYPE_NAME",              # 零售户商圈类型名称
-        
-        # "BB_RTL_CUST_SUB_BUSI_PLACE_NAME",             # 零售户业态细分名称
-        
-        # "BB_RTL_CUST_TERMINAL_LEVEL_NAME",             # 零售户终端层级名称
-        # "BB_RTL_CUST_TERMINALEVEL_NAME",               # 零售户终端层级细分名称
-        # "MD04_MG_SAMPLE_CUST_FLAG",                    # 样本户标识
-        # "MD07_RTL_CUST_IS_SALE_LARGE_FLAG",            # 零售户大户标识
-        # "BB_RTL_CUST_OPERATE_METHOD_NAME",             # 零售户经营方式名称
-        # "BB_RTL_CUST_CGT_OPERATE_SCOPE_NAME",          # 零售户卷烟经营规模名称
-        
-        # "AVERAGE_CONSUMER_FLOW",                       # 月均消费人流
-        # "NEW_PRODUCT_MEMBERS_QTY",                     # 新品消费会员数量
-    ]
-    # 数据清洗规则
-    CLEANING_RULES = {
-        "BB_RTL_CUST_GRADE_NAME":                   {"method": "fillna", "opt": "fill", "value": "十五档", "type": "str"},
-        "BB_RTL_CUST_MARKET_TYPE_NAME":             {"method": "fillna", "opt": "fill", "value": "城网", "type": "str"},
-        "STORE_AREA":                               {"method": "fillna", "opt": "mean", "type": "num"},
-        "BB_RTL_CUST_BUSINESS_TYPE_NAME":           {"method": "fillna", "opt": "fill", "value": "其他", "type": "str"},
-        "OPERATOR_EDU_LEVEL":                       {"method": "fillna", "opt": "fill", "value": "无数据", "type": "str"},
-        "OPERATOR_AGE":                             {"method": "fillna", "opt": "mean", "type": "num"},
-        "BB_RTL_CUST_CHAIN_FLAG":                   {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
-        "PRESENT_STAR_TERMINAL":                    {"method": "fillna", "opt": "fill", "value": "非星级", "type": "str"},
-        "MD04_MG_RTL_CUST_CREDITCLASS_NAME":        {"method": "fillna", "opt": "fill", "value": "B", "type": "str"},
-        "MD04_DIR_SAL_STORE_FLAG":                  {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
-        "BB_CUSTOMER_MANAGER_SCOPE_NAME":           {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
-        "PRODUCT_INSALE_QTY":                       {"method": "fillna", "opt": "mean", "type": "num"},
-        # "CUST_INVESTMENT":                          {"method": "fillna", "opt": "fill", "type": 0}
-        
-        
-        # "NEW_PRODUCT_MEMBERS_QTY_SAMEPRICE_OCC":    {"method": "fillna", "opt": "mean", "type": "num"},
-        # "PRODUCT_LISTING_RATE":                     {"method": "fillna", "opt": "mean", "type": "num"},
-        # "STOCKOUT_DAYS":                            {"method": "fillna", "opt": "mean", "type": "num"},
-        # "YLT_TURNOVER_RATE":                        {"method": "fillna", "opt": "mean", "type": "num"},
-        # "NEW_PRODUCT_MEMBERS_QTY":                  {"method": "fillna", "opt": "mean", "type": "num"},
-        # "PRODUCT_INSALE_QTY":                       {"method": "fillna", "opt": "mean", "type": "num"},
-        # "UNPACKING_RATE":                           {"method": "fillna", "opt": "mean", "type": "num"},
-        
-        
-        
-        
-        # "BB_RTL_CUST_POSITION_TYPE_NAME":           {"method": "fillna", "opt": "fill", "value": "其他", "type": "str"},
-        # "BB_RTL_CUST_SUB_BUSI_PLACE_NAME":          {"method": "fillna", "opt": "fill", "value": "其他", "type": "str"},
-        # "BB_RTL_CUST_TERMINALEVEL_NAME":          {"method": "fillna", "opt": "replace", "value": "BB_RTL_CUST_TERMINAL_LEVEL_NAME", "type": "str"},
-        # "MD04_MG_SAMPLE_CUST_FLAG":                 {"method": "fillna", "value": "N", "opt": "fill"},
-        # "MD07_RTL_CUST_IS_SALE_LARGE_FLAG":         {"method": "fillna", "value": "N", "opt": "fill"},
-        # "BB_RTL_CUST_CGT_OPERATE_SCOPE_NAME":       {"method": "fillna", "value": "中", "opt": "fill"},
-    }
-    
-    ONEHOT_CAT = {
-        "BB_RTL_CUST_GRADE_NAME":                   ['一档', '二档', '三档', '四档', '五档', '六档', '七档', '八档', '九档', '十档', '十一档', '十二档', 
-                                                    '十三档', '十四档', '十五档', '十六档', '十七档', '十八档', '十九档', '二十档', '二十一档', '二十二档', 
-                                                    '二十三档', '二十四档', '二十五档', '二十六档', '二十七档', '二十八档', '二十九档', '三十档'],
-        "BB_RTL_CUST_MARKET_TYPE_NAME":             ["城网", "农网"],
-        "BB_RTL_CUST_BUSINESS_TYPE_NAME":           ["便利店", "超市", "烟草专业店", "娱乐服务类", "其他"],
-        "OPERATOR_EDU_LEVEL":                       [1, 2, 3, 4, 5, 6, 7, "无数据"],
-        "BB_RTL_CUST_CHAIN_FLAG":                   ["是", "否"],
-        "PRESENT_STAR_TERMINAL":                    ["一星", "二星", "三星", "四星", "五星", "非星级"],
-        "MD04_MG_RTL_CUST_CREDITCLASS_NAME":        ["AAA", "AA", "A", "B", "C", "D"],
-        "MD04_DIR_SAL_STORE_FLAG":                  ["是", "否"],
-        "BB_CUSTOMER_MANAGER_SCOPE_NAME":           ["是", "否"],
-        
-        
-        
-        # "BB_RTL_CUST_POSITION_TYPE_NAME":           ["居民区", "商业娱乐区", "交通枢纽区", "旅游景区", "工业区", "集贸区", "院校学区", "办公区", "其他"]
-    }
-    
-class ProductConfig:
-    FEATURE_COLUMNS = [
-        "product_code",                                # 商品编码
-        "direct_retail_price",                         # 建议零售价
-        "is_low_tar",                                  # 是否低焦油烟
-        "tar_qty",                                     # 焦油含量
-        "is_exploding_beads",                          # 是否爆珠
-        "is_shortbranch",                              # 是否短支烟
-        "is_medium",                                   # 是否中支烟
-        "is_tiny",                                     # 是否细支
-        "product_style_code_name",                     # 包装类型名称
-        "org_is_abnormity",                            # 是否异形包装
-        "is_chuangxin",                                # 是否创新品类
-        "is_key_brand",                                # 是否重点品牌
-        "foster_level_hy",                             # 是否行业共育品规
-        "foster_level_sj",                             # 是否省级共育品规
-        "is_cigar",                                    # 是否雪茄型卷烟
-        "co_qty",                                      # 一氧化碳含量
-        "tbc_total_length",                            # 烟支总长度
-        "tbc_length",                                  # 烟支长度
-        "filter_length",                               # 滤嘴长度
-        
-
-        
-        # "adjust_price",                                # 含税调拨价
-        # "notwithtax_adjust_price",                     # 不含税调拨价
-        # "whole_sale_price",                            # 统一批发价
-        # "allot_price",                                 # 调拨价
-        # "direct_whole_price",                          # 批发指导价
-        # "retail_price",                                # 零售价
-        # "price_type_name",                             # 卷烟价类名称
-        # "gear_type_name",                              # 卷烟档位名称
-        # "category_type_name",                          # 卷烟品类名称
-        # "is_high_level",                               # 是否高端烟
-        # "is_upscale_level",                            # 是否高端烟不含高价
-        # "is_high_price",                               # 是否高价烟
-        # "is_low_price",                                # 是否低价烟
-        # "is_encourage",                                # 是否全国鼓励品牌
-        # "is_abnormity",                                # 是否异形包装
-        # "is_intake",                                   # 是否进口烟
-        # "is_short",                                    # 是否紧俏品牌
-        # "is_ordinary_price_type",                      # 是否普一类烟
-        # "source_type",                                 # 来源类型
-        # "chinese_mix",                                 # 中式混合
-        # "sub_price_type_name",                         # 细分卷烟价类名称
-    ]
-    
-    CLEANING_RULES = {
-        "direct_retail_price":                         {"method": "fillna", "opt": "mean", "type": "num"},
-        "is_low_tar":                                  {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        "tar_qty":                                     {"method": "fillna", "opt": "mean", "type": "num"},
-        "is_exploding_beads":                          {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        "is_shortbranch":                              {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        "is_medium":                                   {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        "is_tiny":                                     {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        "product_style_code_name":                     {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
-        "org_is_abnormity":                            {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        "is_chuangxin":                                {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        "is_key_brand":                                {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        "foster_level_hy":                             {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        "foster_level_sj":                             {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        "is_cigar":                                    {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        "co_qty":                                      {"method": "fillna", "opt": "mean", "type": "num"},
-        "tbc_total_length":                            {"method": "fillna", "opt": "mean", "type": "num"},
-        "tbc_length":                                  {"method": "fillna", "opt": "mean", "type": "num"},
-        "filter_length":                               {"method": "fillna", "opt": "mean", "type": "num"},
-        
-        
-        # "adjust_price":                                {"method": "fillna", "opt": "mean", "type": "num"},
-        # "notwithtax_adjust_price":                     {"method": "fillna", "opt": "mean", "type": "num"},
-        # "whole_sale_price":                            {"method": "fillna", "opt": "mean", "type": "num"},
-        # "allot_price":                                 {"method": "fillna", "opt": "fill", "type": "num", "value": 0.0},
-        # "direct_whole_price":                          {"method": "fillna", "opt": "mean", "type": "num"},
-        # "retail_price":                                {"method": "fillna", "opt": "mean", "type": "num"},
-        # "price_type_name":                             {"method": "fillna", "opt": "fill", "type": "str", "value": "一类烟"},
-        # "gear_type_name":                              {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
-        # "category_type_name":                          {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
-        # "is_high_level":                               {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        # "is_upscale_level":                            {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        # "is_high_price":                               {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        # "is_low_price":                                {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        # "is_encourage":                                {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        # "is_abnormity":                                {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        # "is_intake":                                   {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        # "is_short":                                    {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        # "is_ordinary_price_type":                      {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        # "source_type":                                 {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
-        # "chinese_mix":                                 {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
-        # "sub_price_type_name":                         {"method": "fillna", "opt": "fill", "type": "str", "value": "普一类烟"},
-    }
-    
-
-    ONEHOT_CAT = {
-        "is_low_tar":                                  ["是", "否"],
-        "is_exploding_beads":                          ["是", "否"],
-        "is_shortbranch":                              ["是", "否"],
-        "is_medium":                                   ["是", "否"],
-        "is_tiny":                                     ["是", "否"],
-        "product_style_code_name":                     ["条盒硬盒", "条包硬盒", "条盒软盒", "条包软盒", "铁盒", "其他"],
-        "org_is_abnormity":                            ["是", "否"],
-        "is_chuangxin":                                ["是", "否"],
-        "is_key_brand":                                ["是", "否"],
-        "foster_level_hy":                             ["是", "否"],
-        "foster_level_sj":                             ["是", "否"],
-        "is_cigar":                                    ["是", "否"],
-        
-        
-        
-        # "price_type_name":                             ["一类烟", "二类烟", "三类烟", "四类烟", "五类烟", "无价类"],
-        # "gear_type_name":                              ["第一档位", "第二档位", "第三档位", "第四档位", "第五档位", "第六档位", "第七档位", "第八档位", "其他"],
-        # "category_type_name":                          ["第1品类", "第2品类", "第3品类", "第4品类", "第5品类", "第6品类", "第7品类", 
-        #                                                 "第8品类", "第9品类", "第10品类", "第11品类", "第12品类", "第13品类", "其他"],
-        # "is_high_level":                               ["是", "否"],
-        # "is_upscale_level":                            ["是", "否"],
-        # "is_high_price":                               ["是", "否"],
-        # "is_low_price":                                ["是", "否"],
-        # "is_encourage":                                ["是", "否"],
-        # "is_abnormity":                                ["是", "否"],
-        # "is_intake":                                   ["是", "否"],
-        # "is_short":                                    ["是", "否"],
-        # "is_ordinary_price_type":                      ["是", "否"],
-        # "source_type":                                 ["是", "否"],
-        # "chinese_mix":                                 ["是", "否"],
-        # "sub_price_type_name":                         ["高端烟", "高价位烟", "普一类烟", "二类烟", "三类烟", "四类烟", "五类烟", "无价类"],
-    }
-    
-class OrderConfig:
-    FEATURE_COLUMNS = [
-        "BB_RETAIL_CUSTOMER_CODE",                          # 零售户编码
-        "PRODUCT_CODE",                                     # 卷烟编码
-        "MONTH6_SALE_QTY",                                  # 近半年销量(箱)
-        "MONTH6_SALE_AMT",                                  # 近半年销售额(万元)
-        "MONTH6_GROSS_PROFIT_RATE",                         # 近半年毛利率
-        "MONTH6_SALE_QTY_YOY",                              # 销售量同比
-        "MONTH6_SALE_QTY_MOM",                              # 销售量环比
-        "MONTH6_SALE_AMT_YOY",                              # 销售额(购进额)同比
-        "MONTH6_SALE_AMT_MOM",                              # 销售额(购进额)环比
-        "STOCK_QTY",                                        # 库存
-        "ORDER_FULLORDR_RATE",                              # 订足率
-        "FULL_FILLMENT_RATE",                               # 订单满足率
-        "ORDER_FULLORDR_RATE_MOM",                          # 订足率环比
-        "CUSTOMER_REPURCHASE_RATE",                         # 会员重购率  
-        "NEW_PRODUCT_MEMBERS_QTY_SAMEPRICE_OCC",            # 新品订货量占同价类比重/decimal(18,6)
-        "DEMAND_RATE",                                      # 需求量满足率
-        "LISTING_RATE",                                     # 品规商上架率
-        "PUT_MARKET_FINISH_RATE",                           # 投放完成率
-        "OUT_STOCK_DAYS",                                   # 断货天数
-        "YLT_TURNOVER_RATE",                                # 易灵通动销率
-        "YLT_BAR_PACKAGE_SALE_OCC",                         # 易灵通调包销售占比
-        "UNPACKING_RATE",                                   # 拆包率
-        "POS_PACKAGE_PRICE",                                # pos机单包价格
-    ]
-    
-    CLEANING_FEATURES = [
-        "MONTH6_SALE_QTY",
-        "MONTH6_SALE_AMT",
-        "MONTH6_GROSS_PROFIT_RATE",
-        "MONTH6_SALE_QTY_YOY",
-        "MONTH6_SALE_QTY_MOM",
-        "MONTH6_SALE_AMT_YOY",
-        "MONTH6_SALE_AMT_MOM",
-        "STOCK_QTY",
-        "ORDER_FULLORDR_RATE",
-        "FULL_FILLMENT_RATE",
-        "ORDER_FULLORDR_RATE_MOM",
-        "CUSTOMER_REPURCHASE_RATE",
-        "NEW_PRODUCT_MEMBERS_QTY_SAMEPRICE_OCC",
-        "DEMAND_RATE",
-        "LISTING_RATE",
-        "PUT_MARKET_FINISH_RATE",
-        "OUT_STOCK_DAYS",
-        "UNPACKING_RATE",
-    ]
-    
-    WEIGHTS = {
-        "MONTH6_SALE_QTY":                                  0.15,
-        "MONTH6_SALE_QTY_MOM":                              0.2,
-        "ORDER_FULLORDR_RATE":                              0.3,
-        "ORDER_FULLORDR_RATE_MOM":                          0.35,
-    }
-    
-    POSFEATURES = [
-        "YLT_TURNOVER_RATE","YLT_BAR_PACKAGE_SALE_OCC","POS_PACKAGE_PRICE"
-    ]
-    
-class ImportanceFeaturesMap:
-    CUSTOM_FEATRUES_MAP = {
-        "BB_RTL_CUST_GRADE_NAME":                           "零售户分档名称",
-        "BB_RTL_CUST_MARKET_TYPE_NAME":                     "零售户市场类型名称",
-        "STORE_AREA":                                       "店铺经营面积",
-        "BB_RTL_CUST_BUSINESS_TYPE_NAME":                   "零售户业态名称",
-        "OPERATOR_EDU_LEVEL":                               "零售客户经营者文化程",
-        "OPERATOR_AGE":                                     "经营者年龄",
-        "BB_RTL_CUST_CHAIN_FLAG":                           "零售户连锁标识",
-        "PRESENT_STAR_TERMINAL":                            "终端星级",
-        "MD04_MG_RTL_CUST_CREDITCLASS_NAME":                "零售户信用等级名称",
-        "MD04_DIR_SAL_STORE_FLAG":                          "直营店标识",
-        "BB_CUSTOMER_MANAGER_SCOPE_NAME":                   "零售户经营范围名称",
-        "PRODUCT_INSALE_QTY":                               "在销品规数",
-        # "CUST_INVESTMENT":                                  "店铺资源投入建设",
-    }
-    
-    PRODUCT_FEATRUES_MAP = {
-        # ProductConfig 字段映射
-        "direct_retail_price":                              "建议零售价",
-        "is_low_tar":                                       "是否低焦油烟",
-        "tar_qty":                                          "焦油含量",
-        "is_exploding_beads":                               "是否爆珠",
-        "is_shortbranch":                                   "是否短支烟",
-        "is_medium":                                        "是否中支烟",
-        "is_tiny":                                          "是否细支",
-        "product_style_code_name":                          "包装类型名称",
-        "org_is_abnormity":                                 "是否异形包装",
-        "is_chuangxin":                                     "是否创新品类",
-        "is_key_brand":                                     "是否重点品牌",
-        "foster_level_hy":                                  "是否行业共育品规",
-        "foster_level_sj":                                  "是否省级共育品规",
-        "is_cigar":                                         "是否雪茄型卷烟",
-        "co_qty":                                           "一氧化碳含量",
-        "tbc_total_length":                                 "烟支总长度",
-        "tbc_length":                                       "烟支长度",
-        "filter_length":                                    "滤嘴长度",
-    }
-    
-    ORDER_FEATURE_MAP = {
-        "MONTH6_SALE_QTY": "近半年销量(箱)",
-        "MONTH6_SALE_AMT": "近半年销售额(万元)",
-        "MONTH6_GROSS_PROFIT_RATE": "近半年毛利率",
-        "MONTH6_SALE_QTY_YOY": "销量同比",
-        "MONTH6_SALE_QTY_MOM": "销量环比",
-        "MONTH6_SALE_AMT_YOY": "销售额(购进额)同比",
-        "MONTH6_SALE_AMT_MOM": "销售额(购进额)环比",
-        "STOCK_QTY": "库存",
-        "ORDER_FULLORDR_RATE": "订足率",
-        "ORDER_FULLORDR_RATE_MOM": "订足率环比",
-        "FULL_FILLMENT_RATE": "订单满足率",
-        "CUSTOMER_REPURCHASE_RATE": "会员重购率(部分有会员)",
-        "NEW_PRODUCT_MEMBERS_QTY_SAMEPRICE_OCC": "新品订货量占同价类比重",
-        "DEMAND_RATE": "需求量满足率",
-        "LISTING_RATE": "品规上架率",
-        "PUT_MARKET_FINISH_RATE": "投放完成率",
-        "OUT_STOCK_DAYS": "断货天数(部分零售商有)",
-        "UNPACKING_RATE": "拆包率",
-        "city_uuid": "城市UUID"
-    }
-    
-    POS_FEATURE_MAP = {
-        "YLT_TURNOVER_RATE": "易灵通动销率",
-        "YLT_BAR_PACKAGE_SALE_OCC": "易灵通条包销售占比",
-        "POS_PACKAGE_PRICE": "POS机单包价格",
-    }
-    
-    SHOPING_FEATURES_MAP = {
-        # 商圈 字段映射
-        "r_home_num": "常驻人口_居住人数",
-        "r_work_num": "常驻人口_工作人数",
-        "r_resident_num": "常驻人口_工作或居住人数",
-        "r_urban_cons_middle": "常驻人口_城市消费水平_中",
-        "r_urban_cons_low": "常驻人口_城市消费水平_低",
-        "r_urban_cons_lower": "常驻人口_城市消费水平_次低",
-        "r_urban_cons_secondhigh": "常驻人口_城市消费水平_次高",
-        "r_urban_cons_high": "常驻人口_城市消费水平_高",
-        "r_edu_junior_middle": "常驻人口_学历_初中",
-        "r_edu_doctor": "常驻人口_学历_博士",
-        "r_edu_specialty": "常驻人口_学历_大专",
-        "r_edu_primary": "常驻人口_学历_小学",
-        "r_edu_college": "常驻人口_学历_本科",
-        "r_edu_postgraduate": "常驻人口_学历_硕士",
-        "r_edu_senior_middle": "常驻人口_学历_高中",
-        "r_house_price79999": "常驻人口_居住社区房价_60000_79999",
-        "r_house_price59999": "常驻人口_居住社区房价_40000_59999",
-        "r_house_price39999": "常驻人口_居住社区房价_20000_39999",
-        "r_house_price19999": "常驻人口_居住社区房价_10000_19999",
-        "r_house_price9999": "常驻人口_居住社区房价_8000_9999",
-        "r_house_price7999": "常驻人口_居住社区房价_5000_7999",
-        "r_house_price4999": "常驻人口_居住社区房价_2000_4999",
-        "r_age_17": "常驻人口_年龄_0_17",
-        "r_age_24": "常驻人口_年龄_18_24",
-        "r_age_30": "常驻人口_年龄_25_30",
-        "r_age_35": "常驻人口_年龄_31_35",
-        "r_age_40": "常驻人口_年龄_36_40",
-        "r_age_45": "常驻人口_年龄_41_45",
-        "r_age_60": "常驻人口_年龄_46_60",
-        "r_age_over_60": "常驻人口_年龄_61以上",
-        "r_sex_woman": "常驻人口_性别_女",
-        "r_sex_man": "常驻人口_性别_男",
-        "r_catering_50": "常驻人口_餐饮消费水平_50",
-        "r_catering_100": "常驻人口_餐饮消费水平_100",
-        "r_catering_150": "常驻人口_餐饮消费水平_150",
-        "r_catering_200": "常驻人口_餐饮消费水平_200",
-        "r_catering_500": "常驻人口_餐饮消费水平_500",
-        "r_catering_over_500": "常驻人口_餐饮消费水平_500以上",
-        "r_catering_times_2": "常驻人口_餐饮消费频次_1_2",
-        "r_catering_times_4": "常驻人口_餐饮消费频次_2_4",
-        "r_catering_times_6": "常驻人口_餐饮消费频次_4_6",
-        "r_catering_times_8": "常驻人口_餐饮消费频次_6_8",
-        "r_catering_times_10": "常驻人口_餐饮消费频次_8_10",
-        "r_catering_times_11": "常驻人口_餐饮消费频次_11以上",
-        "r_native_beijing": "常驻人口_家乡地_北京市",
-        "r_native_tianjing": "常驻人口_家乡地_天津市",
-        "r_native_hebei": "常驻人口_家乡地_河北省",
-        "r_native_shanxi": "常驻人口_家乡地_山西省",
-        "r_native_neimeng": "常驻人口_家乡地_内蒙古",
-        "r_native_liaoning": "常驻人口_家乡地_辽宁省",
-        "r_native_jilin": "常驻人口_家乡地_吉林省",
-        "r_native_heilongjiang": "常驻人口_家乡地_黑龙江省",
-        "r_native_shanghai": "常驻人口_家乡地_上海市",
-        "r_native_jiangsu": "常驻人口_家乡地_江苏省",
-        "r_native_zhejiang": "常驻人口_家乡地_浙江省",
-        "r_native_anhui": "常驻人口_家乡地_安徽省",
-        "r_native_fujian": "常驻人口_家乡地_福建省",
-        "r_native_jiangix": "常驻人口_家乡地_江西省",
-        "r_native_shandong": "常驻人口_家乡地_山东省",
-        "r_native_henan": "常驻人口_家乡地_河南省",
-        "r_native_hubei": "常驻人口_家乡地_湖北省",
-        "r_native_hunan": "常驻人口_家乡地_湖南省",
-        "r_native_guangdong": "常驻人口_家乡地_广东省",
-        "r_native_hainan": "常驻人口_家乡地_海南省",
-        "r_native_sichuan": "常驻人口_家乡地_四川省",
-        "r_native_guizhou": "常驻人口_家乡地_贵州省",
-        "r_native_yunnan": "常驻人口_家乡地_云南省",
-        "r_native_shan": "常驻人口_家乡地_陕西省",
-        "r_native_gansu": "常驻人口_家乡地_甘肃省",
-        "r_native_qinghai": "常驻人口_家乡地_青海省",
-        "r_native_guangxi": "常驻人口_家乡地_广西壮族自治区",
-        "r_native_ningxia": "常驻人口_家乡地_宁夏回族自治区",
-        "r_native_xinjiang": "常驻人口_家乡地_新疆维吾尔自治区",
-        "r_native_xizang": "常驻人口_家乡地_西藏自治区",
-        "r_native_chongqing": "常驻人口_家乡地_重庆市",
-        "r_native_hongkong": "常驻人口_家乡地_香港",
-        "r_native_macao": "常驻人口_家乡地_澳门",
-        "r_native_taiwan": "常驻人口_家乡地_台湾",
-        "r_native_other": "常驻人口_家乡地_其它",
-        "f_flow_num": "流动人口_日均流动人口数量",
-        "f_holiday_flow_num": "流动人口_节假日日均流动人口数量",
-        "f_workday_flow_num": "流动人口_工作日日均流动人口数量",
-        "f_flowurban_cons_middle": "日均流动_城市消费水平_中",
-        "f_flowurban_cons_low": "日均流动_城市消费水平_低",
-        "f_flowurban_cons_lower": "日均流动_城市消费水平_次低",
-        "f_flowurban_cons_second_high": "日均流动_城市消费水平_次高",
-        "f_flowurban_cons_high": "日均流动_城市消费水平_高",
-        "f_flowedu_junior_middle": "日均流动_学历_初中",
-        "f_flowedu_doctor": "日均流动_学历_博士",
-        "f_flowedu_specialty": "日均流动_学历_大专",
-        "f_flowedu_primary": "日均流动_学历_小学",
-        "f_flowedu_college": "日均流动_学历_本科",
-        "f_flowedu_postgraduate": "日均流动_学历_硕士",
-        "f_flowedu_senior_middle": "日均流动_学历_高中",
-        "f_flowhouse_middle": "日均流动_居住社区房价_中",
-        "f_flowhouse_low": "日均流动_居住社区房价_低",
-        "f_flowhouse_lower": "日均流动_居住社区房价_次低",
-        "f_flowhouse_second_high": "日均流动_居住社区房价_次高",
-        "f_flowhouse_high": "日均流动_居住社区房价_高",
-        "f_flowage_17": "日均流动_年龄_0_17",
-        "f_flowage_24": "日均流动_年龄_18_24",
-        "f_flowage_30": "日均流动_年龄_25_30",
-        "f_flowage_35": "日均流动_年龄_31_35",
-        "f_flowage_40": "日均流动_年龄_36_40",
-        "f_flowage_45": "日均流动_年龄_41_45",
-        "f_flowage_60": "日均流动_年龄_46_60",
-        "f_flowage_over_60": "日均流动_年龄_61以上",
-        "f_flowsex_woman": "日均流动_性别_女",
-        "f_flowsex_man": "日均流动_性别_男",
-        "f_holidayurban_cons_middle": "节假日流动_城市消费水平_中",
-        "f_holidayurban_cons_low": "节假日流动_城市消费水平_低",
-        "f_holidayurban_cons_lower": "节假日流动_城市消费水平_次低",
-        "f_holidayurban_cons_secondhigh": "节假日流动_城市消费水平_次高",
-        "f_holidayurban_cons_high": "节假日流动_城市消费水平_高",
-        "f_holidayedu_junior_middle": "节假日流动_学历_初中",
-        "f_holidayedu_doctor": "节假日流动_学历_博士",
-        "f_holidayedu_specialty": "节假日流动_学历_大专",
-        "f_holidayedu_primary": "节假日流动_学历_小学",
-        "f_holidayedu_college": "节假日流动_学历_本科",
-        "f_holidayedu_postgraduate": "节假日流动_学历_硕士",
-        "f_holidayedu_senior_middle": "节假日流动_学历_高中",
-        "f_holidayhouse_middle": "节假日流动_居住社区房价_中",
-        "f_holidayhouse_low": "节假日流动_居住社区房价_低",
-        "f_holidayhouse_lower": "节假日流动_居住社区房价_次低",
-        "f_holidayhouse_second_high": "节假日流动_居住社区房价_次高",
-        "f_holidayhouse_high": "节假日流动_居住社区房价_高",
-        "f_holidayage_17": "节假日流动_年龄_0_17",
-        "f_holidayage_24": "节假日流动_年龄_18_24",
-        "f_holidayage_30": "节假日流动_年龄_25_30",
-        "f_holidayage_35": "节假日流动_年龄_31_35",
-        "f_holidayage_40": "节假日流动_年龄_36_40",
-        "f_holidayage_45": "节假日流动_年龄_41_45",
-        "f_holidayage_60": "节假日流动_年龄_46_60",
-        "f_holidayage_over_60": "节假日流动_年龄_61以上",
-        "f_holidaysex_woman": "节假日流动_性别_女",
-        "f_holidaysex_man": "节假日流动_性别_男",
-        "f_workday_urban_cons_middle": "工作日流动_城市消费水平_中",
-        "f_workday_urban_cons_low": "工作日流动_城市消费水平_低",
-        "f_workday_urban_cons_lower": "工作日流动_城市消费水平_次低",
-        "f_workday_urban_cons_secondhigh": "工作日流动_城市消费水平_次高",
-        "f_workday_urban_cons_high": "工作日流动_城市消费水平_高",
-        "f_workday_edu_junior_middle": "工作日流动_学历_初中",
-        "f_workday_edu_doctor": "工作日流动_学历_博士",
-        "f_workday_edu_specialty": "工作日流动_学历_大专",
-        "f_workday_edu_primary": "工作日流动_学历_小学",
-        "f_workday_edu_college": "工作日流动_学历_本科",
-        "f_workday_edu_postgraduate": "工作日流动_学历_硕士",
-        "f_workday_edu_senior_middle": "工作日流动_学历_高中",
-        "f_workday_house_middle": "工作日流动_居住社区房价_中",
-        "f_workday_house_low": "工作日流动_居住社区房价_低",
-        "f_workday_house_lower": "工作日流动_居住社区房价_次低",
-        "f_workday_house_second_high": "工作日流动_居住社区房价_次高",
-        "f_workday_house_high": "工作日流动_居住社区房价_高",
-        "f_workday_age_17": "工作日流动_年龄_0_17",
-        "f_workday_age_24": "工作日流动_年龄_18_24",
-        "f_workday_age_30": "工作日流动_年龄_25_30",
-        "f_workday_age_35": "工作日流动_年龄_31_35",
-        "f_workday_age_40": "工作日流动_年龄_36_40",
-        "f_workday_age_45": "工作日流动_年龄_41_45",
-        "f_workday_age_60": "工作日流动_年龄_46_60",
-        "f_workday_age_over_60": "工作日流动_年龄_61以上",
-        "f_workday_sex_woman": "工作日流动_性别_女",
-        "f_workday_sex_man": "工作日流动_性别_男"
+class CustConfig:
+    FEATURE_COLUMNS = [
+        "BB_RETAIL_CUSTOMER_CODE",                     # 零售户代码
+        "BB_RTL_CUST_GRADE_NAME",                      # 零售户分档名称
+        "BB_RTL_CUST_MARKET_TYPE_NAME",                # 零售户市场类型名称
+        "STORE_AREA",                                  # 店铺经营面积
+        "BB_RTL_CUST_BUSINESS_TYPE_NAME",              # 零售户业态名称
+        "OPERATOR_EDU_LEVEL",                          # 零售客户经营者文化程
+        "OPERATOR_AGE",                                # 经营者年龄
+        "BB_RTL_CUST_CHAIN_FLAG",                      # 零售户连锁标识
+        "PRESENT_STAR_TERMINAL",                       # 终端星级
+        "MD04_MG_RTL_CUST_CREDITCLASS_NAME",           # 零售户信用等级名称
+        "MD04_DIR_SAL_STORE_FLAG",                     # 直营店标识
+        "BB_CUSTOMER_MANAGER_SCOPE_NAME",              # 零售户经营范围名称
+        "PRODUCT_INSALE_QTY",                          # 在销品规数
+        # "CUST_INVESTMENT",                             # 店铺资源投入建设
+        
+        # "NEW_PRODUCT_MEMBERS_QTY_SAMEPRICE_OCC",       # 新品订货量占同价类比重
+        # "PRODUCT_LISTING_RATE",                        # 品规上架率
+        # "STOCKOUT_DAYS",                              # 断货天数
+        # "YLT_TURNOVER_RATE",                           # 易灵通动销率
+        # "YLT_BAR_PACKAGE_SALE_OCC",                    # 易灵通条包销售占比
+        # "UNPACKING_RATE",                              # 拆包率
+        
+        
+        # "BB_RTL_CUST_POSITION_TYPE_NAME",              # 零售户商圈类型名称
+        
+        # "BB_RTL_CUST_SUB_BUSI_PLACE_NAME",             # 零售户业态细分名称
+        
+        # "BB_RTL_CUST_TERMINAL_LEVEL_NAME",             # 零售户终端层级名称
+        # "BB_RTL_CUST_TERMINALEVEL_NAME",               # 零售户终端层级细分名称
+        # "MD04_MG_SAMPLE_CUST_FLAG",                    # 样本户标识
+        # "MD07_RTL_CUST_IS_SALE_LARGE_FLAG",            # 零售户大户标识
+        # "BB_RTL_CUST_OPERATE_METHOD_NAME",             # 零售户经营方式名称
+        # "BB_RTL_CUST_CGT_OPERATE_SCOPE_NAME",          # 零售户卷烟经营规模名称
+        
+        # "AVERAGE_CONSUMER_FLOW",                       # 月均消费人流
+        # "NEW_PRODUCT_MEMBERS_QTY",                     # 新品消费会员数量
+    ]
+    # 数据清洗规则
+    CLEANING_RULES = {
+        "BB_RTL_CUST_GRADE_NAME":                   {"method": "fillna", "opt": "fill", "value": "十五档", "type": "str"},
+        "BB_RTL_CUST_MARKET_TYPE_NAME":             {"method": "fillna", "opt": "fill", "value": "城网", "type": "str"},
+        "STORE_AREA":                               {"method": "fillna", "opt": "mean", "type": "num"},
+        "BB_RTL_CUST_BUSINESS_TYPE_NAME":           {"method": "fillna", "opt": "fill", "value": "其他", "type": "str"},
+        "OPERATOR_EDU_LEVEL":                       {"method": "fillna", "opt": "fill", "value": "无数据", "type": "str"},
+        "OPERATOR_AGE":                             {"method": "fillna", "opt": "mean", "type": "num"},
+        "BB_RTL_CUST_CHAIN_FLAG":                   {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
+        "PRESENT_STAR_TERMINAL":                    {"method": "fillna", "opt": "fill", "value": "非星级", "type": "str"},
+        "MD04_MG_RTL_CUST_CREDITCLASS_NAME":        {"method": "fillna", "opt": "fill", "value": "B", "type": "str"},
+        "MD04_DIR_SAL_STORE_FLAG":                  {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
+        "BB_CUSTOMER_MANAGER_SCOPE_NAME":           {"method": "fillna", "opt": "fill", "value": "否", "type": "str"},
+        "PRODUCT_INSALE_QTY":                       {"method": "fillna", "opt": "mean", "type": "num"},
+        # "CUST_INVESTMENT":                          {"method": "fillna", "opt": "fill", "type": 0}
+        
+        
+        # "NEW_PRODUCT_MEMBERS_QTY_SAMEPRICE_OCC":    {"method": "fillna", "opt": "mean", "type": "num"},
+        # "PRODUCT_LISTING_RATE":                     {"method": "fillna", "opt": "mean", "type": "num"},
+        # "STOCKOUT_DAYS":                            {"method": "fillna", "opt": "mean", "type": "num"},
+        # "YLT_TURNOVER_RATE":                        {"method": "fillna", "opt": "mean", "type": "num"},
+        # "NEW_PRODUCT_MEMBERS_QTY":                  {"method": "fillna", "opt": "mean", "type": "num"},
+        # "PRODUCT_INSALE_QTY":                       {"method": "fillna", "opt": "mean", "type": "num"},
+        # "UNPACKING_RATE":                           {"method": "fillna", "opt": "mean", "type": "num"},
+        
+        
+        
+        
+        # "BB_RTL_CUST_POSITION_TYPE_NAME":           {"method": "fillna", "opt": "fill", "value": "其他", "type": "str"},
+        # "BB_RTL_CUST_SUB_BUSI_PLACE_NAME":          {"method": "fillna", "opt": "fill", "value": "其他", "type": "str"},
+        # "BB_RTL_CUST_TERMINALEVEL_NAME":          {"method": "fillna", "opt": "replace", "value": "BB_RTL_CUST_TERMINAL_LEVEL_NAME", "type": "str"},
+        # "MD04_MG_SAMPLE_CUST_FLAG":                 {"method": "fillna", "value": "N", "opt": "fill"},
+        # "MD07_RTL_CUST_IS_SALE_LARGE_FLAG":         {"method": "fillna", "value": "N", "opt": "fill"},
+        # "BB_RTL_CUST_CGT_OPERATE_SCOPE_NAME":       {"method": "fillna", "value": "中", "opt": "fill"},
+    }
+    
+    ONEHOT_CAT = {
+        "BB_RTL_CUST_GRADE_NAME":                   ['一档', '二档', '三档', '四档', '五档', '六档', '七档', '八档', '九档', '十档', '十一档', '十二档', 
+                                                    '十三档', '十四档', '十五档', '十六档', '十七档', '十八档', '十九档', '二十档', '二十一档', '二十二档', 
+                                                    '二十三档', '二十四档', '二十五档', '二十六档', '二十七档', '二十八档', '二十九档', '三十档'],
+        "BB_RTL_CUST_MARKET_TYPE_NAME":             ["城网", "农网"],
+        "BB_RTL_CUST_BUSINESS_TYPE_NAME":           ["便利店", "超市", "烟草专业店", "娱乐服务类", "其他"],
+        "OPERATOR_EDU_LEVEL":                       [1, 2, 3, 4, 5, 6, 7, "无数据"],
+        "BB_RTL_CUST_CHAIN_FLAG":                   ["是", "否"],
+        "PRESENT_STAR_TERMINAL":                    ["一星", "二星", "三星", "四星", "五星", "非星级"],
+        "MD04_MG_RTL_CUST_CREDITCLASS_NAME":        ["AAA", "AA", "A", "B", "C", "D"],
+        "MD04_DIR_SAL_STORE_FLAG":                  ["是", "否"],
+        "BB_CUSTOMER_MANAGER_SCOPE_NAME":           ["是", "否"],
+        
+        
+        
+        # "BB_RTL_CUST_POSITION_TYPE_NAME":           ["居民区", "商业娱乐区", "交通枢纽区", "旅游景区", "工业区", "集贸区", "院校学区", "办公区", "其他"]
+    }
+    
+class ProductConfig:
+    FEATURE_COLUMNS = [
+        "product_code",                                # 商品编码
+        "direct_retail_price",                         # 建议零售价
+        "is_low_tar",                                  # 是否低焦油烟
+        "tar_qty",                                     # 焦油含量
+        "is_exploding_beads",                          # 是否爆珠
+        "is_shortbranch",                              # 是否短支烟
+        "is_medium",                                   # 是否中支烟
+        "is_tiny",                                     # 是否细支
+        "product_style_code_name",                     # 包装类型名称
+        "org_is_abnormity",                            # 是否异形包装
+        "is_chuangxin",                                # 是否创新品类
+        "is_key_brand",                                # 是否重点品牌
+        "foster_level_hy",                             # 是否行业共育品规
+        "foster_level_sj",                             # 是否省级共育品规
+        "is_cigar",                                    # 是否雪茄型卷烟
+        "co_qty",                                      # 一氧化碳含量
+        "tbc_total_length",                            # 烟支总长度
+        "tbc_length",                                  # 烟支长度
+        "filter_length",                               # 滤嘴长度
+        
+
+        
+        # "adjust_price",                                # 含税调拨价
+        # "notwithtax_adjust_price",                     # 不含税调拨价
+        # "whole_sale_price",                            # 统一批发价
+        # "allot_price",                                 # 调拨价
+        # "direct_whole_price",                          # 批发指导价
+        # "retail_price",                                # 零售价
+        # "price_type_name",                             # 卷烟价类名称
+        # "gear_type_name",                              # 卷烟档位名称
+        # "category_type_name",                          # 卷烟品类名称
+        # "is_high_level",                               # 是否高端烟
+        # "is_upscale_level",                            # 是否高端烟不含高价
+        # "is_high_price",                               # 是否高价烟
+        # "is_low_price",                                # 是否低价烟
+        # "is_encourage",                                # 是否全国鼓励品牌
+        # "is_abnormity",                                # 是否异形包装
+        # "is_intake",                                   # 是否进口烟
+        # "is_short",                                    # 是否紧俏品牌
+        # "is_ordinary_price_type",                      # 是否普一类烟
+        # "source_type",                                 # 来源类型
+        # "chinese_mix",                                 # 中式混合
+        # "sub_price_type_name",                         # 细分卷烟价类名称
+    ]
+    
+    CLEANING_RULES = {
+        "direct_retail_price":                         {"method": "fillna", "opt": "mean", "type": "num"},
+        "is_low_tar":                                  {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "tar_qty":                                     {"method": "fillna", "opt": "mean", "type": "num"},
+        "is_exploding_beads":                          {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "is_shortbranch":                              {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "is_medium":                                   {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "is_tiny":                                     {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "product_style_code_name":                     {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
+        "org_is_abnormity":                            {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "is_chuangxin":                                {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "is_key_brand":                                {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "foster_level_hy":                             {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "foster_level_sj":                             {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "is_cigar":                                    {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        "co_qty":                                      {"method": "fillna", "opt": "mean", "type": "num"},
+        "tbc_total_length":                            {"method": "fillna", "opt": "mean", "type": "num"},
+        "tbc_length":                                  {"method": "fillna", "opt": "mean", "type": "num"},
+        "filter_length":                               {"method": "fillna", "opt": "mean", "type": "num"},
+        
+        
+        # "adjust_price":                                {"method": "fillna", "opt": "mean", "type": "num"},
+        # "notwithtax_adjust_price":                     {"method": "fillna", "opt": "mean", "type": "num"},
+        # "whole_sale_price":                            {"method": "fillna", "opt": "mean", "type": "num"},
+        # "allot_price":                                 {"method": "fillna", "opt": "fill", "type": "num", "value": 0.0},
+        # "direct_whole_price":                          {"method": "fillna", "opt": "mean", "type": "num"},
+        # "retail_price":                                {"method": "fillna", "opt": "mean", "type": "num"},
+        # "price_type_name":                             {"method": "fillna", "opt": "fill", "type": "str", "value": "一类烟"},
+        # "gear_type_name":                              {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
+        # "category_type_name":                          {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
+        # "is_high_level":                               {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        # "is_upscale_level":                            {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        # "is_high_price":                               {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        # "is_low_price":                                {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        # "is_encourage":                                {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        # "is_abnormity":                                {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        # "is_intake":                                   {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        # "is_short":                                    {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        # "is_ordinary_price_type":                      {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        # "source_type":                                 {"method": "fillna", "opt": "fill", "type": "str", "value": "其他"},
+        # "chinese_mix":                                 {"method": "fillna", "opt": "fill", "type": "str", "value": "否"},
+        # "sub_price_type_name":                         {"method": "fillna", "opt": "fill", "type": "str", "value": "普一类烟"},
+    }
+    
+
+    ONEHOT_CAT = {
+        "is_low_tar":                                  ["是", "否"],
+        "is_exploding_beads":                          ["是", "否"],
+        "is_shortbranch":                              ["是", "否"],
+        "is_medium":                                   ["是", "否"],
+        "is_tiny":                                     ["是", "否"],
+        "product_style_code_name":                     ["条盒硬盒", "条包硬盒", "条盒软盒", "条包软盒", "铁盒", "其他"],
+        "org_is_abnormity":                            ["是", "否"],
+        "is_chuangxin":                                ["是", "否"],
+        "is_key_brand":                                ["是", "否"],
+        "foster_level_hy":                             ["是", "否"],
+        "foster_level_sj":                             ["是", "否"],
+        "is_cigar":                                    ["是", "否"],
+        
+        
+        
+        # "price_type_name":                             ["一类烟", "二类烟", "三类烟", "四类烟", "五类烟", "无价类"],
+        # "gear_type_name":                              ["第一档位", "第二档位", "第三档位", "第四档位", "第五档位", "第六档位", "第七档位", "第八档位", "其他"],
+        # "category_type_name":                          ["第1品类", "第2品类", "第3品类", "第4品类", "第5品类", "第6品类", "第7品类", 
+        #                                                 "第8品类", "第9品类", "第10品类", "第11品类", "第12品类", "第13品类", "其他"],
+        # "is_high_level":                               ["是", "否"],
+        # "is_upscale_level":                            ["是", "否"],
+        # "is_high_price":                               ["是", "否"],
+        # "is_low_price":                                ["是", "否"],
+        # "is_encourage":                                ["是", "否"],
+        # "is_abnormity":                                ["是", "否"],
+        # "is_intake":                                   ["是", "否"],
+        # "is_short":                                    ["是", "否"],
+        # "is_ordinary_price_type":                      ["是", "否"],
+        # "source_type":                                 ["是", "否"],
+        # "chinese_mix":                                 ["是", "否"],
+        # "sub_price_type_name":                         ["高端烟", "高价位烟", "普一类烟", "二类烟", "三类烟", "四类烟", "五类烟", "无价类"],
+    }
+    
+class OrderConfig:
+    FEATURE_COLUMNS = [
+        "BB_RETAIL_CUSTOMER_CODE",                          # 零售户编码
+        "PRODUCT_CODE",                                     # 卷烟编码
+        "MONTH6_SALE_QTY",                                  # 近半年销量(箱)
+        "MONTH6_SALE_AMT",                                  # 近半年销售额(万元)
+        "MONTH6_GROSS_PROFIT_RATE",                         # 近半年毛利率
+        "MONTH6_SALE_QTY_YOY",                              # 销售量同比
+        "MONTH6_SALE_QTY_MOM",                              # 销售量环比
+        "MONTH6_SALE_AMT_YOY",                              # 销售额(购进额)同比
+        "MONTH6_SALE_AMT_MOM",                              # 销售额(购进额)环比
+        "STOCK_QTY",                                        # 库存
+        "ORDER_FULLORDR_RATE",                              # 订足率
+        "FULL_FILLMENT_RATE",                               # 订单满足率
+        "ORDER_FULLORDR_RATE_MOM",                          # 订足率环比
+        "CUSTOMER_REPURCHASE_RATE",                         # 会员重购率  
+        "NEW_PRODUCT_MEMBERS_QTY_SAMEPRICE_OCC",            # 新品订货量占同价类比重/decimal(18,6)
+        "DEMAND_RATE",                                      # 需求量满足率
+        "LISTING_RATE",                                     # 品规商上架率
+        "PUT_MARKET_FINISH_RATE",                           # 投放完成率
+        "OUT_STOCK_DAYS",                                   # 断货天数
+        "YLT_TURNOVER_RATE",                                # 易灵通动销率
+        "YLT_BAR_PACKAGE_SALE_OCC",                         # 易灵通调包销售占比
+        "UNPACKING_RATE",                                   # 拆包率
+        "POS_PACKAGE_PRICE",                                # pos机单包价格
+    ]
+    
+    CLEANING_FEATURES = [
+        "MONTH6_SALE_QTY",
+        "MONTH6_SALE_AMT",
+        "MONTH6_GROSS_PROFIT_RATE",
+        "MONTH6_SALE_QTY_YOY",
+        "MONTH6_SALE_QTY_MOM",
+        "MONTH6_SALE_AMT_YOY",
+        "MONTH6_SALE_AMT_MOM",
+        "STOCK_QTY",
+        "ORDER_FULLORDR_RATE",
+        "FULL_FILLMENT_RATE",
+        "ORDER_FULLORDR_RATE_MOM",
+        "CUSTOMER_REPURCHASE_RATE",
+        "NEW_PRODUCT_MEMBERS_QTY_SAMEPRICE_OCC",
+        "DEMAND_RATE",
+        "LISTING_RATE",
+        "PUT_MARKET_FINISH_RATE",
+        "OUT_STOCK_DAYS",
+        "UNPACKING_RATE",
+    ]
+    
+    WEIGHTS = {
+        "MONTH6_SALE_QTY":                                  0.15,
+        "MONTH6_SALE_QTY_MOM":                              0.2,
+        "ORDER_FULLORDR_RATE":                              0.3,
+        "ORDER_FULLORDR_RATE_MOM":                          0.35,
+    }
+    
+    POSFEATURES = [
+        "YLT_TURNOVER_RATE","YLT_BAR_PACKAGE_SALE_OCC","POS_PACKAGE_PRICE"
+    ]
+    
+class ImportanceFeaturesMap:
+    CUSTOM_FEATRUES_MAP = {
+        "BB_RTL_CUST_GRADE_NAME":                           "零售户分档名称",
+        "BB_RTL_CUST_MARKET_TYPE_NAME":                     "零售户市场类型名称",
+        "STORE_AREA":                                       "店铺经营面积",
+        "BB_RTL_CUST_BUSINESS_TYPE_NAME":                   "零售户业态名称",
+        "OPERATOR_EDU_LEVEL":                               "零售客户经营者文化程",
+        "OPERATOR_AGE":                                     "经营者年龄",
+        "BB_RTL_CUST_CHAIN_FLAG":                           "零售户连锁标识",
+        "PRESENT_STAR_TERMINAL":                            "终端星级",
+        "MD04_MG_RTL_CUST_CREDITCLASS_NAME":                "零售户信用等级名称",
+        "MD04_DIR_SAL_STORE_FLAG":                          "直营店标识",
+        "BB_CUSTOMER_MANAGER_SCOPE_NAME":                   "零售户经营范围名称",
+        "PRODUCT_INSALE_QTY":                               "在销品规数",
+        # "CUST_INVESTMENT":                                  "店铺资源投入建设",
+    }
+    
+    PRODUCT_FEATRUES_MAP = {
+        # ProductConfig 字段映射
+        "direct_retail_price":                              "建议零售价",
+        "is_low_tar":                                       "是否低焦油烟",
+        "tar_qty":                                          "焦油含量",
+        "is_exploding_beads":                               "是否爆珠",
+        "is_shortbranch":                                   "是否短支烟",
+        "is_medium":                                        "是否中支烟",
+        "is_tiny":                                          "是否细支",
+        "product_style_code_name":                          "包装类型名称",
+        "org_is_abnormity":                                 "是否异形包装",
+        "is_chuangxin":                                     "是否创新品类",
+        "is_key_brand":                                     "是否重点品牌",
+        "foster_level_hy":                                  "是否行业共育品规",
+        "foster_level_sj":                                  "是否省级共育品规",
+        "is_cigar":                                         "是否雪茄型卷烟",
+        "co_qty":                                           "一氧化碳含量",
+        "tbc_total_length":                                 "烟支总长度",
+        "tbc_length":                                       "烟支长度",
+        "filter_length":                                    "滤嘴长度",
+    }
+    
+    ORDER_FEATURE_MAP = {
+        "MONTH6_SALE_QTY": "近半年销量(箱)",
+        "MONTH6_SALE_AMT": "近半年销售额(万元)",
+        "MONTH6_GROSS_PROFIT_RATE": "近半年毛利率",
+        "MONTH6_SALE_QTY_YOY": "销量同比",
+        "MONTH6_SALE_QTY_MOM": "销量环比",
+        "MONTH6_SALE_AMT_YOY": "销售额(购进额)同比",
+        "MONTH6_SALE_AMT_MOM": "销售额(购进额)环比",
+        "STOCK_QTY": "库存",
+        "ORDER_FULLORDR_RATE": "订足率",
+        "ORDER_FULLORDR_RATE_MOM": "订足率环比",
+        "FULL_FILLMENT_RATE": "订单满足率",
+        "CUSTOMER_REPURCHASE_RATE": "会员重购率(部分有会员)",
+        "NEW_PRODUCT_MEMBERS_QTY_SAMEPRICE_OCC": "新品订货量占同价类比重",
+        "DEMAND_RATE": "需求量满足率",
+        "LISTING_RATE": "品规上架率",
+        "PUT_MARKET_FINISH_RATE": "投放完成率",
+        "OUT_STOCK_DAYS": "断货天数(部分零售商有)",
+        "UNPACKING_RATE": "拆包率",
+        "city_uuid": "城市UUID"
+    }
+    
+    POS_FEATURE_MAP = {
+        "YLT_TURNOVER_RATE": "易灵通动销率",
+        "YLT_BAR_PACKAGE_SALE_OCC": "易灵通条包销售占比",
+        "POS_PACKAGE_PRICE": "POS机单包价格",
+    }
+    
+    SHOPING_FEATURES_MAP = {
+        # 商圈 字段映射
+        "r_home_num": "常驻人口_居住人数",
+        "r_work_num": "常驻人口_工作人数",
+        "r_resident_num": "常驻人口_工作或居住人数",
+        "r_urban_cons_middle": "常驻人口_城市消费水平_中",
+        "r_urban_cons_low": "常驻人口_城市消费水平_低",
+        "r_urban_cons_lower": "常驻人口_城市消费水平_次低",
+        "r_urban_cons_secondhigh": "常驻人口_城市消费水平_次高",
+        "r_urban_cons_high": "常驻人口_城市消费水平_高",
+        "r_edu_junior_middle": "常驻人口_学历_初中",
+        "r_edu_doctor": "常驻人口_学历_博士",
+        "r_edu_specialty": "常驻人口_学历_大专",
+        "r_edu_primary": "常驻人口_学历_小学",
+        "r_edu_college": "常驻人口_学历_本科",
+        "r_edu_postgraduate": "常驻人口_学历_硕士",
+        "r_edu_senior_middle": "常驻人口_学历_高中",
+        "r_house_price79999": "常驻人口_居住社区房价_60000_79999",
+        "r_house_price59999": "常驻人口_居住社区房价_40000_59999",
+        "r_house_price39999": "常驻人口_居住社区房价_20000_39999",
+        "r_house_price19999": "常驻人口_居住社区房价_10000_19999",
+        "r_house_price9999": "常驻人口_居住社区房价_8000_9999",
+        "r_house_price7999": "常驻人口_居住社区房价_5000_7999",
+        "r_house_price4999": "常驻人口_居住社区房价_2000_4999",
+        "r_age_17": "常驻人口_年龄_0_17",
+        "r_age_24": "常驻人口_年龄_18_24",
+        "r_age_30": "常驻人口_年龄_25_30",
+        "r_age_35": "常驻人口_年龄_31_35",
+        "r_age_40": "常驻人口_年龄_36_40",
+        "r_age_45": "常驻人口_年龄_41_45",
+        "r_age_60": "常驻人口_年龄_46_60",
+        "r_age_over_60": "常驻人口_年龄_61以上",
+        "r_sex_woman": "常驻人口_性别_女",
+        "r_sex_man": "常驻人口_性别_男",
+        "r_catering_50": "常驻人口_餐饮消费水平_50",
+        "r_catering_100": "常驻人口_餐饮消费水平_100",
+        "r_catering_150": "常驻人口_餐饮消费水平_150",
+        "r_catering_200": "常驻人口_餐饮消费水平_200",
+        "r_catering_500": "常驻人口_餐饮消费水平_500",
+        "r_catering_over_500": "常驻人口_餐饮消费水平_500以上",
+        "r_catering_times_2": "常驻人口_餐饮消费频次_1_2",
+        "r_catering_times_4": "常驻人口_餐饮消费频次_2_4",
+        "r_catering_times_6": "常驻人口_餐饮消费频次_4_6",
+        "r_catering_times_8": "常驻人口_餐饮消费频次_6_8",
+        "r_catering_times_10": "常驻人口_餐饮消费频次_8_10",
+        "r_catering_times_11": "常驻人口_餐饮消费频次_11以上",
+        "r_native_beijing": "常驻人口_家乡地_北京市",
+        "r_native_tianjing": "常驻人口_家乡地_天津市",
+        "r_native_hebei": "常驻人口_家乡地_河北省",
+        "r_native_shanxi": "常驻人口_家乡地_山西省",
+        "r_native_neimeng": "常驻人口_家乡地_内蒙古",
+        "r_native_liaoning": "常驻人口_家乡地_辽宁省",
+        "r_native_jilin": "常驻人口_家乡地_吉林省",
+        "r_native_heilongjiang": "常驻人口_家乡地_黑龙江省",
+        "r_native_shanghai": "常驻人口_家乡地_上海市",
+        "r_native_jiangsu": "常驻人口_家乡地_江苏省",
+        "r_native_zhejiang": "常驻人口_家乡地_浙江省",
+        "r_native_anhui": "常驻人口_家乡地_安徽省",
+        "r_native_fujian": "常驻人口_家乡地_福建省",
+        "r_native_jiangix": "常驻人口_家乡地_江西省",
+        "r_native_shandong": "常驻人口_家乡地_山东省",
+        "r_native_henan": "常驻人口_家乡地_河南省",
+        "r_native_hubei": "常驻人口_家乡地_湖北省",
+        "r_native_hunan": "常驻人口_家乡地_湖南省",
+        "r_native_guangdong": "常驻人口_家乡地_广东省",
+        "r_native_hainan": "常驻人口_家乡地_海南省",
+        "r_native_sichuan": "常驻人口_家乡地_四川省",
+        "r_native_guizhou": "常驻人口_家乡地_贵州省",
+        "r_native_yunnan": "常驻人口_家乡地_云南省",
+        "r_native_shan": "常驻人口_家乡地_陕西省",
+        "r_native_gansu": "常驻人口_家乡地_甘肃省",
+        "r_native_qinghai": "常驻人口_家乡地_青海省",
+        "r_native_guangxi": "常驻人口_家乡地_广西壮族自治区",
+        "r_native_ningxia": "常驻人口_家乡地_宁夏回族自治区",
+        "r_native_xinjiang": "常驻人口_家乡地_新疆维吾尔自治区",
+        "r_native_xizang": "常驻人口_家乡地_西藏自治区",
+        "r_native_chongqing": "常驻人口_家乡地_重庆市",
+        "r_native_hongkong": "常驻人口_家乡地_香港",
+        "r_native_macao": "常驻人口_家乡地_澳门",
+        "r_native_taiwan": "常驻人口_家乡地_台湾",
+        "r_native_other": "常驻人口_家乡地_其它",
+        "f_flow_num": "流动人口_日均流动人口数量",
+        "f_holiday_flow_num": "流动人口_节假日日均流动人口数量",
+        "f_workday_flow_num": "流动人口_工作日日均流动人口数量",
+        "f_flowurban_cons_middle": "日均流动_城市消费水平_中",
+        "f_flowurban_cons_low": "日均流动_城市消费水平_低",
+        "f_flowurban_cons_lower": "日均流动_城市消费水平_次低",
+        "f_flowurban_cons_second_high": "日均流动_城市消费水平_次高",
+        "f_flowurban_cons_high": "日均流动_城市消费水平_高",
+        "f_flowedu_junior_middle": "日均流动_学历_初中",
+        "f_flowedu_doctor": "日均流动_学历_博士",
+        "f_flowedu_specialty": "日均流动_学历_大专",
+        "f_flowedu_primary": "日均流动_学历_小学",
+        "f_flowedu_college": "日均流动_学历_本科",
+        "f_flowedu_postgraduate": "日均流动_学历_硕士",
+        "f_flowedu_senior_middle": "日均流动_学历_高中",
+        "f_flowhouse_middle": "日均流动_居住社区房价_中",
+        "f_flowhouse_low": "日均流动_居住社区房价_低",
+        "f_flowhouse_lower": "日均流动_居住社区房价_次低",
+        "f_flowhouse_second_high": "日均流动_居住社区房价_次高",
+        "f_flowhouse_high": "日均流动_居住社区房价_高",
+        "f_flowage_17": "日均流动_年龄_0_17",
+        "f_flowage_24": "日均流动_年龄_18_24",
+        "f_flowage_30": "日均流动_年龄_25_30",
+        "f_flowage_35": "日均流动_年龄_31_35",
+        "f_flowage_40": "日均流动_年龄_36_40",
+        "f_flowage_45": "日均流动_年龄_41_45",
+        "f_flowage_60": "日均流动_年龄_46_60",
+        "f_flowage_over_60": "日均流动_年龄_61以上",
+        "f_flowsex_woman": "日均流动_性别_女",
+        "f_flowsex_man": "日均流动_性别_男",
+        "f_holidayurban_cons_middle": "节假日流动_城市消费水平_中",
+        "f_holidayurban_cons_low": "节假日流动_城市消费水平_低",
+        "f_holidayurban_cons_lower": "节假日流动_城市消费水平_次低",
+        "f_holidayurban_cons_secondhigh": "节假日流动_城市消费水平_次高",
+        "f_holidayurban_cons_high": "节假日流动_城市消费水平_高",
+        "f_holidayedu_junior_middle": "节假日流动_学历_初中",
+        "f_holidayedu_doctor": "节假日流动_学历_博士",
+        "f_holidayedu_specialty": "节假日流动_学历_大专",
+        "f_holidayedu_primary": "节假日流动_学历_小学",
+        "f_holidayedu_college": "节假日流动_学历_本科",
+        "f_holidayedu_postgraduate": "节假日流动_学历_硕士",
+        "f_holidayedu_senior_middle": "节假日流动_学历_高中",
+        "f_holidayhouse_middle": "节假日流动_居住社区房价_中",
+        "f_holidayhouse_low": "节假日流动_居住社区房价_低",
+        "f_holidayhouse_lower": "节假日流动_居住社区房价_次低",
+        "f_holidayhouse_second_high": "节假日流动_居住社区房价_次高",
+        "f_holidayhouse_high": "节假日流动_居住社区房价_高",
+        "f_holidayage_17": "节假日流动_年龄_0_17",
+        "f_holidayage_24": "节假日流动_年龄_18_24",
+        "f_holidayage_30": "节假日流动_年龄_25_30",
+        "f_holidayage_35": "节假日流动_年龄_31_35",
+        "f_holidayage_40": "节假日流动_年龄_36_40",
+        "f_holidayage_45": "节假日流动_年龄_41_45",
+        "f_holidayage_60": "节假日流动_年龄_46_60",
+        "f_holidayage_over_60": "节假日流动_年龄_61以上",
+        "f_holidaysex_woman": "节假日流动_性别_女",
+        "f_holidaysex_man": "节假日流动_性别_男",
+        "f_workday_urban_cons_middle": "工作日流动_城市消费水平_中",
+        "f_workday_urban_cons_low": "工作日流动_城市消费水平_低",
+        "f_workday_urban_cons_lower": "工作日流动_城市消费水平_次低",
+        "f_workday_urban_cons_secondhigh": "工作日流动_城市消费水平_次高",
+        "f_workday_urban_cons_high": "工作日流动_城市消费水平_高",
+        "f_workday_edu_junior_middle": "工作日流动_学历_初中",
+        "f_workday_edu_doctor": "工作日流动_学历_博士",
+        "f_workday_edu_specialty": "工作日流动_学历_大专",
+        "f_workday_edu_primary": "工作日流动_学历_小学",
+        "f_workday_edu_college": "工作日流动_学历_本科",
+        "f_workday_edu_postgraduate": "工作日流动_学历_硕士",
+        "f_workday_edu_senior_middle": "工作日流动_学历_高中",
+        "f_workday_house_middle": "工作日流动_居住社区房价_中",
+        "f_workday_house_low": "工作日流动_居住社区房价_低",
+        "f_workday_house_lower": "工作日流动_居住社区房价_次低",
+        "f_workday_house_second_high": "工作日流动_居住社区房价_次高",
+        "f_workday_house_high": "工作日流动_居住社区房价_高",
+        "f_workday_age_17": "工作日流动_年龄_0_17",
+        "f_workday_age_24": "工作日流动_年龄_18_24",
+        "f_workday_age_30": "工作日流动_年龄_25_30",
+        "f_workday_age_35": "工作日流动_年龄_31_35",
+        "f_workday_age_40": "工作日流动_年龄_36_40",
+        "f_workday_age_45": "工作日流动_年龄_41_45",
+        "f_workday_age_60": "工作日流动_年龄_46_60",
+        "f_workday_age_over_60": "工作日流动_年龄_61以上",
+        "f_workday_sex_woman": "工作日流动_性别_女",
+        "f_workday_sex_man": "工作日流动_性别_男"
     }

+ 62 - 61
models/rank/data/dataloader.py

@@ -1,62 +1,63 @@
-import pandas as pd
-from models.rank.data.config import CustConfig, ProductConfig
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler
-from models.rank.data.utils import one_hot_embedding
-
-class DataLoader:
-    def __init__(self,path):
-        self._gbdt_data_path = path
-        self._load_data()
-    
-    def _load_data(self):
-       
-        self._gbdt_data = pd.read_csv(self._gbdt_data_path, encoding="utf-8")
-        self._gbdt_data.drop('BB_RETAIL_CUSTOMER_CODE', axis=1, inplace=True)
-        self._gbdt_data.drop('product_code', axis=1, inplace=True)
-        
-        self._onehot_feats = {**CustConfig.ONEHOT_CAT, **ProductConfig.ONEHOT_CAT}
-        
-        self._onehot_columns = list(self._onehot_feats.keys())
-        self._numeric_columns = self._gbdt_data.drop(self._onehot_columns + ["label"], axis=1).columns
-        
-        # 将类别数据进行one-hot编码
-        self._gbdt_data = one_hot_embedding(self._gbdt_data, self._onehot_feats)
-        
-    
-    def split_dataset(self):
-        """数据集划分,将数据集划分为训练集、验证集、测试集"""
-        # 1. 分离特征和标签
-        features = self._gbdt_data.drop("label", axis=1)
-        labels = self._gbdt_data["label"]
-        
-        # 2. 划分数据集,80%训练集、20%的测试集
-        X_train, X_test, y_train, y_test = train_test_split(
-            features, labels, 
-            test_size=0.2, 
-            random_state=42, 
-            shuffle=True,
-            stratify=labels,
-        )
-        
-        # 3. 数据标准化(仅对特征进行标准化)
-        scaler = StandardScaler()
-        X_train[self._numeric_columns] = scaler.fit_transform(X_train[self._numeric_columns])
-        X_test[self._numeric_columns] = scaler.fit_transform(X_test[self._numeric_columns])
-        
-        train_dataset = {"data": X_train, "label": y_train}
-        test_dataset = {"data": X_test, "label": y_test}
-        
-        return train_dataset, test_dataset
-    
-if __name__ == '__main__':
-    path = './models/rank/data/gbdt_data.csv'
-    dataloader = DataLoader(path)
-    train_dataset, test_dataset = dataloader.split_dataset()
-    
-    # 打印训练集和测试集的正负样本分布
-    print("训练集正负样本分布:")
-    print(train_dataset["label"].value_counts(normalize=True))
-    
-    print("测试集正负样本分布:")
+import pandas as pd
+from models.rank.data.config import CustConfig, ProductConfig, ShopConfig
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from models.rank.data.utils import one_hot_embedding
+
+class DataLoader:
+    def __init__(self,path):
+        self._gbdt_data_path = path
+        self._load_data()
+    
+    def _load_data(self):
+       
+        self._gbdt_data = pd.read_csv(self._gbdt_data_path, encoding="utf-8")
+        self._gbdt_data.drop('cust_code', axis=1, inplace=True)
+        self._gbdt_data.drop('product_code', axis=1, inplace=True)
+        
+        self._onehot_feats = {**CustConfig.ONEHOT_CAT, **ProductConfig.ONEHOT_CAT, **ShopConfig.ONEHOT_CAT}
+        
+        self._onehot_columns = list(self._onehot_feats.keys())
+        self._numeric_columns = self._gbdt_data.drop(self._onehot_columns + ["label"], axis=1).columns
+        
+        # 将类别数据进行one-hot编码
+        self._gbdt_data = one_hot_embedding(self._gbdt_data, self._onehot_feats)
+        
+    
+    def split_dataset(self):
+        """数据集划分,将数据集划分为训练集、测试集"""
+        # 1. 分离特征和标签
+        features = self._gbdt_data.drop("label", axis=1)
+        labels = self._gbdt_data["label"]
+        
+        # 2. 划分数据集,80%训练集、20%的测试集
+        X_train, X_test, y_train, y_test = train_test_split(
+            features, labels, 
+            test_size=0.2, 
+            random_state=42, 
+            shuffle=True,
+            stratify=labels,
+        )
+        
+        # 3. 数据标准化(仅对特征进行标准化)
+        if len(self._numeric_columns) != 0:
+            scaler = StandardScaler()
+            X_train[self._numeric_columns] = scaler.fit_transform(X_train[self._numeric_columns])
+            X_test[self._numeric_columns] = scaler.transform(X_test[self._numeric_columns])
+        
+        train_dataset = {"data": X_train, "label": y_train}
+        test_dataset = {"data": X_test, "label": y_test}
+        
+        return train_dataset, test_dataset
+    
+if __name__ == '__main__':
+    path = './data/train_data.csv'
+    dataloader = DataLoader(path)
+    train_dataset, test_dataset = dataloader.split_dataset()
+    
+    # 打印训练集和测试集的正负样本分布
+    print("训练集正负样本分布:")
+    print(train_dataset["label"].value_counts(normalize=True))
+    
+    print("测试集正负样本分布:")
     print(test_dataset["label"].value_counts(normalize=True))

+ 140 - 223
models/rank/data/preprocess.py

@@ -1,224 +1,141 @@
-from database import MySqlDao
-from models.rank.data.config import CustConfig, ProductConfig, OrderConfig
-import os
-import pandas as pd
-from sklearn.preprocessing import MinMaxScaler
-from sklearn.utils import shuffle
-import numpy as np
-
-class DataProcess():
-    def __init__(self, city_uuid, save_dir):
-        self._mysql_dao = MySqlDao()
-        self.save_dir = save_dir
-        print("正在加载cust_info...")
-        self._cust_data = self._mysql_dao.load_cust_data(city_uuid)
-        print("正在加载product_info...")
-        self._product_data = self._mysql_dao.load_product_data(city_uuid)
-        print("正在加载order_info...")
-        self._order_data = self._mysql_dao.load_order_data(city_uuid)
-        # self._order_data = self._mysql_dao.load_mock_order_data()
-        print("正在加载shopping_info...")
-        self._shopping_data = self._mysql_dao.load_shopping_data(city_uuid)
-        
-    def data_process(self):
-        """数据预处理"""
-        ori_train_data_save_path = os.path.join(self.save_dir, "original_train_data.csv")
-        pos_train_data_save_path = os.path.join(self.save_dir, "pos_train_data.csv")
-        shopping_train_data_save_path = os.path.join(self.save_dir, "shopping_train_data.csv")
-        if os.path.exists(ori_train_data_save_path):
-            os.remove(ori_train_data_save_path)
-        if os.path.exists(pos_train_data_save_path):
-            os.remove(pos_train_data_save_path)
-        if os.path.exists(shopping_train_data_save_path):
-            os.remove(shopping_train_data_save_path)
-        
-        # 1. 获取指定的特征组合
-        self._cust_data = self._cust_data[CustConfig.FEATURE_COLUMNS]
-        self._product_data = self._product_data[ProductConfig.FEATURE_COLUMNS]
-        self._order_data = self._order_data[OrderConfig.FEATURE_COLUMNS]
-        
-        # 2. 数据清洗
-        self._clean_cust_data()
-        self._clean_product_data()
-        self._clean_order_data()
-        self._clean_shopping_data()
-        
-        # 3. 生成训练数据集
-        ori_train_data = self._generate_original_train_data(is_pos=False)
-        shopping_train_data = self._generate_shopping_train_data()
-        pos_train_data = self._generate_pos_train_data()
-        
-        ori_train_data.to_csv(ori_train_data_save_path, index=False)
-        shopping_train_data.to_csv(shopping_train_data_save_path, index=False)
-        pos_train_data.to_csv(pos_train_data_save_path, index=False)
-        
-
-    def _clean_cust_data(self):
-        """用户信息表数据清洗"""
-        self._cust_data["BB_RETAIL_CUSTOMER_CODE"] = self._cust_data["BB_RETAIL_CUSTOMER_CODE"].astype(str)
-        # 根据配置规则清洗数据
-        for feature, rules, in CustConfig.CLEANING_RULES.items():
-            if rules["type"] == "num":
-                # 先将数值型字符串转换为数值
-                self._cust_data[feature] = pd.to_numeric(self._cust_data[feature], errors="coerce")
-                
-            if rules["method"] == "fillna":
-                if rules["opt"] == "fill":
-                    self._cust_data[feature] = self._cust_data[feature].fillna(rules["value"]).infer_objects(copy=False)
-                elif rules["opt"] == "replace":
-                    self._cust_data[feature] = self._cust_data[feature].fillna(self._cust_data[rules["value"]]).infer_objects(copy=False)
-                elif rules["opt"] == "mean":
-                    self._cust_data[feature] = self._cust_data[feature].fillna(self._cust_data[feature].mean()).infer_objects(copy=False)
-                self._cust_data[feature] = self._cust_data[feature].infer_objects(copy=False)
-    
-    def _clean_product_data(self):
-        """卷烟信息表数据清洗"""
-        self._product_data["product_code"] = self._product_data["product_code"].astype(str)
-        for feature, rules, in ProductConfig.CLEANING_RULES.items():
-            if rules["type"] == "num":
-                self._product_data[feature] = pd.to_numeric(self._product_data[feature], errors="coerce")
-            
-            if rules["method"] == "fillna":
-                if rules["opt"] == "fill":
-                    self._product_data[feature] = self._product_data[feature].fillna(rules["value"]).infer_objects(copy=False)
-                elif rules["opt"] == "mean":
-                    self._product_data[feature] = self._product_data[feature].fillna(self._product_data[feature].mean()).infer_objects(copy=False)
-                self._product_data[feature] = self._product_data[feature].infer_objects(copy=False)
-                    
-    def _clean_order_data(self):
-        remaining_cols = self._order_data.columns.drop(OrderConfig.POSFEATURES) # 数据清洗时先不对pos数据做处理
-        col_all_missing = remaining_cols[self._order_data[remaining_cols].isnull().all()].to_list()
-        self._order_data.drop(columns=col_all_missing, inplace=True)
-        
-        # 去除重复值和填补缺失值
-        self._order_data.drop_duplicates(inplace=True)
-        self._order_data[remaining_cols.drop(col_all_missing)] = self._order_data[remaining_cols.drop(col_all_missing)].fillna(0)
-        self._order_data = self._order_data.infer_objects(copy=False) 
-        
-        
-    def _clean_shopping_data(self):
-        """处理商圈数据缺省值"""
-        self._shopping_data.drop(columns=["cust_uuid", "longitude", "latitude", "range_radius"], axis=1, inplace=True)
-        remaining_cols = self._shopping_data.columns.drop(["city_uuid", "cust_code"])
-        col_with_missing = remaining_cols[self._shopping_data[remaining_cols].isnull().any()].tolist() # 判断有缺失的字段
-        col_all_missing = remaining_cols[self._shopping_data[remaining_cols].isnull().all()].to_list() # 全部缺失的字段
-        col_partial_missing = list(set(col_with_missing) - set(col_all_missing)) # 部分缺失的字段
-        
-        for col in col_partial_missing:
-            self._shopping_data[col] = self._shopping_data[col].fillna(self._shopping_data[col].mean())
-        
-        for col in col_all_missing:
-            self._shopping_data[col] = self._shopping_data[col].fillna(0).infer_objects(copy=False)
-    
-    def _generate_original_train_data(self, is_pos):
-        union_data = self._union_order_cust_product(is_pos)
-        scored_data = self._calculate_score(union_data)
-        labeled_data = self._labeled_data(scored_data)
-        
-        # labeled_data.to_csv(save_path, index=False)
-        return labeled_data
-        
-        
-    
-    def _generate_pos_train_data(self):
-        pos_data = self._generate_original_train_data(is_pos=True)
-        pos_data.dropna(subset=['YLT_TURNOVER_RATE'], inplace=True)
-        pos_data[OrderConfig.POSFEATURES] = pos_data[OrderConfig.POSFEATURES].fillna(0)
-        pos_data = pos_data.infer_objects(copy=False)
-        return pos_data
-        
-    
-    def _generate_shopping_train_data(self):
-        orignal_data = self._generate_original_train_data(is_pos=False)
-        cust_feats = self._shopping_data.set_index("cust_code")
-        
-        shopping_train_data = orignal_data.join(cust_feats, on="BB_RETAIL_CUSTOMER_CODE", how="inner")
-        return shopping_train_data
-    
-    def _union_order_cust_product(self, is_pos):
-        """联合order表、商户表、卷烟表"""
-        union_data = self._order_data.copy()
-        if not is_pos:
-            union_data.drop(OrderConfig.POSFEATURES, axis=1, inplace=True)
-        union_data.rename(columns={"PRODUCT_CODE": "product_code"}, inplace=True)
-        # union_data = union_data.drop(OrderConfig.POSFEATURES) # 去除pos数据特征字段
-        cust_feats = self._cust_data.set_index("BB_RETAIL_CUSTOMER_CODE")
-        product_feats = self._product_data.set_index("product_code")
-        
-        union_data = union_data.join(cust_feats, on="BB_RETAIL_CUSTOMER_CODE", how="inner")
-        union_data = union_data.join(product_feats, on="product_code", how="inner")
-        
-        return union_data
-        # self._train_data = shuffle(self._train_data, random_state=42)
-        
-    def _calculate_score(self, union_data):
-        """计算联合数据记录的分数"""
-        # 对参与算分的特征值进行归一化
-        scaler = MinMaxScaler()
-        union_data[list(OrderConfig.WEIGHTS.keys())] = scaler.fit_transform(union_data[list(OrderConfig.WEIGHTS.keys())])
-        # 计算加权分数
-        union_data["score"] = sum(union_data[feat] * weight 
-                          for feat, weight in OrderConfig.WEIGHTS.items())
-        
-        return union_data
-    
-    def _labeled_data(self, scored_data):
-        """通过计算分数打标签"""
-        # 按品规分组计算中位数
-        product_medians = scored_data.groupby("product_code")["score"].median().reset_index()
-        product_medians.columns = ["product_code", "median_score"]
-        
-        # 合并中位数到原始订单数据
-        temp_data = pd.merge(scored_data, product_medians, on="product_code", how="left")
-        
-        # 生成标签 (1: 大于等于中位数, 0: 小于中位数)
-        temp_data["label"] = np.where(
-            temp_data["score"] >= temp_data["median_score"], 1, 0
-        )
-        temp_data = temp_data.sort_values("score", ascending=False)
-        temp_data.drop(columns=["median_score", "score"], inplace=True)
-        scored_data = shuffle(temp_data, random_state=42)
-        return scored_data
-    
-    # def _descartes(self):
-    #     """将零售户信息与卷烟信息进行笛卡尔积连接"""
-    #     self._cust_data["descartes"] = 1
-    #     self._product_data["descartes"] = 1
-        
-    #     self._descartes_data = pd.merge(self._cust_data, self._product_data, on="descartes").drop("descartes", axis=1)
-        
-    # def _labeled_data_from_descartes(self):
-    #     """根据order表信息给descartes_data数据打标签"""
-    #     # 获取order表中的正样本组合
-    #     order_combinations = self._order_data[["BB_RETAIL_CUSTOMER_CODE", "PRODUCT_CODE"]].drop_duplicates()
-    #     order_set = set(zip(order_combinations["BB_RETAIL_CUSTOMER_CODE"], order_combinations["PRODUCT_CODE"]))
-        
-    #     # 在descartes_data中打标签:正样本为1,负样本为0
-    #     self._descartes_data['label'] = self._descartes_data.apply(
-    #         lambda row: 1 if (row['BB_RETAIL_CUSTOMER_CODE'], row['product_code']) in order_set else 0, axis=1)
-
-    # def _generate_train_data_from_descartes(self):
-    #     """从descartes_data中生成训练数据"""
-    #     positive_samples = self._descartes_data[self._descartes_data["label"] == 1]
-    #     negative_samples = self._descartes_data[self._descartes_data["label"] == 0]
-        
-    #     positive_count = len(positive_samples)
-    #     negative_count = min(1 * positive_count, len(negative_samples))
-    #     print(positive_count)
-    #     print(negative_count)
-        
-    #     # 随机抽取2倍正样本数量的负样本
-    #     negative_samples_sampled = negative_samples.sample(n=negative_count, random_state=42)
-    #     # 合并正负样本
-    #     self._train_data = pd.concat([positive_samples, negative_samples_sampled], axis=0)
-    #     self._train_data = self._train_data.sample(frac=1, random_state=42).reset_index(drop=True)
-        
-    #     # 保存训练数据
-    #     self._train_data.to_csv(self._save_res_path, index=False)
-    
-if __name__ == '__main__':
-    city_uuid = "00000000000000000000000011445301"
-    # city_uuid = "00000000000000000000000011441801"
-    save_dir = "./data"
-    processor = DataProcess(city_uuid, save_dir)
+from database import MySqlDao
+from models.rank.data.config import CustConfig, ProductConfig, OrderConfig, ShopConfig
+import os
+import pandas as pd
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.utils import shuffle
+import numpy as np
+
+class DataProcess():
+    def __init__(self, city_uuid, save_dir):
+        self._mysql_dao = MySqlDao()
+        self.save_dir = save_dir
+        print("正在加载cust_info...")
+        self._cust_data = self._mysql_dao.load_cust_data(city_uuid)
+        print("正在加载product_info...")
+        self._product_data = self._mysql_dao.load_product_data(city_uuid)
+        print("正在加载order_info...")
+        self._order_data = self._mysql_dao.load_order_data(city_uuid)
+        # self._order_data = self._mysql_dao.load_mock_order_data()
+        print("正在加载shopping_info...")
+        self._shopping_data = self._mysql_dao.load_shopping_data(city_uuid)
+        
+    def data_process(self):
+        """数据预处理"""
+        train_data_save_path = os.path.join(self.save_dir, "train_data.csv")
+        if os.path.exists(train_data_save_path):
+            os.remove(train_data_save_path)
+        
+        # 1. 获取指定的特征组合
+        self._cust_data = self._cust_data[CustConfig.FEATURE_COLUMNS]
+        self._product_data = self._product_data[ProductConfig.FEATURE_COLUMNS]
+        self._order_data = self._order_data[OrderConfig.FEATURE_COLUMNS]
+        self._shopping_data = self._shopping_data[ShopConfig.FEATURE_COLUMNS]
+        
+        # 2. 数据清洗
+        self._clean_cust_data()
+        self._clean_product_data()
+        self._clean_order_data()
+        self._clean_shopping_data()
+        
+        # 3. 生成训练数据集
+        train_data = self._generate_train_data()
+        train_data.to_csv(train_data_save_path, index=False, encoding="utf-8")
+        
+
+    def _clean_cust_data(self):
+        """用户信息表数据清洗"""
+        self._cust_data["BB_RETAIL_CUSTOMER_CODE"] = self._cust_data["BB_RETAIL_CUSTOMER_CODE"].astype(str)
+        # 根据配置规则清洗数据
+        for feature, rules, in CustConfig.CLEANING_RULES.items():
+            if rules["type"] == "num":
+                # 先将数值型字符串转换为数值
+                self._cust_data[feature] = pd.to_numeric(self._cust_data[feature], errors="coerce")
+                
+            if rules["method"] == "fillna":
+                if rules["opt"] == "fill":
+                    self._cust_data[feature] = self._cust_data[feature].fillna(rules["value"]).infer_objects(copy=False)
+                elif rules["opt"] == "replace":
+                    self._cust_data[feature] = self._cust_data[feature].fillna(self._cust_data[rules["value"]]).infer_objects(copy=False)
+                elif rules["opt"] == "mean":
+                    self._cust_data[feature] = self._cust_data[feature].fillna(self._cust_data[feature].mean()).infer_objects(copy=False)
+                self._cust_data[feature] = self._cust_data[feature].infer_objects(copy=False)
+    
+    def _clean_product_data(self):
+        """卷烟信息表数据清洗"""
+        self._product_data["product_code"] = self._product_data["product_code"].astype(str)
+        for feature, rules, in ProductConfig.CLEANING_RULES.items():
+            if rules["type"] == "num":
+                self._product_data[feature] = pd.to_numeric(self._product_data[feature], errors="coerce")
+            
+            if rules["method"] == "fillna":
+                if rules["opt"] == "fill":
+                    self._product_data[feature] = self._product_data[feature].fillna(rules["value"]).infer_objects(copy=False)
+                elif rules["opt"] == "mean":
+                    self._product_data[feature] = self._product_data[feature].fillna(self._product_data[feature].mean()).infer_objects(copy=False)
+                self._product_data[feature] = self._product_data[feature].infer_objects(copy=False)
+                    
+    def _clean_order_data(self):
+        self._order_data["cust_code"] = self._order_data["cust_code"].astype(str)
+        self._order_data["product_code"] = self._order_data["product_code"].astype(str)
+        
+        # self._order_data[order_cols.drop(col_all_missing)] = self._order_data[order_cols.drop(col_all_missing)].fillna(0)
+        self._order_data["sale_qty"] = self._order_data["sale_qty"].fillna(0)
+        self._order_data = self._order_data.infer_objects(copy=False)
+        
+        # 将销售量进行分组求和
+        self._order_data = self._order_data.groupby(["cust_code", "product_code"], as_index=False)["sale_qty"].sum()
+        
+        
+    def _clean_shopping_data(self):
+        """处理商圈数据缺省值"""
+        self._shopping_data["cust_code"] = self._shopping_data["cust_code"].astype(str)
+        # 根据配置规则清洗数据
+        for feature, rules, in ShopConfig.CLEANING_RULES.items():
+            if rules["type"] == "num":
+                # 先将数值型字符串转换为数值
+                self._shopping_data[feature] = pd.to_numeric(self._shopping_data[feature], errors="coerce")
+                
+            if rules["method"] == "fillna":
+                if rules["opt"] == "fill":
+                    self._shopping_data[feature] = self._shopping_data[feature].fillna(rules["value"]).infer_objects(copy=False)
+                elif rules["opt"] == "replace":
+                    self._shopping_data[feature] = self._shopping_data[feature].fillna(self._shopping_data[rules["value"]]).infer_objects(copy=False)
+                elif rules["opt"] == "mean":
+                    self._shopping_data[feature] = self._shopping_data[feature].fillna(self._shopping_data[feature].mean()).infer_objects(copy=False)
+                self._shopping_data[feature] = self._shopping_data[feature].infer_objects(copy=False)
+    
+    def _generate_train_data(self):
+        """生成训练数据"""
+        # 将商户表与商圈表进行连接
+        cust_feats = self._shopping_data.set_index("cust_code")
+        self._cust_data = self._cust_data.join(cust_feats, on="BB_RETAIL_CUSTOMER_CODE", how="inner")
+        
+        union_data = self._union_order_cust_product()
+        
+        train_data = self._labeled_data(union_data)
+        
+        return train_data
+    
+    def _union_order_cust_product(self):
+        """联合order表、商户表、卷烟表"""
+        # 使用merge进行连接
+        union_data = self._order_data.merge(self._product_data, on="product_code", how="inner")
+        union_data = union_data.merge(self._cust_data, left_on='cust_code', right_on='BB_RETAIL_CUSTOMER_CODE', how="inner")
+        union_data = union_data.drop(columns=['BB_RETAIL_CUSTOMER_CODE'])
+        
+        return union_data
+        
+    def _labeled_data(self, union_data):
+        union_data['label'] = union_data['sale_qty'].apply(lambda x: 0 if x == 0 else 1)
+        train_data = union_data.drop(columns=['sale_qty'])
+        train_data = shuffle(train_data, random_state=42)
+        
+        return train_data
+    
+if __name__ == '__main__':
+    city_uuid = "00000000000000000000000011445301"
+    # city_uuid = "00000000000000000000000011441801"
+    save_dir = "./data"
+    processor = DataProcess(city_uuid, save_dir)
     processor.data_process()

+ 23 - 23
models/rank/data/utils.py

@@ -1,24 +1,24 @@
-import pandas as pd
-def one_hot_embedding(dataframe, onehout_feat):
-    """对数据的指定特征做embedding编码"""
-    # 先将指定的特征进行Categorical处理
-    for feat, categories in onehout_feat.items():
-        dataframe[feat] = pd.Categorical(dataframe[feat], categories=categories, ordered=False)
-    dataframe = pd.get_dummies(
-        dataframe,
-        columns=list(onehout_feat.keys()),
-        prefix_sep="_",
-        dtype=int,
-    )
-    return dataframe
-
-def sample_data_clear(data, config):
-    for feature, rules, in config.CLEANING_RULES.items():
-        if rules["type"] == "num":
-            data[feature] = pd.to_numeric(data[feature], errors="coerce")
-        if rules["method"] == "fill":
-            if rules["type"] == "str":
-                data[feature] = data[feature].fillna(rules["value"])
-            elif rules["type"] == "num":
-                data[feature] = data[feature].fillna(0.0)
+import pandas as pd
+def one_hot_embedding(dataframe, onehout_feat):
+    """对数据的指定特征做embedding编码"""
+    # 先将指定的特征进行Categorical处理
+    for feat, categories in onehout_feat.items():
+        dataframe[feat] = pd.Categorical(dataframe[feat], categories=categories, ordered=False)
+    dataframe = pd.get_dummies(
+        dataframe,
+        columns=list(onehout_feat.keys()),
+        prefix_sep="_",
+        dtype=int,
+    )
+    return dataframe
+
+def sample_data_clear(data, config):
+    for feature, rules, in config.CLEANING_RULES.items():
+        if rules["type"] == "num":
+            data[feature] = pd.to_numeric(data[feature], errors="coerce")
+        if rules["method"] == "fill":
+            if rules["type"] == "str":
+                data[feature] = data[feature].fillna(rules["value"])
+            elif rules["type"] == "num":
+                data[feature] = data[feature].fillna(0.0)
     return data

+ 124 - 124
models/rank/gbdt_lr.py

@@ -1,125 +1,125 @@
-#!/usr/bin/env python3
-# -*- coding:utf-8 -*-
-import numpy as np
-from models.rank.data import DataLoader
-from sklearn.ensemble import GradientBoostingClassifier
-from sklearn.linear_model import LogisticRegression
-from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
-from sklearn.model_selection import GridSearchCV
-from sklearn.preprocessing import OneHotEncoder
-import joblib
-import time
-
-class Trainer:
-    def __init__(self, path):
-        self._load_data(path)
-        
-        # 初始化GBDT和LR模型参数
-        self._gbdt_params = {
-            'n_estimators': 100,
-            'learning_rate': 0.01,
-            'max_depth': 6,
-            'subsample': 0.8,
-            'random_state': 42,
-        }
-        self._lr_params = {
-            "max_iter": 1000,
-            'C': 1.0, 
-            'penalty': 'elasticnet', 
-            'l1_ratio': 0.8,  # 添加 l1_ratio 参数,可以根据需要调整
-            'solver': 'saga',
-            'random_state': 42,
-            'class_weight': 'balanced'
-        }
-        
-        # 初始化模型
-        self._gbdt_model = GradientBoostingClassifier(**self._gbdt_params)
-        self._lr_model = LogisticRegression(**self._lr_params)
-        
-        self._onehot_encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
-        
-    def _load_data(self, path):
-        dataloader = DataLoader(path)
-        self._train_dataset, self._test_dataset = dataloader.split_dataset()
-        
-    def train(self):
-        """模型训练"""
-        print("开始训练GBDT模型...")
-        # 训练GBDT模型
-        self._gbdt_model.fit(self._train_dataset["data"], self._train_dataset["label"])
-        
-        # 获取GBDT的每棵树的分数(决策值)
-        gbdt_train_preds = self._gbdt_model.apply(self._train_dataset["data"])[:, :, 0]  # 仅取每棵树的叶节点输出
-        
-        gbdt_feats_encoded = self._onehot_encoder.fit_transform(gbdt_train_preds)
-        
-        print("开始训练LR模型...")
-        # 使用决策树输出作为LR的输入特征
-        self._lr_model.fit(gbdt_feats_encoded, self._train_dataset["label"])
-        
-    def predict(self, X):
-        # 获取GBDT模型的预测分数
-        gbdt_preds = self._gbdt_model.apply(X)[:, :, 0]
-        
-        gbdt_feats_encoded = self._onehot_encoder.transform(gbdt_preds)
-        
-        # 使用训练好的LR模型输出概率
-        return self._lr_model.predict(gbdt_feats_encoded)
-    
-    def predict_proba(self, X):
-        # 获取GBDT模型的预测分数
-        gbdt_preds = self._gbdt_model.apply(X)[:, :, 0]
-        
-        gbdt_feats_encoded = self._onehot_encoder.transform(gbdt_preds)
-        
-        # 使用训练好的LR模型输出概率
-        return self._lr_model.predict_proba(gbdt_feats_encoded)
-        
-    def evaluate(self):
-        # 对测试集进行预测
-        y_pred = self.predict(self._test_dataset["data"])
-        y_pred_proba = self.predict_proba(self._test_dataset["data"])[:, 1]  # 获取正类的概率
-        
-        # 计算各类评估指标
-        accuracy = accuracy_score(self._test_dataset["label"], y_pred)
-        precision = precision_score(self._test_dataset["label"], y_pred)
-        recall = recall_score(self._test_dataset["label"], y_pred)
-        f1 = f1_score(self._test_dataset["label"], y_pred)
-        roc_auc = roc_auc_score(self._test_dataset["label"], y_pred_proba)    
-        
-        return {
-            'accuracy': accuracy,
-            'precision': precision,
-            'recall': recall,
-            'f1_score': f1,
-            'roc_auc': roc_auc
-        }
-        
-    def save_model(self, model_path):
-        """将模型保存到本地"""
-        models = {"gbdt_model": self._gbdt_model, "lr_model": self._lr_model, "onehot_encoder": self._onehot_encoder}
-        joblib.dump(models, model_path)
-    
-     
-if __name__ == "__main__":
-    gbdt_data_path = "./models/rank/data/gbdt_data.csv"
-    trainer = Trainer(gbdt_data_path)
-    
-    start_time = time.time()
-    trainer.train()
-    end_time = time.time()
-    
-    training_time_hours = (end_time - start_time) / 3600
-    print(f"训练时间: {training_time_hours:.4f} 小时")
-    
-    eval_metrics = trainer.evaluate()
-    
-    # 输出评估结果
-    print("GBDT-LR Evaluation Metrics:")
-    for metric, value in eval_metrics.items():
-        print(f"{metric}: {value:.4f}")
-        
-    # 保存模型
-    model_path = "./models/rank/weights/model.pkl"
-    trainer.save_model(model_path)
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+import numpy as np
+from models.rank.data import DataLoader
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
+from sklearn.model_selection import GridSearchCV
+from sklearn.preprocessing import OneHotEncoder
+import joblib
+import time
+
+class Trainer:
+    def __init__(self, path):
+        self._load_data(path)
+        
+        # 初始化GBDT和LR模型参数
+        self._gbdt_params = {
+            'n_estimators': 100,
+            'learning_rate': 0.01,
+            'max_depth': 6,
+            'subsample': 0.8,
+            'random_state': 42,
+        }
+        self._lr_params = {
+            "max_iter": 1000,
+            'C': 1.0, 
+            'penalty': 'elasticnet', 
+            'l1_ratio': 0.8,  # 添加 l1_ratio 参数,可以根据需要调整
+            'solver': 'saga',
+            'random_state': 42,
+            'class_weight': 'balanced'
+        }
+        
+        # 初始化模型
+        self._gbdt_model = GradientBoostingClassifier(**self._gbdt_params)
+        self._lr_model = LogisticRegression(**self._lr_params)
+        
+        self._onehot_encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
+        
+    def _load_data(self, path):
+        dataloader = DataLoader(path)
+        self._train_dataset, self._test_dataset = dataloader.split_dataset()
+        
+    def train(self):
+        """模型训练"""
+        print("开始训练GBDT模型...")
+        # 训练GBDT模型
+        self._gbdt_model.fit(self._train_dataset["data"], self._train_dataset["label"])
+        
+        # 获取GBDT的每棵树的分数(决策值)
+        gbdt_train_preds = self._gbdt_model.apply(self._train_dataset["data"])[:, :, 0]  # 仅取每棵树的叶节点输出
+        
+        gbdt_feats_encoded = self._onehot_encoder.fit_transform(gbdt_train_preds)
+        
+        print("开始训练LR模型...")
+        # 使用决策树输出作为LR的输入特征
+        self._lr_model.fit(gbdt_feats_encoded, self._train_dataset["label"])
+        
+    def predict(self, X):
+        # 获取GBDT模型的预测分数
+        gbdt_preds = self._gbdt_model.apply(X)[:, :, 0]
+        
+        gbdt_feats_encoded = self._onehot_encoder.transform(gbdt_preds)
+        
+        # 使用训练好的LR模型输出概率
+        return self._lr_model.predict(gbdt_feats_encoded)
+    
+    def predict_proba(self, X):
+        # 获取GBDT模型的预测分数
+        gbdt_preds = self._gbdt_model.apply(X)[:, :, 0]
+        
+        gbdt_feats_encoded = self._onehot_encoder.transform(gbdt_preds)
+        
+        # 使用训练好的LR模型输出概率
+        return self._lr_model.predict_proba(gbdt_feats_encoded)
+        
+    def evaluate(self):
+        # 对测试集进行预测
+        y_pred = self.predict(self._test_dataset["data"])
+        y_pred_proba = self.predict_proba(self._test_dataset["data"])[:, 1]  # 获取正类的概率
+        
+        # 计算各类评估指标
+        accuracy = accuracy_score(self._test_dataset["label"], y_pred)
+        precision = precision_score(self._test_dataset["label"], y_pred)
+        recall = recall_score(self._test_dataset["label"], y_pred)
+        f1 = f1_score(self._test_dataset["label"], y_pred)
+        roc_auc = roc_auc_score(self._test_dataset["label"], y_pred_proba)    
+        
+        return {
+            'accuracy': accuracy,
+            'precision': precision,
+            'recall': recall,
+            'f1_score': f1,
+            'roc_auc': roc_auc
+        }
+        
+    def save_model(self, model_path):
+        """将模型保存到本地"""
+        models = {"gbdt_model": self._gbdt_model, "lr_model": self._lr_model, "onehot_encoder": self._onehot_encoder}
+        joblib.dump(models, model_path)
+    
+     
+if __name__ == "__main__":
+    gbdt_data_path = "./data/train_data.csv"
+    trainer = Trainer(gbdt_data_path)
+    
+    start_time = time.time()
+    trainer.train()
+    end_time = time.time()
+    
+    training_time_hours = (end_time - start_time) / 3600
+    print(f"训练时间: {training_time_hours:.4f} 小时")
+    
+    eval_metrics = trainer.evaluate()
+    
+    # 输出评估结果
+    print("GBDT-LR Evaluation Metrics:")
+    for metric, value in eval_metrics.items():
+        print(f"{metric}: {value:.4f}")
+        
+    # 保存模型
+    model_path = "./models/rank/weights/model.pkl"
+    trainer.save_model(model_path)
     

+ 136 - 136
models/rank/gbdt_lr_sort.py

@@ -1,137 +1,137 @@
-import joblib
-# from dao import Redis, get_product_by_id, get_custs_by_ids, load_cust_data_from_mysql
-from database import RedisDatabaseHelper, MySqlDao
-from models.rank.data import ProductConfig, CustConfig, ImportanceFeaturesMap
-from models.rank.data.utils import one_hot_embedding, sample_data_clear
-import pandas as pd
-from sklearn.preprocessing import StandardScaler
-import os
-
-
-class GbdtLrModel:
-    def __init__(self, model_path):
-        self.load_model(model_path)
-        self.redis = RedisDatabaseHelper().redis
-        self._mysql_dao = MySqlDao()
-    
-    def load_model(self, model_path):
-        self._modelname = os.path.basename(model_path).split(".")[0]
-        models = joblib.load(model_path)
-        self.gbdt_model, self.lr_model, self.onehot_encoder = models["gbdt_model"], models["lr_model"], models["onehot_encoder"]
-        
-    
-    # def get_recall_list(self, city_uuid, product_id):
-    #     """根据卷烟id获取召回的商铺列表"""
-    #     key = f"fc:{city_uuid}:{product_id}"
-    #     self.recall_cust_list = self.redis.zrange(key, 0, -1, withscores=False)
-    
-    # def load_recall_data(self, city_uuid, product_id):
-    #     self.product_data = self._mysql_dao.get_product_by_id(city_uuid, product_id)[ProductConfig.FEATURE_COLUMNS]
-    #     self.custs_data = self._mysql_dao.get_cust_by_ids(city_uuid, self.recall_cust_list)[CustConfig.FEATURE_COLUMNS]
-        
-    def get_cust_and_product_data(self, city_uuid, product_id):
-        """从商户数据库中获取指定城市所有商户的id"""
-        self.product_data = self._mysql_dao.get_product_by_id(city_uuid, product_id)[ProductConfig.FEATURE_COLUMNS]
-        self.custs_data = self._mysql_dao.load_cust_data(city_uuid)[CustConfig.FEATURE_COLUMNS]
-    
-    def generate_feats_map(self, city_uuid, product_id):
-        """组合卷烟、商户特征矩阵"""
-        # self.get_recall_list(city_uuid, product_id)
-        # self.load_recall_data(city_uuid, product_id)
-        
-        self.get_cust_and_product_data(city_uuid, product_id)
-        # 做数据清洗
-        self.product_data = sample_data_clear(self.product_data, ProductConfig)
-        self.custs_data = sample_data_clear(self.custs_data, CustConfig)
-        
-        # 笛卡尔积联合
-        self.custs_data["descartes"] = 1
-        self.product_data["descartes"] = 1
-        self.feats_map = pd.merge(self.custs_data, self.product_data, on="descartes").drop("descartes", axis=1)
-        self.recall_cust_list = self.feats_map["BB_RETAIL_CUSTOMER_CODE"].to_list()
-        self.feats_map.drop('BB_RETAIL_CUSTOMER_CODE', axis=1, inplace=True)
-        self.feats_map.drop('product_code', axis=1, inplace=True)
-        
-        # onehot编码
-        onehot_feats = {**CustConfig.ONEHOT_CAT, **ProductConfig.ONEHOT_CAT}
-        onehot_columns = list(onehot_feats.keys())
-        numeric_columns = self.feats_map.drop(onehot_columns, axis=1).columns
-        self.feats_map = one_hot_embedding(self.feats_map, onehot_feats)
-        
-        # 数字特征归一化
-        scaler = StandardScaler()
-        self.feats_map[numeric_columns] = scaler.fit_transform(self.feats_map[numeric_columns])
-    
-    def sort(self, city_uuid, product_id):
-        self.generate_feats_map(city_uuid, product_id)
-        
-        gbdt_preds = self.gbdt_model.apply(self.feats_map)[:, :, 0]
-        gbdt_feats_encoded = self.onehot_encoder.transform(gbdt_preds)
-        scores = self.lr_model.predict_proba(gbdt_feats_encoded)[:, 1]
-        
-        self.recommend_list = []
-        for cust_id, score in zip(self.recall_cust_list, scores):
-            self.recommend_list.append({cust_id: float(score)})
-            
-        self.recommend_list = sorted(self.recommend_list, key=lambda x: list(x.values())[0], reverse=True)
-        # for res in self.recommend_list[:200]:
-        #     print(res)
-        return self.recommend_list
-    
-    def generate_feats_importance(self):
-        """生成特征重要性"""
-        # 获取GBDT模型的特征重要性
-        feats_importance = self.gbdt_model.feature_importances_
-        
-        # 获取特征名称
-        feats_names = self.gbdt_model.feature_names_in_
-        
-        importance_dict = dict(zip(feats_names, feats_importance))
-        
-        onehot_feats = {**CustConfig.ONEHOT_CAT, **ProductConfig.ONEHOT_CAT}
-        for feat, categories in onehot_feats.items():
-            related_columns = [col for col in feats_names if col.startswith(feat)]
-            if related_columns:
-                # 合并类别重要性
-                combined_importance = sum(importance_dict[col] for col in related_columns)
-                # 删除onehot类别列
-                for col in related_columns:
-                    del importance_dict[col]
-                # 添加合并后的重要性
-                importance_dict[feat] = combined_importance
-        
-        # 排序
-        sorted_importance = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)
-        
-        # 输出特征重要性
-        cust_features_importance = []
-        product_features_importance = []
-        order_features_importance = []
-        
-        for feat, importance in sorted_importance:
-            if feat in list(ImportanceFeaturesMap.CUSTOM_FEATRUES_MAP.keys()):
-                cust_features_importance.append({ImportanceFeaturesMap.CUSTOM_FEATRUES_MAP[feat]: float(importance)})
-            if feat in list(ImportanceFeaturesMap.PRODUCT_FEATRUES_MAP.keys()):
-                product_features_importance.append({ImportanceFeaturesMap.PRODUCT_FEATRUES_MAP[feat]: float(importance)})
-            if feat in list(ImportanceFeaturesMap.ORDER_FEATURE_MAP.keys()):
-                order_features_importance.append({ImportanceFeaturesMap.ORDER_FEATURE_MAP[feat]: float(importance)})
-                
-            # 零消特征重要性
-            if self._modelname == 'pos_model' and feat in list(ImportanceFeaturesMap.POS_FEATURE_MAP.keys()):
-                order_features_importance.append({ImportanceFeaturesMap.POS_FEATURE_MAP[feat]: float(importance)})
-                
-            # 商圈特征重要性
-            if self._modelname == 'shopping_model' and feat in list(ImportanceFeaturesMap.SHOPING_FEATURES_MAP.keys()):
-                cust_features_importance.append({ImportanceFeaturesMap.SHOPING_FEATURES_MAP[feat]: float(importance)})
-        return cust_features_importance, product_features_importance, order_features_importance
-    
-if __name__ == "__main__":
-    model_path = "./models/rank/weights/00000000000000000000000011445301/shopping_model.pkl"
-    city_uuid = "00000000000000000000000011445301"
-    product_id = "110102"
-    gbdt_sort = GbdtLrModel(model_path)
-    # gbdt_sort.sort(city_uuid, product_id)
-    
-    importances = gbdt_sort.generate_feats_importance()
-    for importance in importances:
+import joblib
+# from dao import Redis, get_product_by_id, get_custs_by_ids, load_cust_data_from_mysql
+from database import RedisDatabaseHelper, MySqlDao
+from models.rank.data import ProductConfig, CustConfig, ImportanceFeaturesMap
+from models.rank.data.utils import one_hot_embedding, sample_data_clear
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+import os
+
+
class GbdtLrModel:
    """Serving-side GBDT+LR ranker.

    Loads a persisted GBDT/LR/one-hot-encoder bundle, scores every
    (customer, product) pair of a city, and reports feature importances.
    """

    def __init__(self, model_path):
        # Load the persisted estimators, then open Redis/MySQL handles.
        self.load_model(model_path)
        self.redis = RedisDatabaseHelper().redis
        self._mysql_dao = MySqlDao()

    def load_model(self, model_path):
        """Load the GBDT, LR and one-hot encoder dumped at training time."""
        # File stem (e.g. "shopping_model") selects model-specific importance maps.
        self._modelname = os.path.basename(model_path).split(".")[0]
        models = joblib.load(model_path)
        self.gbdt_model, self.lr_model, self.onehot_encoder = models["gbdt_model"], models["lr_model"], models["onehot_encoder"]


    # def get_recall_list(self, city_uuid, product_id):
    #     """Fetch the recalled shop list for a product id."""
    #     key = f"fc:{city_uuid}:{product_id}"
    #     self.recall_cust_list = self.redis.zrange(key, 0, -1, withscores=False)

    # def load_recall_data(self, city_uuid, product_id):
    #     self.product_data = self._mysql_dao.get_product_by_id(city_uuid, product_id)[ProductConfig.FEATURE_COLUMNS]
    #     self.custs_data = self._mysql_dao.get_cust_by_ids(city_uuid, self.recall_cust_list)[CustConfig.FEATURE_COLUMNS]

    def get_cust_and_product_data(self, city_uuid, product_id):
        """Load the product's feature row and all customer feature rows for the city."""
        self.product_data = self._mysql_dao.get_product_by_id(city_uuid, product_id)[ProductConfig.FEATURE_COLUMNS]
        self.custs_data = self._mysql_dao.load_cust_data(city_uuid)[CustConfig.FEATURE_COLUMNS]

    def generate_feats_map(self, city_uuid, product_id):
        """Build the combined product x customer feature matrix for scoring."""
        # self.get_recall_list(city_uuid, product_id)
        # self.load_recall_data(city_uuid, product_id)

        self.get_cust_and_product_data(city_uuid, product_id)
        # Clean raw samples according to each config.
        self.product_data = sample_data_clear(self.product_data, ProductConfig)
        self.custs_data = sample_data_clear(self.custs_data, CustConfig)

        # Cartesian join via a constant key: one row per (customer, product) pair.
        self.custs_data["descartes"] = 1
        self.product_data["descartes"] = 1
        self.feats_map = pd.merge(self.custs_data, self.product_data, on="descartes").drop("descartes", axis=1)
        # Keep the customer codes aligned with feature rows, then drop id columns.
        self.recall_cust_list = self.feats_map["BB_RETAIL_CUSTOMER_CODE"].to_list()
        self.feats_map.drop('BB_RETAIL_CUSTOMER_CODE', axis=1, inplace=True)
        self.feats_map.drop('product_code', axis=1, inplace=True)

        # One-hot encode the categorical features declared by the configs.
        onehot_feats = {**CustConfig.ONEHOT_CAT, **ProductConfig.ONEHOT_CAT}
        onehot_columns = list(onehot_feats.keys())
        numeric_columns = self.feats_map.drop(onehot_columns, axis=1).columns
        self.feats_map = one_hot_embedding(self.feats_map, onehot_feats)

        # Standardise numeric features.
        # NOTE(review): the scaler is re-fitted on serving data instead of
        # reusing training statistics — confirm the trainer used the same convention.
        scaler = StandardScaler()
        self.feats_map[numeric_columns] = scaler.fit_transform(self.feats_map[numeric_columns])

    def sort(self, city_uuid, product_id):
        """Score all customers for one product and return them sorted by score desc."""
        self.generate_feats_map(city_uuid, product_id)

        # GBDT leaves -> one-hot -> LR positive-class probability.
        gbdt_preds = self.gbdt_model.apply(self.feats_map)[:, :, 0]
        gbdt_feats_encoded = self.onehot_encoder.transform(gbdt_preds)
        scores = self.lr_model.predict_proba(gbdt_feats_encoded)[:, 1]

        self.recommend_list = []
        for cust_id, score in zip(self.recall_cust_list, scores):
            self.recommend_list.append({cust_id: float(score)})

        self.recommend_list = sorted(self.recommend_list, key=lambda x: list(x.values())[0], reverse=True)
        # for res in self.recommend_list[:200]:
        #     print(res)
        return self.recommend_list

    def generate_feats_importance(self):
        """Aggregate GBDT feature importances back onto the original feature names.

        Returns three lists (customer, product, order importances), each of
        single-entry ``{display_name: importance}`` dicts, sorted descending.
        """
        # Importance per (possibly one-hot expanded) input column.
        feats_importance = self.gbdt_model.feature_importances_

        # Column names as seen by the fitted GBDT model.
        feats_names = self.gbdt_model.feature_names_in_

        importance_dict = dict(zip(feats_names, feats_importance))

        onehot_feats = {**CustConfig.ONEHOT_CAT, **ProductConfig.ONEHOT_CAT}
        for feat, categories in onehot_feats.items():
            # One-hot columns are assumed to be named "<feat>..." — prefix match.
            related_columns = [col for col in feats_names if col.startswith(feat)]
            if related_columns:
                # Sum the per-category importances into one value.
                combined_importance = sum(importance_dict[col] for col in related_columns)
                # Remove the expanded one-hot columns...
                for col in related_columns:
                    del importance_dict[col]
                # ...and record the merged importance under the base feature name.
                importance_dict[feat] = combined_importance

        # Sort by importance, descending.
        sorted_importance = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)

        # Bucket the importances by feature family via the display-name maps.
        cust_features_importance = []
        product_features_importance = []
        order_features_importance = []

        for feat, importance in sorted_importance:
            if feat in list(ImportanceFeaturesMap.CUSTOM_FEATRUES_MAP.keys()):
                cust_features_importance.append({ImportanceFeaturesMap.CUSTOM_FEATRUES_MAP[feat]: float(importance)})
            if feat in list(ImportanceFeaturesMap.PRODUCT_FEATRUES_MAP.keys()):
                product_features_importance.append({ImportanceFeaturesMap.PRODUCT_FEATRUES_MAP[feat]: float(importance)})
            if feat in list(ImportanceFeaturesMap.ORDER_FEATURE_MAP.keys()):
                order_features_importance.append({ImportanceFeaturesMap.ORDER_FEATURE_MAP[feat]: float(importance)})

            # POS (retail checkout) features only apply to the POS model.
            if self._modelname == 'pos_model' and feat in list(ImportanceFeaturesMap.POS_FEATURE_MAP.keys()):
                order_features_importance.append({ImportanceFeaturesMap.POS_FEATURE_MAP[feat]: float(importance)})

            # Shopping-district features only apply to the shopping model.
            if self._modelname == 'shopping_model' and feat in list(ImportanceFeaturesMap.SHOPING_FEATURES_MAP.keys()):
                cust_features_importance.append({ImportanceFeaturesMap.SHOPING_FEATURES_MAP[feat]: float(importance)})
        return cust_features_importance, product_features_importance, order_features_importance
+    
if __name__ == "__main__":
    model_path = "./models/rank/weights/00000000000000000000000011445301/shopping_model.pkl"
    city_uuid = "00000000000000000000000011445301"
    product_id = "110102"

    ranker = GbdtLrModel(model_path)
    # ranker.sort(city_uuid, product_id)

    # generate_feats_importance returns (customer, product, order) lists.
    for importance_group in ranker.generate_feats_importance():
        print(importance_group)

+ 77 - 77
models/recall/hot_recall.py

@@ -1,77 +1,77 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-'''
-@filename     : hot_recall.py
-@description     : 热度召回算法   
-@time     : 2025/01/21/00
-@author     : Sherlock1011 & Min1027
-@Version     : 1.0
-'''
-import pandas as pd
-from database import RedisDatabaseHelper
-from tqdm import tqdm
-
-class HotRecallModel:
-    def __init__(self, order_data):
-        self._redis_db = RedisDatabaseHelper()
-        self._hotkeys = self.get_hotkeys()
-        self._order_data = order_data
-
-
-    def get_hotkeys(self):
-        info = self._redis_db.redis.zrange("configs:hotkeys", 0, -1, withscores=True)
-        hotkeys = []
-        for item, _ in info:
-            hotkeys.append(item)
-        return hotkeys
-        
-    def _calculate_hot_score(self, hot_name):
-        """
-        根据热度指标计算热度得分
-        :param hot_name: 热度指标A
-        :type param: string
-        :return: 所有热度指标的得分
-        :rtype: list
-        """
-        results = self._order_data.groupby("BB_RETAIL_CUSTOMER_CODE")[hot_name].mean().reset_index()
-        sorted_results = results.sort_values(by=hot_name, ascending=False).reset_index(drop=True)
-        item_hot_score = []
-        # mock热度召回最大分数
-        max_score = 1.0
-        total_score = sorted_results.loc[0, hot_name] / max_score
-        for row in sorted_results.itertuples(index=True, name="Row"):
-            item = {row[1]:(row[2]/total_score)*100}
-            item_hot_score.append(item)
-        return {"key":f"{hot_name}", "value":item_hot_score}
-
-    def calculate_all_hot_score(self, city_uuid):
-        """
-        计算所有的热度指标得分
-        """
-        # hot_datas = []
-        for hotkey_name in tqdm(self._hotkeys, desc="hot_recall:正在计算热度分数"):
-            self.to_redis(self._calculate_hot_score(hotkey_name), city_uuid)
-
-    def to_redis(self, rec_content_score, city_uuid):
-        hotkey_name = rec_content_score["key"]
-        rec_item_id = f"hot:{city_uuid}:{str(hotkey_name)}" # 修正 rec_item_id 拼接方式
-        print("自动清除历史id前数量", self._redis_db.redis.zcard(rec_item_id))
-        # 清空 sorted set 数据,确保不会影响后续的存储
-        self._redis_db.redis.delete(rec_item_id)
-        print("自动清除历史id后数量", self._redis_db.redis.zcard(rec_item_id))
-         
-        res = {}
-
-        for item in rec_content_score["value"]:  
-            for content, score in item.items():  # item 形如 {A001: 75.0}
-                res[content] = float(score)  # 确保 score 是 float 类型
-
-        if res:  # 只有当 res 不为空时才执行 zadd
-            self._redis_db.redis.zadd(rec_item_id, res)
-
-
-if __name__ == "__main__":
-    # 序列化
-    model = HotRecallModel()
-    model.calculate_all_hot_score()
-    # joblib.dump(model, "hot_recall.model")
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@filename     : hot_recall.py
+@description     : 热度召回算法   
+@time     : 2025/01/21/00
+@author     : Sherlock1011 & Min1027
+@Version     : 1.0
+'''
+import pandas as pd
+from database import RedisDatabaseHelper
+from tqdm import tqdm
+
class HotRecallModel:
    """Popularity ("hot") recall: scores shops per hot metric and stores them in Redis.

    Fixes over the previous version: an empty order frame no longer raises
    (``.loc[0]`` on an empty DataFrame), and a zero top score no longer
    divides by zero / produces inf scores.
    """

    def __init__(self, order_data):
        # order_data: per-order rows keyed by BB_RETAIL_CUSTOMER_CODE with one
        # numeric column per configured hot metric.
        self._redis_db = RedisDatabaseHelper()
        self._hotkeys = self.get_hotkeys()
        self._order_data = order_data

    def get_hotkeys(self):
        """Read the configured hot-metric names from the Redis sorted set."""
        info = self._redis_db.redis.zrange("configs:hotkeys", 0, -1, withscores=True)
        return [item for item, _ in info]

    def _calculate_hot_score(self, hot_name):
        """
        Compute a 0-100 hot score per shop for one hot metric.

        :param hot_name: column name of the hot metric in the order data
        :type hot_name: str
        :return: {"key": hot_name, "value": [{shop_code: score}, ...]} sorted desc
        :rtype: dict
        """
        results = self._order_data.groupby("BB_RETAIL_CUSTOMER_CODE")[hot_name].mean().reset_index()
        sorted_results = results.sort_values(by=hot_name, ascending=False).reset_index(drop=True)
        item_hot_score = []
        if sorted_results.empty:
            # No orders: nothing to score (previously crashed on .loc[0]).
            return {"key": f"{hot_name}", "value": item_hot_score}
        # mock maximum hot-recall score
        max_score = 1.0
        # Normalise against the best shop so the top entry scores 100.
        total_score = sorted_results.loc[0, hot_name] / max_score
        for row in sorted_results.itertuples(index=True, name="Row"):
            # row[1] is the shop code, row[2] the mean metric value.
            if total_score:
                score = (row[2] / total_score) * 100
            else:
                # Top mean is 0 — avoid a division by zero; every score is 0.
                score = 0.0
            item_hot_score.append({row[1]: score})
        return {"key": f"{hot_name}", "value": item_hot_score}

    def calculate_all_hot_score(self, city_uuid):
        """Compute and persist the scores for every configured hot metric."""
        for hotkey_name in tqdm(self._hotkeys, desc="hot_recall:正在计算热度分数"):
            self.to_redis(self._calculate_hot_score(hotkey_name), city_uuid)

    def to_redis(self, rec_content_score, city_uuid):
        """Replace the sorted set ``hot:{city}:{metric}`` with the new scores."""
        hotkey_name = rec_content_score["key"]
        rec_item_id = f"hot:{city_uuid}:{str(hotkey_name)}"
        print("自动清除历史id前数量", self._redis_db.redis.zcard(rec_item_id))
        # Drop the previous snapshot so stale shops do not linger in the set.
        self._redis_db.redis.delete(rec_item_id)
        print("自动清除历史id后数量", self._redis_db.redis.zcard(rec_item_id))

        res = {}
        for item in rec_content_score["value"]:
            for content, score in item.items():  # item looks like {"A001": 75.0}
                res[content] = float(score)  # make sure the score is a float

        if res:  # zadd rejects an empty mapping
            self._redis_db.redis.zadd(rec_item_id, res)
+
+
if __name__ == "__main__":
    # Fix: the previous demo called HotRecallModel() and
    # calculate_all_hot_score() without their required arguments
    # (order_data, city_uuid), which raised TypeError before doing anything.
    import sys

    order_data_path = sys.argv[1] if len(sys.argv) > 1 else "./data/order_data.csv"
    city_uuid = sys.argv[2] if len(sys.argv) > 2 else ""
    model = HotRecallModel(pd.read_csv(order_data_path))
    model.calculate_all_hot_score(city_uuid)
    # joblib.dump(model, "hot_recall.model")

+ 119 - 119
models/recall/item2vec.py

@@ -1,120 +1,120 @@
-#!/usr/bin/env python3
-# -*- coding:utf-8 -*-
-import gensim
-from dao.mysql_client import Mysql
-
-class Item2Vec(object):
-    def __init__(self):
-        mysql_client = Mysql()
-        # 创建会话
-        self.session = mysql_client.create_session()
-def load_item_sequences_from_mysql():
-    try:
-        conn = mysql.connector.connect(
-            host='localhost',
-            user='your_username',
-            password='your_password',
-            database='your_database'
-        )
-        cursor = conn.cursor()
-        query = "SELECT user_id, sequence FROM item_sequences"
-        cursor.execute(query)
-        for row in cursor:
-            user_id, sequence_str = row
-            sequence = sequence_str.split(',')
-            yield user_id, sequence
-        cursor.close()
-        conn.close()
-    except mysql.connector.Error as err:
-        print(f"数据库连接或查询出错: {err}")
-
-
-def load_item_attributes_from_mysql():
-    try:
-        conn = mysql.connector.connect(
-            host='localhost',
-            user='your_username',
-            password='your_password',
-            database='your_database'
-        )
-        cursor = conn.cursor()
-        query = "SELECT item, attributes FROM item_attributes"
-        cursor.execute(query)
-        item_attributes = {}
-        for item, attributes_str in cursor:
-            attributes = attributes_str.split(',')
-            item_attributes[item] = attributes
-        cursor.close()
-        conn.close()
-        return item_attributes
-    except mysql.connector.Error as err:
-        print(f"数据库连接或查询出错: {err}")
-
-
-def load_user_attributes_from_mysql():
-    try:
-        conn = mysql.connector.connect(
-            host='localhost',
-            user='your_username',
-            password='your_password',
-            database='your_database'
-        )
-        cursor = conn.cursor()
-        query = "SELECT user_id, taste, cigarette_length, cigarette_type, packaging_color FROM user_attributes"
-        cursor.execute(query)
-        for row in cursor:
-            user_id, taste, cigarette_length, cigarette_type, packaging_color = row
-            user_attrs = [attr for attr in [taste, cigarette_length, cigarette_type, packaging_color] if attr]
-            yield user_id, user_attrs
-        cursor.close()
-        conn.close()
-    except mysql.connector.Error as err:
-        print(f"数据库连接或查询出错: {err}")
-
-
-def combine_user_item_attributes(item_sequences, item_attributes):
-    user_attributes = {user_id: attrs for user_id, attrs in load_user_attributes_from_mysql()}
-    for user_id, sequence in item_sequences:
-        user_attrs = user_attributes.get(user_id, [])
-        combined_sequence = user_attrs.copy()
-        for item in sequence:
-            combined_sequence.append(item)
-            combined_sequence.extend(item_attributes.get(item, []))
-        yield combined_sequence
-
-
-def train_item2vec(item_sequences, vector_size=100, window=5, min_count=10, workers=4):
-    model = gensim.models.Word2Vec(sentences=item_sequences, vector_size=vector_size, window=window,
-                                   min_count=min_count, workers=workers)
-    return model
-
-
-def get_item_vector(item, model):
-    try:
-        return model.wv[item]
-    except KeyError:
-        print(f"物品 {item} 未在模型中找到。")
-        return None
-
-
-def find_similar_items(item, model, topn=5):
-    try:
-        similar_items = model.wv.most_similar(item, topn=topn)
-        filtered_similar_items = [(item, score) for item, score in similar_items if not item.startswith(('attr', 'user_'))]
-        return filtered_similar_items
-    except KeyError:
-        print(f"物品 {item} 未在模型中找到。")
-        return None
-
-
-if __name__ == "__main__":
-    item_sequences = load_item_sequences_from_mysql()
-    item_attributes = load_item_attributes_from_mysql()
-    combined_sequences = combine_user_item_attributes(item_sequences, item_attributes)
-    item2vec_model = train_item2vec(combined_sequences)
-    item_vector = get_item_vector('item1', item2vec_model)
-    if item_vector is not None:
-        print(f"物品 'item1' 的向量表示: {item_vector}")
-    similar_items = find_similar_items('item1', item2vec_model, topn=3)
-    if similar_items is not None:
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+import gensim
+from dao.mysql_client import Mysql
+
class Item2Vec(object):
    """Holds a MySQL session for item2vec data access."""

    def __init__(self):
        # Open a session against the shared MySQL client.
        self.session = Mysql().create_session()
def load_item_sequences_from_mysql():
    """Yield ``(user_id, item_sequence)`` rows from the ``item_sequences`` table.

    Sequences are stored as comma-separated item codes. On a database error
    the generator logs and stops (best-effort, as before).
    """
    # Fix: this module never imported mysql.connector, so every call raised
    # NameError; import lazily to keep module import cheap.
    import mysql.connector

    conn = None
    try:
        conn = mysql.connector.connect(
            host='localhost',
            user='your_username',
            password='your_password',
            database='your_database'
        )
        cursor = conn.cursor()
        cursor.execute("SELECT user_id, sequence FROM item_sequences")
        for user_id, sequence_str in cursor:
            yield user_id, sequence_str.split(',')
        cursor.close()
    except mysql.connector.Error as err:
        print(f"数据库连接或查询出错: {err}")
    finally:
        # Fix: the connection leaked when the query raised mid-iteration.
        if conn is not None:
            conn.close()
+
+
def load_item_attributes_from_mysql():
    """Return ``{item: [attributes]}`` read from the ``item_attributes`` table.

    Attributes are stored comma-separated. Returns an empty dict on a
    database error — previously the error path returned None, which crashed
    callers that do ``.get()`` on the result.
    """
    # Fix: this module never imported mysql.connector (NameError on call).
    import mysql.connector

    item_attributes = {}
    conn = None
    try:
        conn = mysql.connector.connect(
            host='localhost',
            user='your_username',
            password='your_password',
            database='your_database'
        )
        cursor = conn.cursor()
        cursor.execute("SELECT item, attributes FROM item_attributes")
        for item, attributes_str in cursor:
            item_attributes[item] = attributes_str.split(',')
        cursor.close()
    except mysql.connector.Error as err:
        print(f"数据库连接或查询出错: {err}")
    finally:
        # Fix: close the connection even when the query raises.
        if conn is not None:
            conn.close()
    return item_attributes
+
+
def load_user_attributes_from_mysql():
    """Yield ``(user_id, [non-empty attrs])`` from the ``user_attributes`` table.

    Empty/NULL attribute columns are dropped. On a database error the
    generator logs and stops (best-effort, as before).
    """
    # Fix: this module never imported mysql.connector (NameError on call).
    import mysql.connector

    conn = None
    try:
        conn = mysql.connector.connect(
            host='localhost',
            user='your_username',
            password='your_password',
            database='your_database'
        )
        cursor = conn.cursor()
        cursor.execute(
            "SELECT user_id, taste, cigarette_length, cigarette_type, packaging_color FROM user_attributes"
        )
        for user_id, taste, cigarette_length, cigarette_type, packaging_color in cursor:
            user_attrs = [attr for attr in (taste, cigarette_length, cigarette_type, packaging_color) if attr]
            yield user_id, user_attrs
        cursor.close()
    except mysql.connector.Error as err:
        print(f"数据库连接或查询出错: {err}")
    finally:
        # Fix: the connection leaked when the query raised mid-iteration.
        if conn is not None:
            conn.close()
+
+
def combine_user_item_attributes(item_sequences, item_attributes):
    """Yield training sentences: user attrs first, then each item followed by its attrs."""
    user_attributes = dict(load_user_attributes_from_mysql())
    for user_id, sequence in item_sequences:
        # Start from a copy so the cached attribute list is never mutated.
        combined = list(user_attributes.get(user_id, []))
        for item in sequence:
            combined.append(item)
            combined.extend(item_attributes.get(item, []))
        yield combined
+
+
def train_item2vec(item_sequences, vector_size=100, window=5, min_count=10, workers=4):
    """Train and return a Word2Vec model over the combined attribute/item sequences."""
    return gensim.models.Word2Vec(
        sentences=item_sequences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )
+
+
def get_item_vector(item, model):
    """Return the embedding vector for *item*, or None when it is out of vocabulary."""
    try:
        vector = model.wv[item]
    except KeyError:
        print(f"物品 {item} 未在模型中找到。")
        return None
    return vector
+
+
def find_similar_items(item, model, topn=5):
    """Return up to *topn* ``(item, score)`` neighbours of *item*, dropping
    attribute and user pseudo-tokens; None when *item* is unknown to the model."""
    try:
        neighbours = model.wv.most_similar(item, topn=topn)
    except KeyError:
        print(f"物品 {item} 未在模型中找到。")
        return None
    # Keep only real item tokens (attributes/users were mixed into training).
    return [
        (candidate, score)
        for candidate, score in neighbours
        if not candidate.startswith(('attr', 'user_'))
    ]
+
+
if __name__ == "__main__":
    # Build the mixed user/item/attribute training corpus and fit item2vec.
    combined_sequences = combine_user_item_attributes(
        load_item_sequences_from_mysql(),
        load_item_attributes_from_mysql(),
    )
    item2vec_model = train_item2vec(combined_sequences)

    item_vector = get_item_vector('item1', item2vec_model)
    if item_vector is not None:
        print(f"物品 'item1' 的向量表示: {item_vector}")

    similar_items = find_similar_items('item1', item2vec_model, topn=3)
    if similar_items is not None:
        print(f"与物品 'item1' 最相似的 3 个物品: {similar_items}")

+ 102 - 102
models/recall/itemCF/ItemCF.py

@@ -1,103 +1,103 @@
-from database import RedisDatabaseHelper
-import pandas as pd
-import numpy as np
-from tqdm import tqdm
-from scipy.sparse import csr_matrix
-from joblib import Parallel, delayed
-import joblib
-
-class ItemCFModel:
-    def __init__(self):
-        self._recommendations = {}
-        
-    def train(self, score_path, similatity_path, city_uuid, n=100, k=10, top_n=100, n_jobs=4):
-        self._score_df = pd.read_csv(score_path)
-        self._similarity_df = pd.read_csv(similatity_path, index_col=0)
-        self._similarity_matrix = csr_matrix(self._similarity_df.values)
-        self._shop_index = {shop: idx for idx, shop in enumerate(self._similarity_df.index)}
-        self._index_shop = {idx: shop for idx, shop in enumerate(self._similarity_df.index)}
-        
-        def process_product(product_code, scores):
-            # 获取热度最高的n个商户
-            top_n_shops = scores.nlargest(n, "SCORE")["BB_RETAIL_CUSTOMER_CODE"].values
-            top_n_indices = [self._shop_index[shop] for shop in top_n_shops]
-            
-            # 找到每个商户最相似的k个商户
-            similar_shops = {}
-            for shop_idx in top_n_indices:
-                similarities = self._similarity_matrix[shop_idx].toarray().flatten()
-                similar_indices = np.argpartition(similarities, -k-1)[-k-1:]
-                similar_indices = similar_indices[similar_indices != shop_idx][:k]
-                similar_shops[self._index_shop[shop_idx]] = [self._index_shop[idx] for idx in similar_indices]
-            
-            # 生成候选商户列表
-            candidate_shops = list(set([m for sublist in similar_shops.values() for m in sublist]))
-            candidate_indices = [self._shop_index[shop] for shop in candidate_shops]
-            
-            # 计算每个候选商户的兴趣得分
-            interest_scores = {}
-            for candidate_idx in candidate_indices:
-                interest_score = 0
-                for shop_idx in top_n_indices:
-                    if self._index_shop[candidate_idx] in similar_shops[self._index_shop[shop_idx]]:
-                        shop_score = scores[scores["BB_RETAIL_CUSTOMER_CODE"]==self._index_shop[shop_idx]]["SCORE"].values[0]
-                        interest_score += shop_score * self._similarity_matrix[shop_idx, candidate_idx]
-                interest_scores[self._index_shop[candidate_idx]] = interest_score
-            
-            # 将候选商户的兴趣得分转换为字典列表,并按照从大到小排列
-            sorted_candidates = sorted([{shop_id: s} for shop_id, s in interest_scores.items()],
-                                       key=lambda x: list(x.values())[0], reverse=True)[:top_n]
-            
-            return product_code, sorted_candidates
-        
-        # 并行处理每个品规
-        results = Parallel(n_jobs=n_jobs)(delayed(process_product)(product_code, scores) 
-                                          for product_code, scores in tqdm(self._score_df.groupby("PRODUCT_CODE"), desc="train:正在计算候选得分"))
-        print(len(results))
-        # 存储结果
-        self._recommendations = {product_code: sorted_candidates for product_code, sorted_candidates in results}
-        self.to_redis_zset(city_uuid)
-    
-    def to_redis_zset(self, city_uuid):
-        """
-        将 self._recommendations 中的数据保存到 Redis 的 Sorted Set (ZSET) 中
-        存储格式为 fc:product_code,其中商户 ID 作为成员,得分作为分数
-        """
-        redis_db = RedisDatabaseHelper()
-        
-        # 存redis之前,先进行删除操作
-        pattern = f"fc:{city_uuid}:*"
-        keys_to_delete = redis_db.redis.keys(pattern)
-        if keys_to_delete:
-            redis_db.redis.delete(*keys_to_delete)
-            
-        for product_code, recommendations in tqdm(self._recommendations.items(), desc="train:正在存储推荐结果"):
-            redis_key = f"fc:{city_uuid}:{product_code}"
-            zset_data = {}
-            for rec in recommendations:
-                for shop_id, score in rec.items():
-                    try:
-                        zset_data[shop_id] = float(score)
-                    except ValueError as e:
-                        print(f"Error converting score to float for shop_id {shop_id}: {score}")
-                        raise e
-            
-            redis_db.redis.zadd(redis_key, zset_data)
-    
-if __name__ == "__main__":
-    score_path = "./models/recall/itemCF/matrix/score.csv"
-    similarity_path = "./models/recall/itemCF/matrix/similarity.csv"
-    # itemcf_model = ItemCFModel()
-    # itemcf_model.train(score_path, similarity_path, n_jobs=4)
-    # recommend_list = itemcf_model.inference(110111)
-    # itemcf_model.to_redis_zset()
-    # print(len(recommend_list))
-    # print(recommend_list)
-    # joblib.dump(itemcf_model, "itemCF.model")
-    
-    # model = joblib.load("./itemCF.model")
-    # recommend_list = model.inference(110102)
-    # print(len(recommend_list))
-    # print(recommend_list)
-    data = pd.read_csv(similarity_path, index_col=0)
+from database import RedisDatabaseHelper
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+from scipy.sparse import csr_matrix
+from joblib import Parallel, delayed
+import joblib
+
class ItemCFModel:
    """Item-based collaborative filtering over a shop-shop similarity matrix.

    For each product (the "user" in CF terms), candidate shops are scored by
    propagating the product's top-n shop scores through each shop's k nearest
    neighbours, then the top_n candidates per product are written to Redis.
    """

    def __init__(self):
        # product_code -> [{shop_id: interest_score}, ...] sorted descending.
        self._recommendations = {}
        
    def train(self, score_path, similatity_path, city_uuid, n=100, k=10, top_n=100, n_jobs=4):
        """Compute recommendations for every product and persist them to Redis.

        Args:
            score_path: CSV with PRODUCT_CODE, BB_RETAIL_CUSTOMER_CODE, SCORE.
            similatity_path: CSV of the square shop-shop similarity matrix
                (index and columns are shop codes).
                NOTE(review): parameter name typo ("similatity") kept for
                caller compatibility.
            city_uuid: prefix for the Redis keys written by to_redis_zset.
            n: number of top-scoring shops considered per product.
            k: neighbours taken per top shop.
            top_n: number of candidates persisted per product.
            n_jobs: joblib parallelism for the per-product computation.
        """
        self._score_df = pd.read_csv(score_path)
        self._similarity_df = pd.read_csv(similatity_path, index_col=0)
        # Sparse view of the similarity matrix for cheap row slicing.
        self._similarity_matrix = csr_matrix(self._similarity_df.values)
        self._shop_index = {shop: idx for idx, shop in enumerate(self._similarity_df.index)}
        self._index_shop = {idx: shop for idx, shop in enumerate(self._similarity_df.index)}
        
        def process_product(product_code, scores):
            # Take the n shops with the highest score for this product.
            top_n_shops = scores.nlargest(n, "SCORE")["BB_RETAIL_CUSTOMER_CODE"].values
            top_n_indices = [self._shop_index[shop] for shop in top_n_shops]
            
            # For each of those shops, find its k most similar shops.
            similar_shops = {}
            for shop_idx in top_n_indices:
                similarities = self._similarity_matrix[shop_idx].toarray().flatten()
                # argpartition yields the k+1 largest entries (unsorted); the
                # extra slot allows dropping the shop itself below.
                similar_indices = np.argpartition(similarities, -k-1)[-k-1:]
                # NOTE(review): if shop_idx is NOT among the k+1 selected, the
                # filter removes nothing and [:k] drops one of k+1 unsorted
                # candidates arbitrarily — confirm this is intended.
                similar_indices = similar_indices[similar_indices != shop_idx][:k]
                similar_shops[self._index_shop[shop_idx]] = [self._index_shop[idx] for idx in similar_indices]
            
            # The union of all neighbour lists forms the candidate shops.
            candidate_shops = list(set([m for sublist in similar_shops.values() for m in sublist]))
            candidate_indices = [self._shop_index[shop] for shop in candidate_shops]
            
            # Interest score of a candidate: sum of (source shop score *
            # similarity) over every top shop whose neighbour list contains it.
            interest_scores = {}
            for candidate_idx in candidate_indices:
                interest_score = 0
                for shop_idx in top_n_indices:
                    if self._index_shop[candidate_idx] in similar_shops[self._index_shop[shop_idx]]:
                        shop_score = scores[scores["BB_RETAIL_CUSTOMER_CODE"]==self._index_shop[shop_idx]]["SCORE"].values[0]
                        interest_score += shop_score * self._similarity_matrix[shop_idx, candidate_idx]
                interest_scores[self._index_shop[candidate_idx]] = interest_score
            
            # Sort candidates by interest score, descending, and keep top_n.
            sorted_candidates = sorted([{shop_id: s} for shop_id, s in interest_scores.items()],
                                       key=lambda x: list(x.values())[0], reverse=True)[:top_n]
            
            return product_code, sorted_candidates
        
        # Process every product in parallel.
        results = Parallel(n_jobs=n_jobs)(delayed(process_product)(product_code, scores) 
                                          for product_code, scores in tqdm(self._score_df.groupby("PRODUCT_CODE"), desc="train:正在计算候选得分"))
        print(len(results))
        # Store the per-product results, then persist them to Redis.
        self._recommendations = {product_code: sorted_candidates for product_code, sorted_candidates in results}
        self.to_redis_zset(city_uuid)
    
    def to_redis_zset(self, city_uuid):
        """Persist self._recommendations to Redis sorted sets (ZSET).

        Keys are "fc:{city_uuid}:{product_code}"; members are shop IDs with
        the interest score as the ZSET score. Existing keys for the city are
        deleted first so stale products do not linger.
        """
        redis_db = RedisDatabaseHelper()
        
        # Delete any stale keys for this city before writing new results.
        pattern = f"fc:{city_uuid}:*"
        keys_to_delete = redis_db.redis.keys(pattern)
        if keys_to_delete:
            redis_db.redis.delete(*keys_to_delete)
            
        for product_code, recommendations in tqdm(self._recommendations.items(), desc="train:正在存储推荐结果"):
            redis_key = f"fc:{city_uuid}:{product_code}"
            zset_data = {}
            for rec in recommendations:
                for shop_id, score in rec.items():
                    try:
                        zset_data[shop_id] = float(score)
                    except ValueError as e:
                        print(f"Error converting score to float for shop_id {shop_id}: {score}")
                        raise e
            
            redis_db.redis.zadd(redis_key, zset_data)
+    
if __name__ == "__main__":
    # Ad-hoc sanity check: load the precomputed similarity matrix and print
    # it. (Training/inference runs are driven from app.py, not from here.)
    score_path = "./models/recall/itemCF/matrix/score.csv"
    similarity_path = "./models/recall/itemCF/matrix/similarity.csv"
    data = pd.read_csv(similarity_path, index_col=0)
    print(data)

+ 78 - 78
models/recall/itemCF/calculate_similarity_matrix.py

@@ -1,79 +1,79 @@
-from database import MySqlDao
-import pandas as pd
-import numpy as np
-
-from itertools import combinations
-from tqdm import tqdm
-
-dao = MySqlDao()
-def build_co_occurence_matrix(order_data):
-    """
-    构建商户共现矩阵
-    """
-    # 获取所有商户的唯一列表
-    shops = order_data["BB_RETAIL_CUSTOMER_CODE"].unique()
-    num_shops = len(shops)
-    
-    # 创建商户到索引的映射
-    shops_to_index = {shop: idx for idx, shop in enumerate(shops)}
-    # 初始化共现矩阵(上三角部分)
-    co_occurrence_matrix = np.zeros((num_shops, num_shops), dtype=int)
-    
-    # 按照品规分组
-    grouped = order_data.groupby("PRODUCT_CODE")["BB_RETAIL_CUSTOMER_CODE"].apply(list)
-    
-    # 遍历每个品规的商户列表
-    for shop_in_product in grouped:
-        # 生成商户对
-        shop_pairs = combinations(shop_in_product, 2)
-        for shop1, shop2 in shop_pairs:
-            # 获取商户索引
-            idx1 = shops_to_index[shop1]
-            idx2 = shops_to_index[shop2]
-            # 更新共现矩阵
-            co_occurrence_matrix[idx1, idx2] += 1
-            co_occurrence_matrix[idx2, idx1] += 1
-    return co_occurrence_matrix, shops, shops_to_index
-
-def calculate_similarity_matrix(co_occurrence_matrix, order_data, shops_to_index):
-    """
-    使用向量计算商铺之间的相似度矩阵
-    """
-    # 计算每个商铺售卖品规的总次数
-    shop_counts = order_data.groupby("BB_RETAIL_CUSTOMER_CODE").size()
-    
-    # 将商户售卖次数转换为数组
-    counts = np.array([shop_counts[shop] for shop in shops_to_index.keys()])
-    
-    # 计算分母部分 (sqrt(count_i * count_j))
-    denominator = np.sqrt(np.outer(counts, counts))
-    
-    # 计算相似度矩阵
-    similarity_matrix = co_occurrence_matrix / denominator
-    
-    # 将对角线设置为1
-    np.fill_diagonal(similarity_matrix, 1.0)
-    
-    return similarity_matrix
-
-def save_matrix(matrix, shops, save_path):
-    """
-    保存共现矩阵
-    """
-    matrix_df = pd.DataFrame(matrix, index=shops, columns=shops)
-    matrix_df.to_csv(save_path, index=True, encoding="utf-8")
-    
-def calculate_similarity_and_save_results(order_data, similarity_matrix_save_path):
-    co_occurrence_matrix, shops, shops_to_index = build_co_occurence_matrix(order_data)
-    similarity_matrix = calculate_similarity_matrix(co_occurrence_matrix, order_data, shops_to_index)
-    save_matrix(similarity_matrix, shops, similarity_matrix_save_path)
-    
-if __name__ == "__main__":
-    co_occurrence_save_path = "./models/recall/itemCF/matrix/occurrence.csv"
-    similarity_matrix_save_path = "./models/recall/itemCF/matrix/similarity.csv"
-    # 从数据库中读取订单数据
-    order_data = dao.load_order_data()
-    
-    calculate_similarity_and_save_results(order_data, similarity_matrix_save_path)
-    
+from database import MySqlDao
+import pandas as pd
+import numpy as np
+
+from itertools import combinations
+from tqdm import tqdm
+
# Module-level DAO: one MySQL connection shared by the script entry point.
dao = MySqlDao()
def build_co_occurence_matrix(order_data):
    """Build the symmetric shop co-occurrence matrix.

    Two shops co-occur once for every product under which they both appear
    in order_data (duplicate (product, shop) rows therefore inflate counts).

    Returns:
        (matrix, shops, shops_to_index) where *shops* preserves first-seen
        order and matrix[i, j] counts co-occurrences of shops[i], shops[j].
    """
    shops = order_data["BB_RETAIL_CUSTOMER_CODE"].unique()
    shops_to_index = {shop: pos for pos, shop in enumerate(shops)}
    matrix = np.zeros((len(shops), len(shops)), dtype=int)

    # Shop lists grouped by the product they were sold under.
    per_product = order_data.groupby("PRODUCT_CODE")["BB_RETAIL_CUSTOMER_CODE"].apply(list)

    for shop_list in per_product:
        # Count every unordered pair of shops sharing this product.
        for left, right in combinations(shop_list, 2):
            i, j = shops_to_index[left], shops_to_index[right]
            matrix[i, j] += 1
            matrix[j, i] += 1  # keep the matrix symmetric
    return matrix, shops, shops_to_index
+
def calculate_similarity_matrix(co_occurrence_matrix, order_data, shops_to_index):
    """Normalize the co-occurrence counts into cosine-style similarities.

    similarity[i, j] = cooc[i, j] / sqrt(count_i * count_j), where count_x is
    the number of order rows for shop x. The diagonal is forced to 1.
    """
    # Rows per shop, aligned with the matrix row order.
    row_counts = order_data.groupby("BB_RETAIL_CUSTOMER_CODE").size()
    counts = np.array([row_counts[shop] for shop in shops_to_index.keys()])

    # Outer product gives sqrt(count_i * count_j) for every (i, j) pair.
    scale = np.sqrt(np.outer(counts, counts))
    similarity_matrix = co_occurrence_matrix / scale

    np.fill_diagonal(similarity_matrix, 1.0)
    return similarity_matrix
+
def save_matrix(matrix, shops, save_path):
    """Write *matrix* to CSV with the shop codes as both index and columns."""
    frame = pd.DataFrame(matrix, index=shops, columns=shops)
    frame.to_csv(save_path, index=True, encoding="utf-8")
+    
def calculate_similarity_and_save_results(order_data, similarity_matrix_save_path):
    """End-to-end helper: co-occurrence -> similarity -> CSV on disk."""
    cooc, shops, index_map = build_co_occurence_matrix(order_data)
    sim = calculate_similarity_matrix(cooc, order_data, index_map)
    save_matrix(sim, shops, similarity_matrix_save_path)
+    
if __name__ == "__main__":
    co_occurrence_save_path = "./models/recall/itemCF/matrix/occurrence.csv"
    similarity_matrix_save_path = "./models/recall/itemCF/matrix/similarity.csv"
    # Pull raw order rows from MySQL, then build and persist the shop-shop
    # similarity matrix for the ItemCF step.
    order_data = dao.load_order_data()
    calculate_similarity_and_save_results(order_data, similarity_matrix_save_path)
+    
     

+ 81 - 81
models/recall/itemCF/user_item_score.py

@@ -1,82 +1,82 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-'''
-@filename     : ShopScore.py
-@description     : 品规-商户-评分矩阵:品规(用户)对商铺(物品)的评分矩阵,将结果保存在score.csv文件中
-@time     : 2025/01/31/02
-@author     : Sherlock1011 & Min1027
-@Version     : 1.0
-'''
-
-
-from database import MySqlDao
-from decimal import Decimal
-
-# 算法封装成一个类
-class UserItemScore:
-    """TODO 1. 将结果保存到redis数据库中"""
-    def __init__(self):
-        self.weights = {
-            "MONTH6_SALE_QTY": Decimal(0.1),
-            "MONTH6_SALE_AMT": Decimal(0.1),
-            "MONTH6_GROSS_PROFIT_RATE": Decimal(0.03),
-            "MONTH6_SALE_QTY_YOY": Decimal(0.1),
-            "MONTH6_SALE_QTY_MOM": Decimal(0.1),
-            "MONTH6_SALE_AMT_YOY": Decimal(0.1),
-            "MONTH6_SALE_AMT_MOM": Decimal(0.1),
-            "ORDER_FULLORDR_RATE": Decimal(0.1),
-            "CUSTOMER_REPURCHASE_RATE": Decimal(0.1),
-            "NEW_PRODUCT_ORDER_QTY_OCC": Decimal(0.03),
-            "LISTING_RATE": Decimal(0.1),
-            "OUT_STOCK_DAYS": Decimal(0.02),
-            "RETAIL_PRICE_INDEX": Decimal(0.02)
-        }
-        self.dao = MySqlDao()
-
-    # 均值方差归一化函数
-    def standardize_column(self, column):
-        if(column.max() == column.min() and column.max() == 0):
-            return 0
-        elif (column.max() == column.min() and column.max() != 0):
-            return 1
-        else:
-            return (column - column.min()) / (column.max() - column.min())
-
-    # 按照品规分组归一化并计算评分
-    def calculate_heart_per_product(self, group):
-        for column in self.weights.keys():
-            if column == "OUT_STOCK_DAYS":
-                group[column] = 1 - self.standardize_column(group[column])
-            else:
-                group[column] = self.standardize_column(group[column])
-        group["SCORE"] = group.apply(
-            lambda row: sum(Decimal(row[col]) * weight for col, weight in self.weights.items()) * 100, axis=1
-        )
-        return group
-
-    # 主算法函数:计算品规-商铺评分矩阵
-    def score(self, order_data):
-       
-
-        # 应用分组计算
-        df_result = order_data.groupby("PRODUCT_CODE").apply(self.calculate_heart_per_product).reset_index(drop=True)
-        df_result = df_result.sort_values(by=["PRODUCT_CODE", "SCORE"], ascending=[True, False])
-
-        # 选择要保存的列
-        return df_result[['PRODUCT_CODE', 'BB_RETAIL_CUSTOMER_CODE', 'SCORE']]
- 
-if __name__ == "__main__":
-    # 创建一个 ItemCF 类的实例
-    item_cf_algorithm = UserItemScore()
-    dao = MySqlDao()
-    # 读取数据
-    order_data = dao.load_order_data()
-
-    # 调用算法
-    scores = item_cf_algorithm.score(order_data)
-    
-    scores_path = "./models/recall/itemCF/matrix/score.csv"
-    
-    # 保存评分结果到csv文件
-    scores.to_csv(scores_path, index=False, encoding="utf-8")
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@filename     : ShopScore.py
+@description     : 品规-商户-评分矩阵:品规(用户)对商铺(物品)的评分矩阵,将结果保存在score.csv文件中
+@time     : 2025/01/31/02
+@author     : Sherlock1011 & Min1027
+@Version     : 1.0
+'''
+
+
+from database import MySqlDao
+from decimal import Decimal
+
+# 算法封装成一个类
class UserItemScore:
    """Build the product -> shop rating matrix used by ItemCF recall.

    TODO: persist the result to the Redis database (carried over from the
    original implementation).
    """

    def __init__(self):
        # Per-metric weights; Decimal keeps the weighted sum exact.
        self.weights = {
            "MONTH6_SALE_QTY": Decimal(0.1),
            "MONTH6_SALE_AMT": Decimal(0.1),
            "MONTH6_GROSS_PROFIT_RATE": Decimal(0.03),
            "MONTH6_SALE_QTY_YOY": Decimal(0.1),
            "MONTH6_SALE_QTY_MOM": Decimal(0.1),
            "MONTH6_SALE_AMT_YOY": Decimal(0.1),
            "MONTH6_SALE_AMT_MOM": Decimal(0.1),
            "ORDER_FULLORDR_RATE": Decimal(0.1),
            "CUSTOMER_REPURCHASE_RATE": Decimal(0.1),
            "NEW_PRODUCT_ORDER_QTY_OCC": Decimal(0.03),
            "LISTING_RATE": Decimal(0.1),
            "OUT_STOCK_DAYS": Decimal(0.02),
            "RETAIL_PRICE_INDEX": Decimal(0.02)
        }
        self.dao = MySqlDao()

    def standardize_column(self, column):
        """Min-max scale *column* to [0, 1].

        Constant columns have no spread: an all-equal column maps to 0 when
        the shared value is 0 and to 1 otherwise, avoiding a 0/0 division.
        """
        lo, hi = column.min(), column.max()
        if hi != lo:
            return (column - lo) / (hi - lo)
        return 0 if hi == 0 else 1

    def calculate_heart_per_product(self, group):
        """Normalize one product's metric columns and attach a SCORE column.

        OUT_STOCK_DAYS is inverted (fewer out-of-stock days is better).
        SCORE is the weighted sum of the normalized metrics, scaled to 0-100.
        """
        for metric in self.weights:
            scaled = self.standardize_column(group[metric])
            group[metric] = 1 - scaled if metric == "OUT_STOCK_DAYS" else scaled
        group["SCORE"] = group.apply(
            lambda row: sum(Decimal(row[col]) * weight
                            for col, weight in self.weights.items()) * 100,
            axis=1,
        )
        return group

    def score(self, order_data):
        """Return the product-shop rating matrix.

        Normalizes metrics per product, then returns a DataFrame with columns
        PRODUCT_CODE, BB_RETAIL_CUSTOMER_CODE, SCORE sorted by product
        ascending and score descending.
        """
        rated = (order_data
                 .groupby("PRODUCT_CODE")
                 .apply(self.calculate_heart_per_product)
                 .reset_index(drop=True))
        rated = rated.sort_values(by=["PRODUCT_CODE", "SCORE"], ascending=[True, False])
        return rated[['PRODUCT_CODE', 'BB_RETAIL_CUSTOMER_CODE', 'SCORE']]
+ 
if __name__ == "__main__":
    # Compute the product -> shop rating matrix from MySQL order data and
    # persist it as CSV for the downstream ItemCF step.
    scorer = UserItemScore()
    dao = MySqlDao()
    order_data = dao.load_order_data()

    scores = scorer.score(order_data)

    scores_path = "./models/recall/itemCF/matrix/score.csv"
    scores.to_csv(scores_path, index=False, encoding="utf-8")
     

+ 46 - 46
requirements.txt

@@ -1,46 +1,46 @@
-asttokens==3.0.0
-async-timeout==5.0.1
-comm==0.2.2
-debugpy==1.8.12
-decorator==5.1.1
-et_xmlfile==2.0.0
-exceptiongroup==1.2.2
-executing==2.2.0
-filelock==3.17.0
-greenlet==3.1.1
-ipykernel==6.29.5
-ipython==8.31.0
-jedi==0.19.2
-joblib==1.4.2
-matplotlib-inline==0.1.7
-nest-asyncio==1.6.0
-numpy==2.2.2
-openpyxl==3.1.5
-packaging==24.2
-pandas==2.2.3
-parso==0.8.4
-pexpect==4.9.0
-platformdirs==4.3.6
-prompt_toolkit==3.0.50
-psutil==6.1.1
-ptyprocess==0.7.0
-pure_eval==0.2.3
-Pygments==2.19.1
-PyMySQL==1.1.1
-python-dateutil==2.9.0.post0
-pytz==2024.2
-PyYAML==6.0.2
-pyzmq==26.2.1
-redis==5.2.1
-scikit-learn==1.6.1
-scipy==1.15.1
-six==1.17.0
-SQLAlchemy==2.0.37
-stack-data==0.6.3
-threadpoolctl==3.5.0
-tornado==6.4.2
-tqdm==4.67.1
-traitlets==5.14.3
-typing_extensions==4.12.2
-tzdata==2025.1
-wcwidth==0.2.13
+asttokens==3.0.0
+async-timeout==5.0.1
+comm==0.2.2
+debugpy==1.8.12
+decorator==5.1.1
+et_xmlfile==2.0.0
+exceptiongroup==1.2.2
+executing==2.2.0
+filelock==3.17.0
+greenlet==3.1.1
+ipykernel==6.29.5
+ipython==8.31.0
+jedi==0.19.2
+joblib==1.4.2
+matplotlib-inline==0.1.7
+nest-asyncio==1.6.0
+numpy==2.2.2
+openpyxl==3.1.5
+packaging==24.2
+pandas==2.2.3
+parso==0.8.4
+pexpect==4.9.0
+platformdirs==4.3.6
+prompt_toolkit==3.0.50
+psutil==6.1.1
+ptyprocess==0.7.0
+pure_eval==0.2.3
+Pygments==2.19.1
+PyMySQL==1.1.1
+python-dateutil==2.9.0.post0
+pytz==2024.2
+PyYAML==6.0.2
+pyzmq==26.2.1
+redis==5.2.1
+scikit-learn==1.6.1
+scipy==1.15.1
+six==1.17.0
+SQLAlchemy==2.0.37
+stack-data==0.6.3
+threadpoolctl==3.5.0
+tornado==6.4.2
+tqdm==4.67.1
+traitlets==5.14.3
+typing_extensions==4.12.2
+tzdata==2025.1
+wcwidth==0.2.13

+ 2 - 2
utils/__init__.py

@@ -1,2 +1,2 @@
-#!/usr/bin/env python3
-# -*- coding:utf-8 -*-
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-

+ 82 - 82
utils/mock_data_to_database.py

@@ -1,83 +1,83 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-'''
-@filename     : mock_data_to_database.py
-@description     : 将mock数据写入到数据库中
-@time     : 2025/01/31/00
-@author     : Sherlock1011 & Min1027
-@Version     : 1.0
-'''
-from dao.mysql_client import Mysql
-
-import pandas as pd
-from sqlalchemy import Column, Integer, VARCHAR, Float, DECIMAL
-from sqlalchemy.ext.declarative import declarative_base
-
-# 定义数据库表结构
-Base = declarative_base()
-
-class MockOrder(Base):
-    __tablename__ = "mock_order"
-    id = Column(Integer, primary_key=True, autoincrement=True)  # 添加主键列
-    BB_RETAIL_CUSTOMER_CODE = Column(VARCHAR(50))
-    PRODUCT_CODE = Column(VARCHAR(50))
-    MONTH6_SALE_QTY = Column(DECIMAL(18, 6))
-    MONTH6_SALE_AMT = Column(DECIMAL(18, 6))
-    MONTH6_GROSS_PROFIT_RATE = Column(DECIMAL(18, 6))
-    MONTH6_SALE_QTY_YOY = Column(DECIMAL(18, 6))
-    MONTH6_SALE_QTY_MOM = Column(DECIMAL(18, 6))
-    MONTH6_SALE_AMT_YOY = Column(DECIMAL(18, 6))
-    MONTH6_SALE_AMT_MOM = Column(DECIMAL(18, 6))
-    ORDER_FULLORDR_RATE = Column(DECIMAL(18, 6))
-    CUSTOMER_REPURCHASE_RATE = Column(DECIMAL(18, 6))
-    NEW_PRODUCT_ORDER_QTY_OCC = Column(DECIMAL(18, 6))
-    LISTING_RATE = Column(DECIMAL(18, 6))
-    OUT_STOCK_DAYS = Column(DECIMAL(18, 6))
-    RETAIL_PRICE_INDEX = Column(DECIMAL(18, 6))
-    
-def insert_data(db, data_path):
-    df = pd.read_excel(data_path)
-    session = db.create_session()
-    try:
-        df.columns = ['BB_RETAIL_CUSTOMER_CODE', 
-                      'PRODUCT_CODE', 
-                      'MONTH6_SALE_QTY', 
-                      'MONTH6_SALE_AMT', 
-                      'MONTH6_GROSS_PROFIT_RATE',
-                      'MONTH6_SALE_QTY_YOY', 
-                      'MONTH6_SALE_QTY_MOM', 
-                      'MONTH6_SALE_AMT_YOY', 
-                      'MONTH6_SALE_AMT_MOM', 
-                      'ORDER_FULLORDR_RATE',
-                      'CUSTOMER_REPURCHASE_RATE', 
-                      'NEW_PRODUCT_ORDER_QTY_OCC', 
-                      'LISTING_RATE', 
-                      'OUT_STOCK_DAYS', 
-                      'RETAIL_PRICE_INDEX',
-                      ]  # 确保列名匹配
-        session.bulk_insert_mappings(MockOrder, df.to_dict(orient='records'))
-        session.commit()
-        print("数据成功插入数据库")
-        
-    except Exception as e:
-        session.rollback()
-        print(f"插入数据时出错: {e}")
-        
-    finally:
-        session.close()
-    
-        
-if __name__ == "__main__":
-    data_path = "./data/order.xlsx"
-    # 创建数据库链接
-    db = Mysql()
-    
-
-    # 创建表(如果不存在)
-    Base.metadata.create_all(db.engine)
-    insert_data(db, data_path)
-    
-    db.closed()
-    
-    
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@filename     : mock_data_to_database.py
+@description     : 将mock数据写入到数据库中
+@time     : 2025/01/31/00
+@author     : Sherlock1011 & Min1027
+@Version     : 1.0
+'''
+from dao.mysql_client import Mysql
+
+import pandas as pd
+from sqlalchemy import Column, Integer, VARCHAR, Float, DECIMAL
+from sqlalchemy.ext.declarative import declarative_base
+
# ORM table definitions.
Base = declarative_base()

class MockOrder(Base):
    """Mock order-metrics row: one (shop, product) pair with six-month KPI columns."""
    __tablename__ = "mock_order"
    id = Column(Integer, primary_key=True, autoincrement=True)  # surrogate primary key
    BB_RETAIL_CUSTOMER_CODE = Column(VARCHAR(50))  # shop (retail customer) code
    PRODUCT_CODE = Column(VARCHAR(50))  # product spec code
    MONTH6_SALE_QTY = Column(DECIMAL(18, 6))
    MONTH6_SALE_AMT = Column(DECIMAL(18, 6))
    MONTH6_GROSS_PROFIT_RATE = Column(DECIMAL(18, 6))
    MONTH6_SALE_QTY_YOY = Column(DECIMAL(18, 6))
    MONTH6_SALE_QTY_MOM = Column(DECIMAL(18, 6))
    MONTH6_SALE_AMT_YOY = Column(DECIMAL(18, 6))
    MONTH6_SALE_AMT_MOM = Column(DECIMAL(18, 6))
    ORDER_FULLORDR_RATE = Column(DECIMAL(18, 6))
    CUSTOMER_REPURCHASE_RATE = Column(DECIMAL(18, 6))
    NEW_PRODUCT_ORDER_QTY_OCC = Column(DECIMAL(18, 6))
    LISTING_RATE = Column(DECIMAL(18, 6))
    OUT_STOCK_DAYS = Column(DECIMAL(18, 6))
    RETAIL_PRICE_INDEX = Column(DECIMAL(18, 6))
+    
def insert_data(db, data_path):
    """Load the mock-order spreadsheet at *data_path* and bulk-insert it
    into the mock_order table through *db*'s session."""
    frame = pd.read_excel(data_path)
    session = db.create_session()
    try:
        # Rename spreadsheet columns to match the MockOrder attributes.
        frame.columns = [
            'BB_RETAIL_CUSTOMER_CODE',
            'PRODUCT_CODE',
            'MONTH6_SALE_QTY',
            'MONTH6_SALE_AMT',
            'MONTH6_GROSS_PROFIT_RATE',
            'MONTH6_SALE_QTY_YOY',
            'MONTH6_SALE_QTY_MOM',
            'MONTH6_SALE_AMT_YOY',
            'MONTH6_SALE_AMT_MOM',
            'ORDER_FULLORDR_RATE',
            'CUSTOMER_REPURCHASE_RATE',
            'NEW_PRODUCT_ORDER_QTY_OCC',
            'LISTING_RATE',
            'OUT_STOCK_DAYS',
            'RETAIL_PRICE_INDEX',
        ]
        session.bulk_insert_mappings(MockOrder, frame.to_dict(orient='records'))
        session.commit()
        print("数据成功插入数据库")
    except Exception as e:
        # NOTE(review): the failure is printed and swallowed after rollback,
        # so the caller cannot detect it — consider re-raising.
        session.rollback()
        print(f"插入数据时出错: {e}")
    finally:
        session.close()
+    
+        
if __name__ == "__main__":
    data_path = "./data/order.xlsx"
    # Open the MySQL connection, make sure the table exists, load the
    # spreadsheet, then release the connection.
    db = Mysql()
    Base.metadata.create_all(db.engine)
    insert_data(db, data_path)
    db.closed()
+    
+    
     

+ 166 - 166
烟草模型部署文档.md

@@ -1,167 +1,167 @@
-# 烟草推荐模型部署文档
-
-## 1、配置文件说明:
-
-- ### database_config.yaml  这个是数据配置文件
-    
-
-```
-mysql:
-  host: 'rm-t4n6rz18y4t5x47y70o.mysql.singapore.rds.aliyuncs.com'
-  port: 3036
-  db: 'brand_cultivation'
-  user: 'xxxxx'
-  passwd: 'xxxxx'
-
-redis:
-  host: 'r-t4nb4n9i8je7u6ogk1pd.redis.singapore.rds.aliyuncs.com'
-  port: 5000
-  db: 10
-  passwd: 'xxxxx'
-```
-
-- ### crontab 定时任务配置文件
-    
-
-```
-# START CRON JOB
-1 2 * * * python /app/app.pyc --run_all
-# END CRON JOB
-```
-
- 
-
-## 2、模型启动配置说明:
-
-### app.py
-
-```
-    parser.add_argument("--run_all", action='store_true')
-    parser.add_argument("--run_hot", action='store_true')
-    parser.add_argument("--run_itemcf", action='store_true')
-    parser.add_argument("--run_itemcf_inference", action='store_true')
-    parser.add_argument("--city_uuid", type=str, help="City UUID for filtering data")
-```
-
-### 总共有4种启动模式分别是:
-
-1\. 启动热度召回和协同过滤  
-        2. 启动热度召回  
-        3. 启动协同过滤  
-        4. 启动系统过滤推理
-
-## 3、GBDT LR模型训练推理启动
-
-### gbdt_lr.py
-
-```
-    parser.add_argument("--run_train", action='store_true')
-    parser.add_argument("--recommend", action='store_true')
-    parser.add_argument("--importance", action='store_true')
-
-    parser.add_argument("--city_uuid", type=str, default='00000000000000000000000011445301')
-    parser.add_argument("--product_id", type=str, default='110102')
-```
-
-### gbdt_lr总共3个功能:
-
-1\. 启动gbdt_lr训练  python -m gbdt_lr --run_train --city_uuid "00000000000000000000000011445301"  
-        2. 根据城市id和product_id进行推荐,需要指定city_uuid、product_id。      python -m gbdt_lr --recommend --city_uuid "00000000000000000000000011445301" --product_id '110102'  
-        3. 获取指定城市的特征重要性指标。  python -m gbdt_lr --importance --city_uuid "00000000000000000000000011445301"    
-注意:在数据准备阶段,会将训练数据保存到./models/rank/data/gbdt_data.csv中  
-模型文件会存放在 ./models/rank/weights/city_uuid/model.pkl  
-重要性指标会存放在 ./models/rank/weights/下,分别是商户指标重要性和卷烟指标重要性  
-
-## 4、模型docker运行配置说明:
-
-### docker镜像是:registry.cn-hangzhou.aliyuncs.com/hexiaoshi/brandcultivation:0.0.1
-
-```yaml
-docker run --name BrandCultivation -d -v /export/brandcultivation/crontab:/etc/cron.d/crontab -v /export/brandcultivation/database_config.yaml:/app/config/database_config.yaml  registry.cn-hangzhou.aliyuncs.com/hexiaoshi/brandcultivation:0.0.1
-```
-
-## 5、模型kubernetes运行配置说明
-
-yaml文件如下:
-
-```yaml
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: brandcultivation
-  namespace: default
-  labels:
-    app: brandcultivation
-spec:
-  selector:
-    matchLabels:
-      app: brandcultivation
-  replicas: 1
-  strategy:
-    rollingUpdate:
-      maxSurge: 25%
-      maxUnavailable: 25%
-    type: RollingUpdate
-  template:
-    metadata:
-      labels:
-        app: brandcultivation
-    spec:
-      containers:
-      - name: brandcultivation
-        image: registry.cn-hangzhou.aliyuncs.com/hexiaoshi/brandcultivation:0.0.1
-        imagePullPolicy: IfNotPresent
-        resources:
-          requests:
-            cpu: 4000m
-            memory: 4096Mi
-            ephemeral-storage: 20Gi             
-          limits:
-            cpu: 4000m
-            memory: 4096Mi
-            ephemeral-storage: 20Gi            
-        ports:
-        - containerPort:  80
-          name: brandcultivation
-        volumeMounts:
-        - name: localtime
-          mountPath: /etc/localtime
-        - name: config
-          mountPath: /app/config/database_config.yaml
-          subPath: database_config.yaml
-        - name: config
-          mountPath: /etc/cron.d/crontab
-          subPath: crontab          
-        - name: localtime
-          hostPath:
-            path: /usr/share/zoneinfo/Asia/Shanghai
-        - name: config
-          configMap:
-            name: brandcultivation
-      restartPolicy: Always
----
-kind: ConfigMap
-apiVersion: v1
-metadata:
-  name: brandcultivation
-  namespace: default
-data:
-  database_config.yaml: |
-    mysql:
-      host: 'rm-t4n6rz18y4t5x47y70o.mysql.singapore.rds.aliyuncs.com'
-      port: 3036
-      db: 'brand_cultivation'
-      user: 'BrandCultivation'
-      passwd: '8BfWBc18NBXl#CMd'
-
-    redis:
-      host: 'r-t4nb4n9i8je7u6ogk1pd.redis.singapore.rds.aliyuncs.com'
-      port: 5000
-      db: 10
-      passwd: 'gHmNkVBd88sZybj'
-  crontab: |
-    # START CRON JOB
-    1 2 * * * python /app/app.pyc
-    # END CRON JOB
-
+# 烟草推荐模型部署文档
+
+## 1、配置文件说明:
+
+- ### database_config.yaml  这个是数据配置文件
+    
+
+```
+mysql:
+  host: 'rm-t4n6rz18y4t5x47y70o.mysql.singapore.rds.aliyuncs.com'
+  port: 3036
+  db: 'brand_cultivation'
+  user: 'xxxxx'
+  passwd: 'xxxxx'
+
+redis:
+  host: 'r-t4nb4n9i8je7u6ogk1pd.redis.singapore.rds.aliyuncs.com'
+  port: 5000
+  db: 10
+  passwd: 'xxxxx'
+```
+
+- ### crontab 定时任务配置文件
+    
+
+```
+# START CRON JOB
+1 2 * * * python /app/app.pyc --run_all
+# END CRON JOB
+```
+
+ 
+
+## 2、模型启动配置说明:
+
+### app.py
+
+```
+    parser.add_argument("--run_all", action='store_true')
+    parser.add_argument("--run_hot", action='store_true')
+    parser.add_argument("--run_itemcf", action='store_true')
+    parser.add_argument("--run_itemcf_inference", action='store_true')
+    parser.add_argument("--city_uuid", type=str, help="City UUID for filtering data")
+```
+
+### 总共有4种启动模式分别是:
+
+1. 启动热度召回和协同过滤（--run_all）  
+2. 启动热度召回（--run_hot）  
+3. 启动协同过滤（--run_itemcf）  
+4. 启动协同过滤推理（--run_itemcf_inference）
+
+## 3、GBDT LR模型训练推理启动
+
+### gbdt_lr.py
+
+```
+    parser.add_argument("--run_train", action='store_true')
+    parser.add_argument("--recommend", action='store_true')
+    parser.add_argument("--importance", action='store_true')
+
+    parser.add_argument("--city_uuid", type=str, default='00000000000000000000000011445301')
+    parser.add_argument("--product_id", type=str, default='110102')
+```
+
+### gbdt_lr总共3个功能:
+
+1. 启动gbdt_lr训练：python -m gbdt_lr --run_train --city_uuid "00000000000000000000000011445301"  
+2. 根据城市id和product_id进行推荐，需要指定city_uuid、product_id：python -m gbdt_lr --recommend --city_uuid "00000000000000000000000011445301" --product_id '110102'  
+3. 获取指定城市的特征重要性指标：python -m gbdt_lr --importance --city_uuid "00000000000000000000000011445301"  
+注意:在数据准备阶段,会将训练数据保存到./models/rank/data/gbdt_data.csv中  
+模型文件会存放在 ./models/rank/weights/city_uuid/model.pkl  
+重要性指标会存放在 ./models/rank/weights/下,分别是商户指标重要性和卷烟指标重要性  
+
+## 4、模型docker运行配置说明:
+
+### docker镜像是:registry.cn-hangzhou.aliyuncs.com/hexiaoshi/brandcultivation:0.0.1
+
+```yaml
+docker run --name BrandCultivation -d -v /export/brandcultivation/crontab:/etc/cron.d/crontab -v /export/brandcultivation/database_config.yaml:/app/config/database_config.yaml  registry.cn-hangzhou.aliyuncs.com/hexiaoshi/brandcultivation:0.0.1
+```
+
+## 5、模型kubernetes运行配置说明
+
+yaml文件如下:
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: brandcultivation
+  namespace: default
+  labels:
+    app: brandcultivation
+spec:
+  selector:
+    matchLabels:
+      app: brandcultivation
+  replicas: 1
+  strategy:
+    rollingUpdate:
+      maxSurge: 25%
+      maxUnavailable: 25%
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        app: brandcultivation
+    spec:
+      containers:
+      - name: brandcultivation
+        image: registry.cn-hangzhou.aliyuncs.com/hexiaoshi/brandcultivation:0.0.1
+        imagePullPolicy: IfNotPresent
+        resources:
+          requests:
+            cpu: 4000m
+            memory: 4096Mi
+            ephemeral-storage: 20Gi             
+          limits:
+            cpu: 4000m
+            memory: 4096Mi
+            ephemeral-storage: 20Gi            
+        ports:
+        - containerPort:  80
+          name: brandcultivation
+        volumeMounts:
+        - name: localtime
+          mountPath: /etc/localtime
+        - name: config
+          mountPath: /app/config/database_config.yaml
+          subPath: database_config.yaml
+        - name: config
+          mountPath: /etc/cron.d/crontab
+          subPath: crontab
+      volumes:
+      - name: localtime
+        hostPath:
+          path: /usr/share/zoneinfo/Asia/Shanghai
+      - name: config
+        configMap:
+          name: brandcultivation
+      restartPolicy: Always
+---
+kind: ConfigMap
+apiVersion: v1
+metadata:
+  name: brandcultivation
+  namespace: default
+data:
+  database_config.yaml: |
+    mysql:
+      host: 'rm-t4n6rz18y4t5x47y70o.mysql.singapore.rds.aliyuncs.com'
+      port: 3036
+      db: 'brand_cultivation'
+      user: 'BrandCultivation'
+      passwd: '8BfWBc18NBXl#CMd'
+
+    redis:
+      host: 'r-t4nb4n9i8je7u6ogk1pd.redis.singapore.rds.aliyuncs.com'
+      port: 5000
+      db: 10
+      passwd: 'gHmNkVBd88sZybj'
+  crontab: |
+    # START CRON JOB
+    1 2 * * * python /app/app.pyc
+    # END CRON JOB
+
 ```