Procházet zdrojové kódy

封装推荐流程

Sherlock před 11 měsíci
rodič
revize
075a9e443d

+ 162 - 0
gnerate_report.py

@@ -0,0 +1,162 @@
+
+from database import RedisDatabaseHelper, MySqlDao
+from models.item2vec import Item2VecModel
+from models.rank.data.config import CustConfig, ProductConfig, ShopConfig, OrderConfig
+from models.rank.data.utils import sample_data_clear
+from models.rank.gbdt_lr_inference import GbdtLrModel
+from utils.result_process import get_cust_list_from_history_order, split_relation_subtable, generate_report
+import pandas as pd
+
+# City whose trained artefacts this script loads; used for both models below
+# so the weights path and the item2vec model can never drift apart.
+DEFAULT_CITY_UUID = "00000000000000000000000011445301"
+
+# Module-level singletons, created once at import time and shared by every
+# helper in this file.
+redis = RedisDatabaseHelper().redis
+dao = MySqlDao()
+gbdtlr_model = GbdtLrModel(f"./models/rank/weights/{DEFAULT_CITY_UUID}/gbdtlr_model.pkl")
+item2vec = Item2VecModel(DEFAULT_CITY_UUID)
+
+def get_itemcf_recall(city_uuid, product_id):
+    """Return the collaborative-filtering (item-CF) recall list for a product.
+
+    Reads every member of the Redis sorted set ``fc:{city_uuid}:{product_id}``
+    through ``zrevrange`` without scores (presumably a redis-py client, in
+    which case members come back highest score first).
+    """
+    return redis.zrevrange(f"fc:{city_uuid}:{product_id}", 0, -1, withscores=False)
+
+def get_hot_recall(city_uuid):
+    """Return the popularity ("hot") recall list for a city.
+
+    Reads every member of the Redis sorted set ``hot:{city_uuid}:sale_qty``
+    through ``zrevrange`` without scores (presumably a redis-py client, in
+    which case members come back highest score first).
+    """
+    return redis.zrevrange(f"hot:{city_uuid}:sale_qty", 0, -1, withscores=False)
+
+def get_recall_cust(city_uuid, product_id, recall_count):
+    """Recall up to ``recall_count`` customers for a product.
+
+    Item-CF results come first (de-duplicated, original order kept). When
+    they are fewer than ``recall_count`` the list is topped up from the hot
+    recall list, skipping customers that are already present.
+
+    :param city_uuid: city identifier used in the Redis keys
+    :param product_id: product whose item-CF recall list is read
+    :param recall_count: maximum number of customers to return
+    :return: list of recalled customers, at most ``recall_count`` long
+    """
+    result = list(dict.fromkeys(get_itemcf_recall(city_uuid, product_id)))
+
+    # Only pay for the hot-recall Redis read when item-CF alone is too short.
+    if len(result) < recall_count:
+        seen = set(result)
+        extra = [cust for cust in get_hot_recall(city_uuid) if cust not in seen]
+        result.extend(extra[:recall_count - len(result)])
+    return result[:recall_count]
+
+def generate_recommend_sample(city_uuid, product_id):
+    """Build the inference sample for one product.
+
+    Candidate customers come from the recall pipeline when the product
+    appears in the city's order history, otherwise from the item2vec model.
+
+    :param city_uuid: city identifier passed to every DAO query
+    :param product_id: product to build the sample for
+    :return: ``(feats_map, filter_dict, cust_list)`` where ``feats_map`` is
+        the GBDT-LR feature map, ``filter_dict`` is the first product row
+        (feature columns, taken before cleaning) and ``cust_list`` is the
+        list of candidate customers
+    :raises IndexError: if no product row is found for ``product_id``
+    """
+    ordered_products = dao.get_product_from_order(city_uuid)["product_code"].unique().tolist()
+    if product_id in ordered_products:
+        cust_list = get_recall_cust(city_uuid, product_id, 1000)
+    else:
+        cust_list = item2vec.get_recommend_cust_list(product_id)["cust_code"].to_list()
+
+    # Product features; the raw first row is also handed back to the caller.
+    product_data = dao.get_product_by_id(city_uuid, product_id)[ProductConfig.FEATURE_COLUMNS]
+    product_records = product_data.to_dict("records")
+    if not product_records:
+        # Same exception type as the original unguarded ``[0]``, but with a
+        # message that says which lookup came back empty.
+        raise IndexError(
+            f"no product found for product_id={product_id!r} in city {city_uuid!r}"
+        )
+    filter_dict = product_records[0]
+
+    cust_data = dao.get_cust_by_ids(city_uuid, cust_list)[CustConfig.FEATURE_COLUMNS]
+    shop_data = dao.get_shop_by_ids(city_uuid, cust_list)[ShopConfig.FEATURE_COLUMNS]
+
+    product_data = sample_data_clear(product_data, ProductConfig)
+    cust_data = sample_data_clear(cust_data, CustConfig)
+    shop_data = sample_data_clear(shop_data, ShopConfig)
+
+    # Attach shop features to each customer row; the inner join keeps only
+    # customers that have a matching shop row.
+    cust_data = cust_data.join(
+        shop_data.set_index("cust_code"), on="BB_RETAIL_CUSTOMER_CODE", how="inner"
+    )
+
+    feats_map = gbdtlr_model.generate_feats_map(product_data, cust_data)
+    return feats_map, filter_dict, cust_list
+
+def get_recommend_list_by_gbdt_lr(city_uuid, product_id):
+    """Score candidates with GBDT-LR and return the recommendation list.
+
+    Intended for products that already appear in the order history.
+    """
+    sample = generate_recommend_sample(city_uuid, product_id)
+    feats_sample, cust_list = sample[0], sample[2]
+    return gbdtlr_model.get_recommend_list(feats_sample, cust_list)
+    
+
+def gbdt_lr_inference(city_uuid, product_id):
+    """Placeholder: not implemented yet; does nothing and returns ``None``."""
+
+def generate_features_shap(city_uuid, product_id, delivery_count):
+    """Generate the SHAP-based recommendation report for one product.
+
+    Builds the inference sample, picks the recommendation list (GBDT-LR for
+    products found in the order history, item2vec otherwise), computes the
+    SHAP interaction result and passes everything to ``generate_report``
+    with ``./data`` as the output location.
+    """
+    feats_sample, filter_dict, cust_list = generate_recommend_sample(city_uuid, product_id)
+
+    ordered_products = dao.get_product_from_order(city_uuid)["product_code"].unique().tolist()
+    if product_id not in ordered_products:
+        # New product with no order history: take the list from item2vec.
+        recommend_data = item2vec.get_recommend_cust_list(product_id).to_dict("records")
+    else:
+        # Product already sold in this city: rank the recalled customers with GBDT-LR.
+        recommend_data = gbdtlr_model.get_recommend_list(feats_sample, cust_list)
+
+    shap_result = gbdtlr_model.generate_shap_interance(feats_sample)
+    generate_report(city_uuid, shap_result, filter_dict, recommend_data, delivery_count, "./data")
+
+def eval(city_uuid, product_code):
+    """Write the recommendation-effect validation table to CSV.
+
+    NOTE(review): this name shadows the builtin ``eval`` inside this module;
+    it is kept because callers use it.
+    """
+    report = get_cust_list_from_history_order(city_uuid, product_code)
+    report.to_csv("./data/效果验证表.csv", index=False)
+    
+def generate_similarity_product(product_code):
+    """Export the products most similar to ``product_code`` as an Excel sheet.
+
+    The similarity table comes from the item2vec model; it is reduced to the
+    columns listed below, given report headers and written to
+    ``./data/相似卷烟表.xlsx`` without the index.
+    """
+    # Source column -> exported header. Key order is the exported column order.
+    column_titles = {
+        "product_name": "卷烟名称",
+        "similarity": "相似度",
+        "brand_name": "品牌名称",
+        "factory_name": "生产厂商",
+        "is_low_tar": "低焦油卷烟",
+        "is_medium": "中支烟",
+        "is_tiny": "细支烟",
+        "is_coarse": "粗支烟",
+        "is_exploding_beads": "爆珠烟",
+        "is_abnormity": "异形包装",
+        "is_cig": "雪茄烟",
+        "is_chuangxin": "创新品类",
+        "direct_retail_price": "卷烟建议零售价",
+        "tbc_total_length": "烟支总长度",
+        "product_style": "包装类型",
+    }
+    similarity_table = item2vec.generate_product_similarity_map(product_code)
+    similarity_table = similarity_table[list(column_titles)]
+    similarity_table = similarity_table.rename(columns=column_titles)
+    similarity_table.to_excel("./data/相似卷烟表.xlsx", index=False)
+
+def generate_delivery_strategy():
+    """Placeholder: not implemented yet; does nothing and returns ``None``."""
+
+def run():
+    """Placeholder entry point: not implemented yet; returns ``None``."""
+
+if __name__ == "__main__":
+    # Ad-hoc driver: SHAP report and similar-product sheet for product 350139,
+    # then the effect-validation table for product 350355, all for one city.
+    city_uuid = "00000000000000000000000011445301"
+    generate_features_shap(city_uuid, "350139", delivery_count=5000)
+    generate_similarity_product("350139")
+    eval(city_uuid, "350355")
+    
+    # recommend_list = get_recommend_list_by_gbdt_lr("00000000000000000000000011445301", "350139")
+    # recommend_list = pd.DataFrame(recommend_list)
+    # recommend_list.to_csv("./data/recommend_list.csv", index=False, encoding="utf-8-sig")
+    
+    # 拿龙军数据
+    # data = dao.get_order_by_cust("00000000000000000000000011445301", "445323105795")
+    # data = data.groupby(["cust_code", "product_code", "product_name"], as_index=False)["sale_qty"].sum()
+    # data.to_csv("./data/cust.csv", index=False)
+    
+    # city_uuid = "00000000000000000000000011445301"
+    # order_data = dao.get_order_by_cust("00000000000000000000000011445301", "445323105795")
+    # order_data["sale_qty"] = order_data["sale_qty"].fillna(0)
+    # order_data = order_data.infer_objects(copy=False)
+    # order_data = order_data.groupby(["cust_code", "product_code", "product_name"], as_index=False)["sale_qty"].sum()
+    
+    # cust_data = dao.load_cust_data(city_uuid)[CustConfig.FEATURE_COLUMNS]
+    # sample_data_clear(cust_data, CustConfig)
+    # shop_data = dao.load_shopping_data(city_uuid)[ShopConfig.FEATURE_COLUMNS]
+    # sample_data_clear(shop_data, ShopConfig)
+    # cust_ids = shop_data.set_index("cust_code")
+    # cust_data = cust_data.join(cust_ids, on="BB_RETAIL_CUSTOMER_CODE", how="inner")
+    
+    # product_data = dao.load_product_data(city_uuid)[ProductConfig.FEATURE_COLUMNS]
+    # sample_data_clear(product_data, ProductConfig)
+    
+    # order_data = order_data.merge(product_data, on="product_code", how="inner")
+    # order_data = order_data.merge(cust_data, left_on='cust_code', right_on='BB_RETAIL_CUSTOMER_CODE', how="inner")
+    
+    # result = gbdtlr_model.inference_from_sample(order_data)
+    # result.to_csv("./data/junlong.csv", index=False)

+ 1 - 160
inference.py

@@ -1,162 +1,3 @@
+from database.db.redis_db import RedisDatabaseHelper
 
-from database import RedisDatabaseHelper, MySqlDao
-from models.item2vec import Item2VecModel
-from models.rank.data.config import CustConfig, ProductConfig, ShopConfig, OrderConfig
-from models.rank.data.utils import sample_data_clear
-from models.rank.gbdt_lr_inference import GbdtLrModel
-from utils.result_process import get_cust_list_from_history_order, split_relation_subtable, generate_report
-import pandas as pd
 
-redis = RedisDatabaseHelper().redis
-dao = MySqlDao()
-gbdtlr_model = GbdtLrModel("./models/rank/weights/00000000000000000000000011445301/gbdtlr_model.pkl")
-item2vec = Item2VecModel("00000000000000000000000011445301")
-
-def get_itemcf_recall(city_uuid, product_id):
-    """协同召回"""
-    key = f"fc:{city_uuid}:{product_id}"
-    recall_list = redis.zrevrange(key, 0, -1, withscores=False)
-    return recall_list
-
-def get_hot_recall(city_uuid):
-    """热度召回"""
-    key = f"hot:{city_uuid}:sale_qty"
-    recall_list = redis.zrevrange(key, 0, -1, withscores=False)
-    return recall_list
-
-def get_recall_cust(city_uuid, product_id, recall_count):
-    """根据协同过滤和热度召回召回商户
-    """
-    itemcf_recall_list = get_itemcf_recall(city_uuid, product_id)
-    hot_recall_list = get_hot_recall(city_uuid)
-    
-    result = list(dict.fromkeys(itemcf_recall_list))
-    
-    # 如果结果不足,从hot_recall中补齐
-    if len(result) < recall_count:
-        hot_recall_set = set(hot_recall_list) - set(result)
-        additional_items = [item for item in hot_recall_list if item in hot_recall_set]
-        needed = recall_count - len(result)
-        result.extend(additional_items[:needed])
-    return result[:recall_count]
-
-def generate_recommend_sample(city_uuid, product_id):
-    """生成预测数据集"""
-    product_in_order = dao.get_product_from_order(city_uuid)["product_code"].unique().tolist()
-    if product_id in product_in_order:
-        recall_count = 1000
-        cust_list = get_recall_cust(city_uuid, product_id, recall_count)
-    else:
-        cust_list = item2vec.get_recommend_cust_list(product_id)["cust_code"].to_list()
-    
-    
-    # 获取卷烟的信息
-    product_data = dao.get_product_by_id(city_uuid, product_id)[ProductConfig.FEATURE_COLUMNS]
-    filter_dict = product_data.to_dict("records")[0]
-    
-    cust_data = dao.get_cust_by_ids(city_uuid, cust_list)[CustConfig.FEATURE_COLUMNS]
-    shop_data = dao.get_shop_by_ids(city_uuid, cust_list)[ShopConfig.FEATURE_COLUMNS]
-    
-    product_data = sample_data_clear(product_data, ProductConfig)
-    cust_data = sample_data_clear(cust_data, CustConfig)
-    shop_data = sample_data_clear(shop_data, ShopConfig)
-    
-    cust_feats = shop_data.set_index("cust_code")
-    cust_data = cust_data.join(cust_feats, on="BB_RETAIL_CUSTOMER_CODE", how="inner")
-    
-    feats_map = gbdtlr_model.generate_feats_map(product_data, cust_data)
-    
-    return feats_map, filter_dict, cust_list
-
-def get_recommend_list_by_gbdt_lr(city_uuid, product_id):
-    """根据gbdt-lr进行打分并获得推荐列表,适用于推荐历史订单中存在的卷烟"""
-    feats_sample, _, cust_list = generate_recommend_sample(city_uuid, product_id)
-    recommend_list = gbdtlr_model.get_recommend_list(feats_sample, cust_list)
-    return recommend_list
-    
-
-def gbdt_lr_inference(city_uuid, product_id):
-    pass
-
-def generate_features_shap(city_uuid, product_id, delivery_count):
-    feats_sample, filter_dict, cust_list = generate_recommend_sample(city_uuid, product_id)
-    
-    if product_id in dao.get_product_from_order(city_uuid)["product_code"].unique().tolist():
-        # 如果推荐商品为新卷烟,走iterm2vec
-        recommend_data = gbdtlr_model.get_recommend_list(feats_sample, cust_list)
-    else:
-        recommend_data = item2vec.get_recommend_cust_list(product_id).to_dict("records")
-    result = gbdtlr_model.generate_shap_interance(feats_sample)
-    generate_report(city_uuid, result, filter_dict, recommend_data, delivery_count, "./data")
-
-def eval(city_uuid, product_code):
-    """推荐效果验证"""
-    eval_report = get_cust_list_from_history_order(city_uuid, product_code)
-    eval_report.to_csv("./data/效果验证表.csv", index=False)
-    
-def generate_similarity_product(product_code):
-    product_similarity_map = item2vec.generate_product_similarity_map(product_code)
-    product_similarity_map = product_similarity_map[["product_name", "similarity", "brand_name", "factory_name", "is_low_tar", "is_medium", "is_tiny", "is_coarse", "is_exploding_beads", "is_abnormity", "is_cig", "is_chuangxin", "direct_retail_price", "tbc_total_length", "product_style"]]
-    product_similarity_map = product_similarity_map.rename(
-        columns={
-            "product_name": "卷烟名称",
-            "similarity": "相似度",
-            "factory_name": "生产厂商",
-            "brand_name": "品牌名称",
-            "is_low_tar":                "低焦油卷烟",
-            "is_medium":                 "中支烟",
-            "is_tiny":                   "细支烟",
-            "is_coarse":                 "粗支烟",
-            "is_exploding_beads":        "爆珠烟",
-            "is_abnormity":              "异形包装",
-            "is_cig":                    "雪茄烟",
-            "is_chuangxin":              "创新品类",
-            "direct_retail_price":       "卷烟建议零售价",
-            "tbc_total_length":          "烟支总长度",
-            "product_style":             "包装类型",
-        }
-    )
-    product_similarity_map.to_excel("./data/相似卷烟表.xlsx", index=False)
-
-def generate_delivery_strategy():
-    
-    pass
-
-def run():
-    pass
-
-if __name__ == '__main__':
-    generate_features_shap("00000000000000000000000011445301", "350139", delivery_count=5000)
-    generate_similarity_product("350139")
-    eval("00000000000000000000000011445301", "350355")
-    
-    # recommend_list = get_recommend_list_by_gbdt_lr("00000000000000000000000011445301", "350139")
-    # recommend_list = pd.DataFrame(recommend_list)
-    # recommend_list.to_csv("./data/recommend_list.csv", index=False, encoding="utf-8-sig")
-    
-    # 拿龙军数据
-    # data = dao.get_order_by_cust("00000000000000000000000011445301", "445323105795")
-    # data = data.groupby(["cust_code", "product_code", "product_name"], as_index=False)["sale_qty"].sum()
-    # data.to_csv("./data/cust.csv", index=False)
-    
-    # city_uuid = "00000000000000000000000011445301"
-    # order_data = dao.get_order_by_cust("00000000000000000000000011445301", "445323105795")
-    # order_data["sale_qty"] = order_data["sale_qty"].fillna(0)
-    # order_data = order_data.infer_objects(copy=False)
-    # order_data = order_data.groupby(["cust_code", "product_code", "product_name"], as_index=False)["sale_qty"].sum()
-    
-    # cust_data = dao.load_cust_data(city_uuid)[CustConfig.FEATURE_COLUMNS]
-    # sample_data_clear(cust_data, CustConfig)
-    # shop_data = dao.load_shopping_data(city_uuid)[ShopConfig.FEATURE_COLUMNS]
-    # sample_data_clear(shop_data, ShopConfig)
-    # cust_ids = shop_data.set_index("cust_code")
-    # cust_data = cust_data.join(cust_ids, on="BB_RETAIL_CUSTOMER_CODE", how="inner")
-    
-    # product_data = dao.load_product_data(city_uuid)[ProductConfig.FEATURE_COLUMNS]
-    # sample_data_clear(product_data, ProductConfig)
-    
-    # order_data = order_data.merge(product_data, on="product_code", how="inner")
-    # order_data = order_data.merge(cust_data, left_on='cust_code', right_on='BB_RETAIL_CUSTOMER_CODE', how="inner")
-    
-    # result = gbdtlr_model.inference_from_sample(order_data)
-    # result.to_csv("./data/junlong.csv", index=False)

+ 2 - 2
models/recall/hot_recall.py

@@ -19,7 +19,7 @@ class HotRecallModel:
         """加载订单记录表"""
         print("hot_recall: 正在加载order_info...")
         self._order_data = self._dao.load_order_data(city_uuid)
-        self._order_data =self._order_data[OrderConfig.FEATURE_COLUMNS]
+        self._order_data =self._order_data[OrderConfig.FEATURE_COLUMNS] 
         
         # 数据清洗
         self._order_data["sale_qty"] = self._order_data["sale_qty"].fillna(0)
@@ -35,7 +35,7 @@ class HotRecallModel:
         :return: 所有热度指标的得分
         :rtype: list
         """
-        results = self._order_data.groupby("cust_code")[hot_name].mean().reset_index()
+        results = self._order_data.groupby("cust_code")[hot_name].sum().reset_index()
         sorted_results = results.sort_values(by=hot_name, ascending=False).reset_index(drop=True)
         
         scaler = StandardScaler()

+ 1 - 1
models/recall/itemCF/similarity_matrix.py

@@ -18,7 +18,7 @@ class SimilarityMatrix:
         
         # 数据清洗
         self._order_data["sale_qty"] = self._order_data["sale_qty"].fillna(0)
-        self._order_data = self._order_data.groupby(["cust_code", "product_code"], as_index=False)["sale_qty"].sum()
+        self._order_data = self._order_data.groupby(["cust_code", "product_code"], as_index=False)["sale_qty"].mean()
         self._order_data = self._order_data[self._order_data["sale_qty"] != 0]
         
     def _build_co_occurace_matrix(self):

+ 80 - 0
recommend.py

@@ -0,0 +1,80 @@
+from database.dao.mysql_dao import MySqlDao
+from database.db.redis_db import RedisDatabaseHelper
+import os
+from models.item2vec.inference import Item2VecModel
+from models.rank.data.config import CustConfig, ProductConfig, ShopConfig
+from models.rank.data.utils import sample_data_clear
+from models.rank.gbdt_lr_inference import GbdtLrModel
+
+
+class Recommend:
+    """Customer recommendation pipeline for a single city.
+
+    Candidates are recalled from Redis (item-CF first, topped up with the hot
+    list) and then ranked by the city's GBDT-LR model.
+    """
+
+    def __init__(self, city_uuid):
+        """Open the Redis and MySQL handles and load the models for ``city_uuid``."""
+        self._redis = RedisDatabaseHelper().redis
+        self._dao = MySqlDao()
+
+        self._load_model(city_uuid)
+
+    def _load_model(self, city_uuid):
+        """Load the GBDT-LR and item2vec models that belong to ``city_uuid``."""
+        self._city_uuid = city_uuid
+        gbdtlr_model_path = os.path.join("./models/rank/weights", city_uuid, "gbdtlr_model.pkl")
+        self._gbdtlr_model = GbdtLrModel(gbdtlr_model_path)
+        self._item2vec_model = Item2VecModel(city_uuid)
+
+    # Backward-compatible alias for the original, misspelled method name.
+    _load_molde = _load_model
+
+    def _get_itemcf_recall(self, product_id):
+        """Return the item-CF recall list (members only) for ``product_id``."""
+        key = f"fc:{self._city_uuid}:{product_id}"
+        return self._redis.zrevrange(key, 0, -1, withscores=False)
+
+    def _get_hot_recall(self):
+        """Return the hot (popularity) recall list (members only) for the city."""
+        key = f"hot:{self._city_uuid}:sale_qty"
+        return self._redis.zrevrange(key, 0, -1, withscores=False)
+
+    def _get_recall_cust(self, product_id, recall_count):
+        """Recall up to ``recall_count`` customers via item-CF plus hot recall.
+
+        Item-CF results come first (de-duplicated, order kept); the hot list
+        only fills the remaining slots and never repeats a customer.
+        """
+        result = list(dict.fromkeys(self._get_itemcf_recall(product_id)))
+
+        # Only read the hot list when item-CF alone cannot fill the quota.
+        if len(result) < recall_count:
+            seen = set(result)
+            extra = [cust for cust in self._get_hot_recall() if cust not in seen]
+            result.extend(extra[:recall_count - len(result)])
+
+        return result[:recall_count]
+
+    # Backward-compatible alias for the original, misspelled method name.
+    _get_recal_cust = _get_recall_cust
+
+    def get_recommend_list_by_gbdtlr(self, product_id, recall_count=100, discovery_count=500):
+        """Return the GBDT-LR recommendation list for ``product_id``.
+
+        :param product_id: product to recommend customers for
+        :param recall_count: maximum number of customers to recall and rank
+        :param discovery_count: currently unused; kept for interface compatibility
+        :return: whatever ``GbdtLrModel.get_recommend_list`` returns for the
+            recalled customers
+        """
+        import logging  # local import so this block does not need a new file-level import
+
+        logger = logging.getLogger(__name__)
+
+        # Recall the candidate customers.
+        recall_cust_list = self._get_recall_cust(product_id, recall_count)
+        logger.debug("recalled %d customers for product %s", len(recall_cust_list), product_id)
+
+        # Product features.
+        product_data = self._dao.get_product_by_id(self._city_uuid, product_id)[ProductConfig.FEATURE_COLUMNS]
+        product_data = sample_data_clear(product_data, ProductConfig)
+
+        # Customer features joined with their shop features.
+        cust_data = self._dao.get_cust_by_ids(self._city_uuid, recall_cust_list)[CustConfig.FEATURE_COLUMNS]
+        shop_data = self._dao.get_shop_by_ids(self._city_uuid, recall_cust_list)[ShopConfig.FEATURE_COLUMNS]
+        cust_data = sample_data_clear(cust_data, CustConfig)
+        shop_data = sample_data_clear(shop_data, ShopConfig)
+
+        # The inner join keeps only customers that have a matching shop row.
+        cust_feats = shop_data.set_index("cust_code")
+        cust_data = cust_data.join(cust_feats, on="BB_RETAIL_CUSTOMER_CODE", how="inner")
+        logger.debug("%d customers left after joining shop features", len(cust_data))
+
+        # Build the inference feature map and rank the recalled customers.
+        feats_map = self._gbdtlr_model.generate_feats_map(product_data, cust_data)
+        return self._gbdtlr_model.get_recommend_list(feats_map, recall_cust_list)
+    
+if __name__ == "__main__":
+    # Smoke run: build the pipeline for one city and rank customers for one product.
+    city_uuid, product_id = "00000000000000000000000011445301", '110110'
+    recommend = Recommend(city_uuid)
+    recommend_list = recommend.get_recommend_list_by_gbdtlr(product_id)