Browse Source

将Item2vec冷启动模式添加到推理流程中

yangzeyu 11 months ago
parent
commit
3df5dc7f80

+ 32 - 2
database/dao/mysql_dao.py

@@ -20,8 +20,7 @@ class MySqlDao:
         self._product_tablename = "tads_brandcul_product_info_f"
         self._cust_tablename = "tads_brandcul_cust_info_f"
         self._order_tablename = "tads_brandcul_consumer_order"
-        # self._order_tablename = "tads_brandcul_consumer_order"
-        # self._eval_order_name = "tads_brandcul_consumer_order_check"
+        self._eval_order_name = "tads_brandcul_consumer_order_check"
         self._mock_order_tablename = "yunfu_mock_data"
         self._shopping_tablename = "tads_brandcul_cust_info_lbs_f"
         # self._shopping_tablename = "yunfu_shopping_mock_data"
@@ -55,6 +54,17 @@ class MySqlDao:
         
         return data
     
+    def load_eval_order_data(self, city_uuid):
+        """Load evaluation-order rows for the given city_uuid, with the stat_month and city_uuid columns dropped."""
+        query = f"SELECT * FROM {self._eval_order_name} WHERE city_uuid = :city_uuid"
+        params = {"city_uuid": city_uuid}
+        
+        data = self.db_helper.load_data_with_page(query, params)
+        data.drop('stat_month', axis=1, inplace=True)
+        data.drop('city_uuid', axis=1, inplace=True)
+        
+        return data
+    
     def load_mock_order_data(self):
         """从数据库中读取mock的订单信息"""
         query = f"SELECT * FROM {self._mock_order_tablename}"
@@ -174,6 +184,18 @@ class MySqlDao:
         
         return data
     
+    def get_eval_order_by_product(self, city_uuid, product_id):
+        query = f"""
+            SELECT *
+            FROM {self._eval_order_name}
+            WHERE city_uuid = :city_uuid
+            AND product_code = :product_id
+        """
+        params = {"city_uuid": city_uuid, "product_id": product_id}
+        data = self.db_helper.load_data_with_page(query, params)
+        
+        return data
+    
     def get_order_by_cust(self, city_uuid, cust_id):
         query = f"""
             SELECT *
@@ -199,6 +221,14 @@ class MySqlDao:
         
         return data
     
+    def get_product_from_order(self, city_uuid):
+        query = f"SELECT DISTINCT product_code FROM {self._order_tablename} WHERE city_uuid = :city_uuid"
+        params = {"city_uuid": city_uuid}
+        
+        data = pd.DataFrame(self.db_helper.fetch_all(text(query), params))
+        
+        return data
+    
     def data_preprocess(self, data: pd.DataFrame):
         
         data.drop(["cust_uuid", "longitude", "latitude", "range_radius"], axis=1, inplace=True)

+ 30 - 10
inference.py

@@ -1,14 +1,16 @@
 
 from database import RedisDatabaseHelper, MySqlDao
+from models.item2vec import Item2VecModel
 from models.rank.data.config import CustConfig, ProductConfig, ShopConfig, OrderConfig
 from models.rank.data.utils import sample_data_clear
 from models.rank.gbdt_lr_inference import GbdtLrModel
-from utils.result_process import split_relation_subtable, generate_report
+from utils.result_process import get_cust_list_from_history_order, split_relation_subtable, generate_report
 import pandas as pd
 
 redis = RedisDatabaseHelper().redis
 dao = MySqlDao()
 gbdtlr_model = GbdtLrModel("./models/rank/weights/00000000000000000000000011445301/gbdtlr_model.pkl")
+item2vec = Item2VecModel("00000000000000000000000011445301")
 
 def get_itemcf_recall(city_uuid, product_id):
     """协同召回"""
@@ -23,7 +25,8 @@ def get_hot_recall(city_uuid):
     return recall_list
 
 def get_recall_cust(city_uuid, product_id, recall_count):
-    """根据协同过滤和热度召回召回商户"""
+    """根据协同过滤和热度召回召回商户
+    """
     itemcf_recall_list = get_itemcf_recall(city_uuid, product_id)
     hot_recall_list = get_hot_recall(city_uuid)
     
@@ -39,11 +42,18 @@ def get_recall_cust(city_uuid, product_id, recall_count):
 
 def generate_recommend_sample(city_uuid, product_id):
     """生成预测数据集"""
-    recall_count = 1000
-    cust_list = get_recall_cust(city_uuid, product_id, recall_count)
+    product_in_order = dao.get_product_from_order(city_uuid)["product_code"].unique().tolist()
+    if product_id in product_in_order:
+        recall_count = 1000
+        cust_list = get_recall_cust(city_uuid, product_id, recall_count)
+    else:
+        cust_list = item2vec.get_recommend_cust_list(product_id)["cust_code"].to_list()
     
+    
+    # 获取卷烟的信息
     product_data = dao.get_product_by_id(city_uuid, product_id)[ProductConfig.FEATURE_COLUMNS]
     filter_dict = product_data.to_dict("records")[0]
+    
     cust_data = dao.get_cust_by_ids(city_uuid, cust_list)[CustConfig.FEATURE_COLUMNS]
     shop_data = dao.get_shop_by_ids(city_uuid, cust_list)[ShopConfig.FEATURE_COLUMNS]
     
@@ -58,7 +68,8 @@ def generate_recommend_sample(city_uuid, product_id):
     
     return feats_map, filter_dict, cust_list
 
-def get_recommend_list(city_uuid, product_id):
+def get_recommend_list_by_gbdt_lr(city_uuid, product_id):
+    """根据gbdt-lr进行打分并获得推荐列表,适用于推荐历史订单中存在的卷烟"""
     feats_sample, _, cust_list = generate_recommend_sample(city_uuid, product_id)
     recommend_list = gbdtlr_model.get_recommend_list(feats_sample, cust_list)
     return recommend_list
@@ -69,11 +80,19 @@ def gbdt_lr_inference(city_uuid, product_id):
 
 def generate_features_shap(city_uuid, product_id, delivery_count):
     feats_sample, filter_dict, cust_list = generate_recommend_sample(city_uuid, product_id)
-    result = gbdtlr_model.generate_shap_interance(feats_sample)
     
-    recommend_data = gbdtlr_model.get_recommend_list(feats_sample, cust_list)
+    if product_id in dao.get_product_from_order(city_uuid)["product_code"].unique().tolist():
+        # Product already appears in historical orders: score with GBDT-LR; otherwise (new product) fall back to item2vec
+        recommend_data = gbdtlr_model.get_recommend_list(feats_sample, cust_list)
+    else:
+        recommend_data = item2vec.get_recommend_cust_list(product_id).to_dict("records")
+    result = gbdtlr_model.generate_shap_interance(feats_sample)
     generate_report(city_uuid, result, filter_dict, recommend_data, delivery_count, "./data")
-    
+
+def eval(city_uuid, product_code):
+    """推荐效果验证"""
+    eval_report = get_cust_list_from_history_order(city_uuid, product_code)
+    eval_report.to_csv("./data/eval.csv", index=False)
 
 def generate_delivery_strategy():
     
@@ -83,8 +102,9 @@ def run():
     pass
 
 if __name__ == '__main__':
-    generate_features_shap("00000000000000000000000011445301", "420202", delivery_count=5000)
-    # recommend_list = get_recommend_list("00000000000000000000000011445301", "420202")
+    # generate_features_shap("00000000000000000000000011445301", "350139", delivery_count=5000)
+    eval("00000000000000000000000011445301", "350355")
+    # recommend_list = get_recommend_list_by_gbdt_lr("00000000000000000000000011445301", "350139")
     # recommend_list = pd.DataFrame(recommend_list)
     # recommend_list.to_csv("./data/recommend_list.csv", index=False, encoding="utf-8-sig")
     

+ 20 - 7
models/item2vec/inference.py

@@ -2,7 +2,9 @@ from database.dao.mysql_dao import MySqlDao
 from models.item2vec import Item2Vec
 from models.rank.data.config import OrderConfig, ProductConfig
 from models.rank.data.utils import sample_data_clear
+import numpy as np
 import pandas as pd
+from sklearn.preprocessing import StandardScaler
 
 class Item2VecModel:
     def __init__(self, city_uuid):
@@ -25,12 +27,12 @@ class Item2VecModel:
     def get_similarity_list(self, product_code, top=40):
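         """Return the product codes of the top-k products most similar to the given product; also writes the full similarity table to ./data/product_similarity.xlsx."""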
         similarity_map = self.generate_product_similarity_map(product_code)
+        similarity_map.to_excel("./data/product_similarity.xlsx", index=False)
         similarity_list = similarity_map["product_code"].to_list()
-        # similarity_list.remove(product_code)
         similarity_list = similarity_list[:top]
         return similarity_list
     
-    def get_recommend_cust_list(self, product_code, top=50):
+    def get_recommend_cust_list(self, product_code, top=100):
         """获取推荐的商户列表"""
         product_list = self.get_similarity_list(product_code)
         order_data = self._dao.get_order_by_product_ids(self._city_uuid, product_list)[OrderConfig.FEATURE_COLUMNS]
@@ -46,18 +48,29 @@ class Item2VecModel:
             .head(top)
         )
         
-        recommend_cust = order_data.groupby(["cust_code"], as_index=False)["sale_qty"].sum()
-        recommend_cust = recommend_cust.sort_values(["sale_qty"], ascending=[False])
-        recommend_cust.to_csv("./data/recommend.csv", index=False)
+        recommend_cust = (
+            order_data.groupby(["cust_code"], as_index=False)["sale_qty"].sum()
+            .query("sale_qty > 0")
+            .sort_values(["sale_qty"], ascending=[False])
+        )
+        
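+        # Standardize each customer's summed sale_qty, squash it through a sigmoid, and scale to a 0-100 recommend_score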
+        scaler = StandardScaler()
+        normalized = scaler.fit_transform(recommend_cust["sale_qty"].values.reshape(-1, 1))
+        recommend_cust["sale_qty"] = ((1 / (1 + np.exp(-normalized))) * 100).flatten()
+        recommend_cust = recommend_cust.rename(columns={"sale_qty": "recommend_score"})
+        # recommend_cust.to_csv("./data/item2vec_recommend.csv", index=False)
+        
+        return recommend_cust
         
         
         
 if __name__ == "__main__":
     city_uuid = "00000000000000000000000011445301"
-    product_id = "420202"
+    product_id = "350139"
     
     model = Item2VecModel(city_uuid)
-    model.get_recommend_cust_list(product_id)
+    model.get_similarity_list(product_id)
     # dao = MySqlDao()
     # data = dao.get_order_by_cust_and_product(city_uuid, "445300108802", "340223")[OrderConfig.FEATURE_COLUMNS]
     # data.to_csv("./data/result.csv", index=False)

+ 1 - 1
models/item2vec/item2vec.py

@@ -80,7 +80,7 @@ class Item2Vec:
 if __name__ == "__main__":
     dao = MySqlDao()
     city_uuid = "00000000000000000000000011445301"
-    product_id = "420202"
+    product_id = "350139"
     
     product = dao.get_product_by_id(city_uuid, product_id)[ProductConfig.FEATURE_COLUMNS]
     product = sample_data_clear(product, ProductConfig)

+ 1 - 1
models/item2vec/preprocess.py

@@ -9,7 +9,7 @@ class Item2VecDataProcess:
         self._mysql_dao = MySqlDao()
         print("item2vec: 正在加载product_info...")
         # self._product_data = self._mysql_dao.load_product_data(city_uuid)
-        product_ids = self._mysql_dao.load_order_data(city_uuid)["product_code"].unique().tolist()
+        product_ids = self._mysql_dao.get_product_from_order(city_uuid)["product_code"].unique().tolist()
         self._product_data = self._mysql_dao.get_product_by_ids(city_uuid, product_ids)
         self._data_process()
         

+ 1 - 0
models/rank/gbdt_lr_inference.py

@@ -72,6 +72,7 @@ class GbdtLrModel:
             key=lambda x: x["recommend_score"],
             reverse=True
         )
+        
         return recommend_list
     
     def inference_from_sample(self, sample):

BIN
models/rank/weights/00000000000000000000000011445301/gbdtlr_model.pkl


+ 2 - 3
utils/result_process.py

@@ -5,7 +5,6 @@ from models.rank.data.config import ImportanceFeaturesMap, ProductConfig
 
 dao = MySqlDao()
 def filter_data(data, filter_dict):
-    
     product_content = []
     for key, value in filter_dict.items():
         if key != 'product_code':
@@ -88,7 +87,7 @@ def generate_recommend_report(city_uuid, recommend_data, delivery_count):
        
 def get_cust_list_from_history_order(city_uuid, product_code):
     # 获取订单数据并处理
-    order_data = dao.get_order_by_product(city_uuid, product_code)
+    order_data = dao.get_eval_order_by_product(city_uuid, product_code)
     order_data = order_data[["cust_code", "cust_name", "product_code", "product_name", "sale_qty", "sale_amt"]]
     
     # 确保cust_code是字符串类型
@@ -99,7 +98,7 @@ def get_cust_list_from_history_order(city_uuid, product_code):
     
     # 读取推荐数据
     recommend_data = pd.read_csv('./data/recommend_report.csv')
-    recommend_data = recommend_data.drop(columns=["sale_qty"])
+    # recommend_data = recommend_data.drop(columns=["sale_qty"])
     # 确保recommend_data中的cust_code也是字符串类型
     recommend_data["cust_code"] = recommend_data["cust_code"].astype(str)
     cust_ids = recommend_data.set_index("cust_code")