Ver Fonte

item2vec计算卷烟相似度

yangzeyu há 11 meses
pai
commit
43af8d1fc8

+ 1 - 1
database/dao/mysql_dao.py

@@ -19,7 +19,7 @@ class MySqlDao:
         self.db_helper = MySqlDatabaseHelper()
         self._product_tablename = "tads_brandcul_product_info_f"
         self._cust_tablename = "tads_brandcul_cust_info_f"
-        self._order_tablename = "tads_brandcul_consumer_order"
+        self._order_tablename = "tads_brandcul_consumer_order_check"
         # self._order_tablename = "tads_brandcul_consumer_order"
         # self._eval_order_name = "tads_brandcul_consumer_order_check"
         self._mock_order_tablename = "yunfu_mock_data"

+ 7 - 5
database/db/mysql.py

@@ -40,14 +40,16 @@ class MySqlDatabaseHelper:
         # 通过连接池创建engine
         self.engine = create_engine(
             conn,
-            pool_size=10, # 设置连接池大小
-            max_overflow=20, # 超过连接池大小时的额外连接数
-            pool_recycle=3600 # 回收连接时间
+            pool_size=20, # 设置连接池大小
+            max_overflow=30, # 超过连接池大小时的额外连接数
+            pool_recycle=1800, # 回收连接时间
+            pool_pre_ping=True, # 防止断开连接
+            isolation_level="READ COMMITTED" # 降低隔离级别
         )
         
         self._DBSession = sessionmaker(bind=self.engine)
-        
-    def load_data_with_page(self, query, params, page_size=1000):
+    
+    def load_data_with_page(self, query, params, page_size=100000):
         """分页查询数据"""
         data = pd.DataFrame()
         count_query = text(query.replace("SELECT *", "SELECT COUNT(*)"))

+ 19 - 19
inference.py

@@ -39,7 +39,7 @@ def get_recall_cust(city_uuid, product_id, recall_count):
 
 def generate_recommend_sample(city_uuid, product_id):
     """生成预测数据集"""
-    recall_count = 300
+    recall_count = 1000
     cust_list = get_recall_cust(city_uuid, product_id, recall_count)
     
     product_data = dao.get_product_by_id(city_uuid, product_id)[ProductConfig.FEATURE_COLUMNS]
@@ -83,7 +83,7 @@ def run():
     pass
 
 if __name__ == '__main__':
-    # generate_features_shap("00000000000000000000000011445301", "420202", delivery_count=5000)
+    generate_features_shap("00000000000000000000000011445301", "420202", delivery_count=5000)
     # recommend_list = get_recommend_list("00000000000000000000000011445301", "420202")
     # recommend_list = pd.DataFrame(recommend_list)
     # recommend_list.to_csv("./data/recommend_list.csv", index=False, encoding="utf-8-sig")
@@ -93,24 +93,24 @@ if __name__ == '__main__':
     # data = data.groupby(["cust_code", "product_code", "product_name"], as_index=False)["sale_qty"].sum()
     # data.to_csv("./data/cust.csv", index=False)
     
-    city_uuid = "00000000000000000000000011445301"
-    order_data = dao.get_order_by_cust("00000000000000000000000011445301", "445323105795")
-    order_data["sale_qty"] = order_data["sale_qty"].fillna(0)
-    order_data = order_data.infer_objects(copy=False)
-    order_data = order_data.groupby(["cust_code", "product_code", "product_name"], as_index=False)["sale_qty"].sum()
+    # city_uuid = "00000000000000000000000011445301"
+    # order_data = dao.get_order_by_cust("00000000000000000000000011445301", "445323105795")
+    # order_data["sale_qty"] = order_data["sale_qty"].fillna(0)
+    # order_data = order_data.infer_objects(copy=False)
+    # order_data = order_data.groupby(["cust_code", "product_code", "product_name"], as_index=False)["sale_qty"].sum()
     
-    cust_data = dao.load_cust_data(city_uuid)[CustConfig.FEATURE_COLUMNS]
-    sample_data_clear(cust_data, CustConfig)
-    shop_data = dao.load_shopping_data(city_uuid)[ShopConfig.FEATURE_COLUMNS]
-    sample_data_clear(shop_data, ShopConfig)
-    cust_ids = shop_data.set_index("cust_code")
-    cust_data = cust_data.join(cust_ids, on="BB_RETAIL_CUSTOMER_CODE", how="inner")
+    # cust_data = dao.load_cust_data(city_uuid)[CustConfig.FEATURE_COLUMNS]
+    # sample_data_clear(cust_data, CustConfig)
+    # shop_data = dao.load_shopping_data(city_uuid)[ShopConfig.FEATURE_COLUMNS]
+    # sample_data_clear(shop_data, ShopConfig)
+    # cust_ids = shop_data.set_index("cust_code")
+    # cust_data = cust_data.join(cust_ids, on="BB_RETAIL_CUSTOMER_CODE", how="inner")
     
-    product_data = dao.load_product_data(city_uuid)[ProductConfig.FEATURE_COLUMNS]
-    sample_data_clear(product_data, ProductConfig)
+    # product_data = dao.load_product_data(city_uuid)[ProductConfig.FEATURE_COLUMNS]
+    # sample_data_clear(product_data, ProductConfig)
     
-    order_data = order_data.merge(product_data, on="product_code", how="inner")
-    order_data = order_data.merge(cust_data, left_on='cust_code', right_on='BB_RETAIL_CUSTOMER_CODE', how="inner")
+    # order_data = order_data.merge(product_data, on="product_code", how="inner")
+    # order_data = order_data.merge(cust_data, left_on='cust_code', right_on='BB_RETAIL_CUSTOMER_CODE', how="inner")
     
-    result = gbdtlr_model.inference_from_sample(order_data)
-    result.to_csv("./data/junlong.csv", index=False)
+    # result = gbdtlr_model.inference_from_sample(order_data)
+    # result.to_csv("./data/junlong.csv", index=False)

+ 70 - 12
models/item2vec/item2vec.py

@@ -1,34 +1,92 @@
 import joblib
+from database.dao.mysql_dao import MySqlDao
 from models.item2vec import Item2VecDataProcess
+import numpy as np
 from gensim.models import Word2Vec
+
+from models.rank.data.config import ProductConfig
+from models.rank.data.utils import sample_data_clear
+import pandas as pd
+from sklearn.metrics.pairwise import cosine_similarity
+from tqdm import tqdm
+
 class Item2Vec:
     def __init__(self, city_uuid):
         self._load_data(city_uuid)
+        self._load_model()
     
     def _load_data(self, city_uuid):
         """加载特征sentence"""
         data_processor = Item2VecDataProcess(city_uuid)
-        self._sentences = data_processor.generate_sentence()
-        
-    def train(self):
+        self._tokens_map = data_processor.generate_tokens()
+        self._tokens = [item["token"] for item in self._tokens_map]
+            
+    def _load_model(self):
         self._model = Word2Vec(
-            self._sentences,
+            self._tokens,
             vector_size=64,
-            window=4,
+            window=3,
             min_count=1,
             sg=1, # skip-gram
             workers=4,
-            epochs=20
+            epochs=20,
+            sample=0.0000001
         )
         
-    def save_model(self, model_path):
-        joblib.dump(self._model, model_path)
+    def token_to_vector(self, tokens):
+        """将token转换为vector"""
+        vector = [self._model.wv[token] for token in tokens if token in self._model.wv]
+        return np.mean(vector, axis=0) if vector else np.zeros(self._model.vector_size)
+    
+    def item_to_token(self, item):
+        token = []
+        for col in ProductConfig.FEATURE_COLUMNS:
+            if col == 'product_code':
+                continue
+            else:
+                token.append(f"{item[col].strip()}")
+        
+        return token
+    
+    def get_similarity_map(self, product):
+        """获取目标卷烟与所有卷烟的相似度"""
+        product = product.squeeze().to_dict()
+        token = self.item_to_token(product)
+        vector = self.token_to_vector(token).reshape(1, -1)
+        
+        similarity_map = []
+        for item in tqdm(self._tokens_map, desc="正在计算卷烟相似度..."):
+            target_product_code = item["product_code"]
+            target_token = item["token"]
+            target_vector = self.token_to_vector(target_token).reshape(1, -1)
+            
+            similarity = cosine_similarity(vector, target_vector)[0][0]
+            
+            similarity_map.append(
+                {
+                    "product_code": product['product_code'], 
+                    "target_product_code": target_product_code,
+                    "similarity": similarity
+                }
+            )
+        similarity_map.sort(key=lambda x: x["similarity"], reverse=True)
+        
+        return similarity_map
+        
+            
+
         
         
 if __name__ == "__main__":
+    dao = MySqlDao()
     city_uuid = "00000000000000000000000011445301"
+    product_id = "420202"
+    order_data = dao.load_order_data(city_uuid)
+    product = dao.get_product_by_id(city_uuid, product_id)[ProductConfig.FEATURE_COLUMNS]
+    product = sample_data_clear(product, ProductConfig)
     model = Item2Vec(city_uuid)
-    print("开始训练Item2Vec...")
-    model.train()
-    
-    
+    sims = model.get_similarity_map(product)
+    sims = pd.DataFrame(sims)
+    product_info = dao.load_product_data(city_uuid)[ProductConfig.FEATURE_COLUMNS]
+    sims = sims.merge(product_info, left_on="target_product_code", right_on="product_code", how="inner")
+    sims.to_csv("./data/product_similarity.csv", index=False)

+ 9 - 11
models/item2vec/preprocess.py

@@ -18,25 +18,23 @@ class Item2VecDataProcess:
         # 数据清洗
         self._product_data = sample_data_clear(self._product_data, ProductConfig)
         
-    def tokenize_features(self, row):
+    def item_to_token(self, row):
         """根据每款烟的特征生成sentence"""
-        tokens = []
+        token = []
         
         for col in ProductConfig.FEATURE_COLUMNS:
             if col == 'product_code':
                 continue
-            if col in ["direct_retail_price", "tbc_total_length"]:
-                tokens.append(f"{col}_{row[col].replace('-', '_')}")
             else:
-                tokens.append(f"{col}_{row[col]}")
-        
-        return tokens
+                token.append(f"{row[col].strip()}")
+        token_map = {"product_code": row['product_code'], "token": token}
+        return token_map
     
-    def generate_sentence(self):
-        sentcens = self._product_data.apply(self.tokenize_features, axis=1).tolist()
-        return sentcens
+    def generate_tokens(self):
+        tokens = self._product_data.apply(self.item_to_token, axis=1).tolist()
+        return tokens
         
 if __name__ == "__main__":
     city_uuid = "00000000000000000000000011445301"
     processor = Item2VecDataProcess(city_uuid)
-    processor.generate_sentence()
+    processor.generate_tokens()

+ 2 - 2
models/rank/gbdt_lr.py

@@ -23,7 +23,7 @@ class Trainer:
             
             # 树结构控制
             'num_leaves': 31,               # 叶子节点数 (建议20-63)
-            'max_depth': 7,                 # 树深度 (3-7)
+            'max_depth': 6,                 # 树深度 (3-7)
             'min_child_samples': 30,        # 叶子节点最小样本数 (20-100)
             'min_split_gain': 0.02,         # 分裂最小增益 (0.01-0.1)
             
@@ -36,7 +36,7 @@ class Trainer:
             
             # 学习控制
             'learning_rate': 0.05,          # 学习率 (0.01-0.1)
-            'n_estimators': 1000,           # 树的数量 (配合早停)
+            'n_estimators': 100,           # 树的数量 (配合早停)
             # 'early_stopping_rounds': 50,    # 早停轮数
             
             # 类别特征处理