Sherlock преди 11 месеца
родител
ревизия
9f941bfea8

+ 1 - 1
gbdt_lr.py

@@ -134,7 +134,7 @@ def run():
     parser.add_argument("--recommend", action='store_true')
     parser.add_argument("--importance", action='store_true')
     
-    parser.add_argument("--train_data_dir", type=str, default="./data")
+    parser.add_argument("--train_data_dir", type=str, default="./data/gbdt")
     parser.add_argument("--model_path", type=str, default="./models/rank/weights")
     parser.add_argument("--model_name", type=str, default='model.pkl')
     parser.add_argument("--last_n", type=int, default=200)

+ 1 - 3
models/__init__.py

@@ -1,12 +1,10 @@
 #!/usr/bin/env python3
 # -*- coding:utf-8 -*-
 from models.recall.hot_recall import HotRecallModel
-from models.recall.itemCF.calculate_similarity_matrix import calculate_similarity_and_save_results
-from models.recall.itemCF.user_item_score import UserItemScore
+from models.recall.itemCF.score import UserItemScore
 from models.recall.itemCF.ItemCF import ItemCFModel
 __all__ = [
     "HotRecallModel",
     "UserItemScore",
-    "calculate_similarity_and_save_results",
     "ItemCFModel"
 ]

+ 1 - 1
models/rank/data/dataloader.py

@@ -1,7 +1,7 @@
 import pandas as pd
 from models.rank.data.config import CustConfig, ProductConfig, ShopConfig
 from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import StandardScaler,MinMaxScaler
 from models.rank.data.utils import one_hot_embedding
 
 class DataLoader:

+ 5 - 4
models/rank/data/preprocess.py

@@ -10,14 +10,15 @@ class DataProcess():
     def __init__(self, city_uuid, save_dir):
+        """Store ``save_dir`` and load the customer, product, order and shopping data of ``city_uuid``."""
         self._mysql_dao = MySqlDao()
         self.save_dir = save_dir
-        print("正在加载cust_info...")
+        print("gbdt-lr: 正在加载cust_info...")
         self._cust_data = self._mysql_dao.load_cust_data(city_uuid)
-        print("正在加载product_info...")
+        print("gbdt-lr: 正在加载product_info...")
         self._product_data = self._mysql_dao.load_product_data(city_uuid)
-        print("正在加载order_info...")
+        print("gbdt-lr: 正在加载order_info...")
         self._order_data = self._mysql_dao.load_order_data(city_uuid)
         # self._order_data = self._mysql_dao.load_mock_order_data()
-        print("正在加载shopping_info...")
+        print("gbdt-lr: 正在加载shopping_info...")
         self._shopping_data = self._mysql_dao.load_shopping_data(city_uuid)
         
     def data_process(self):

+ 8 - 10
models/recall/itemCF/ItemCF.py

@@ -19,7 +19,7 @@ class ItemCFModel:
         
         def process_product(product_code, scores):
             # 获取热度最高的n个商户
-            top_n_shops = scores.nlargest(n, "SCORE")["BB_RETAIL_CUSTOMER_CODE"].values
+            top_n_shops = scores.nlargest(n, "score")["cust_code"].values
             top_n_indices = [self._shop_index[shop] for shop in top_n_shops]
             
             # 找到每个商户最相似的k个商户
@@ -40,7 +40,7 @@ class ItemCFModel:
                 interest_score = 0
                 for shop_idx in top_n_indices:
                     if self._index_shop[candidate_idx] in similar_shops[self._index_shop[shop_idx]]:
-                        shop_score = scores[scores["BB_RETAIL_CUSTOMER_CODE"]==self._index_shop[shop_idx]]["SCORE"].values[0]
+                        shop_score = scores[scores["cust_code"]==self._index_shop[shop_idx]]["score"].values[0]
                         interest_score += shop_score * self._similarity_matrix[shop_idx, candidate_idx]
                 interest_scores[self._index_shop[candidate_idx]] = interest_score
             
@@ -52,7 +52,7 @@ class ItemCFModel:
         
         # 并行处理每个品规
         results = Parallel(n_jobs=n_jobs)(delayed(process_product)(product_code, scores) 
-                                          for product_code, scores in tqdm(self._score_df.groupby("PRODUCT_CODE"), desc="train:正在计算候选得分"))
+                                          for product_code, scores in tqdm(self._score_df.groupby("product_code"), desc="train:正在计算候选得分"))
         print(len(results))
         # 存储结果
         self._recommendations = {product_code: sorted_candidates for product_code, sorted_candidates in results}
@@ -85,10 +85,10 @@ class ItemCFModel:
             redis_db.redis.zadd(redis_key, zset_data)
     
 if __name__ == "__main__":
-    score_path = "./models/recall/itemCF/matrix/score.csv"
-    similarity_path = "./models/recall/itemCF/matrix/similarity.csv"
-    # itemcf_model = ItemCFModel()
-    # itemcf_model.train(score_path, similarity_path, n_jobs=4)
+    score_path = "./data/itemcf/scores.csv"
+    similarity_path = "./data/itemcf/similarity.csv"
+    itemcf_model = ItemCFModel()
+    itemcf_model.train(score_path, similarity_path, "00000000000000000000000011445301", n_jobs=4)
     # recommend_list = itemcf_model.inference(110111)
     # itemcf_model.to_redis_zset()
     # print(len(recommend_list))
@@ -98,6 +98,4 @@ if __name__ == "__main__":
     # model = joblib.load("./itemCF.model")
     # recommend_list = model.inference(110102)
     # print(len(recommend_list))
-    # print(recommend_list)
-    data = pd.read_csv(similarity_path, index_col=0)
-    print(data)
+    # print(recommend_list)

+ 0 - 79
models/recall/itemCF/calculate_similarity_matrix.py

@@ -1,79 +0,0 @@
-from database import MySqlDao
-import pandas as pd
-import numpy as np
-
-from itertools import combinations
-from tqdm import tqdm
-
-dao = MySqlDao()
-def build_co_occurence_matrix(order_data):
-    """
-    构建商户共现矩阵
-    """
-    # 获取所有商户的唯一列表
-    shops = order_data["BB_RETAIL_CUSTOMER_CODE"].unique()
-    num_shops = len(shops)
-    
-    # 创建商户到索引的映射
-    shops_to_index = {shop: idx for idx, shop in enumerate(shops)}
-    # 初始化共现矩阵(上三角部分)
-    co_occurrence_matrix = np.zeros((num_shops, num_shops), dtype=int)
-    
-    # 按照品规分组
-    grouped = order_data.groupby("PRODUCT_CODE")["BB_RETAIL_CUSTOMER_CODE"].apply(list)
-    
-    # 遍历每个品规的商户列表
-    for shop_in_product in grouped:
-        # 生成商户对
-        shop_pairs = combinations(shop_in_product, 2)
-        for shop1, shop2 in shop_pairs:
-            # 获取商户索引
-            idx1 = shops_to_index[shop1]
-            idx2 = shops_to_index[shop2]
-            # 更新共现矩阵
-            co_occurrence_matrix[idx1, idx2] += 1
-            co_occurrence_matrix[idx2, idx1] += 1
-    return co_occurrence_matrix, shops, shops_to_index
-
-def calculate_similarity_matrix(co_occurrence_matrix, order_data, shops_to_index):
-    """
-    使用向量计算商铺之间的相似度矩阵
-    """
-    # 计算每个商铺售卖品规的总次数
-    shop_counts = order_data.groupby("BB_RETAIL_CUSTOMER_CODE").size()
-    
-    # 将商户售卖次数转换为数组
-    counts = np.array([shop_counts[shop] for shop in shops_to_index.keys()])
-    
-    # 计算分母部分 (sqrt(count_i * count_j))
-    denominator = np.sqrt(np.outer(counts, counts))
-    
-    # 计算相似度矩阵
-    similarity_matrix = co_occurrence_matrix / denominator
-    
-    # 将对角线设置为1
-    np.fill_diagonal(similarity_matrix, 1.0)
-    
-    return similarity_matrix
-
-def save_matrix(matrix, shops, save_path):
-    """
-    保存共现矩阵
-    """
-    matrix_df = pd.DataFrame(matrix, index=shops, columns=shops)
-    matrix_df.to_csv(save_path, index=True, encoding="utf-8")
-    
-def calculate_similarity_and_save_results(order_data, similarity_matrix_save_path):
-    co_occurrence_matrix, shops, shops_to_index = build_co_occurence_matrix(order_data)
-    similarity_matrix = calculate_similarity_matrix(co_occurrence_matrix, order_data, shops_to_index)
-    save_matrix(similarity_matrix, shops, similarity_matrix_save_path)
-    
-if __name__ == "__main__":
-    co_occurrence_save_path = "./models/recall/itemCF/matrix/occurrence.csv"
-    similarity_matrix_save_path = "./models/recall/itemCF/matrix/similarity.csv"
-    # 从数据库中读取订单数据
-    order_data = dao.load_order_data()
-    
-    calculate_similarity_and_save_results(order_data, similarity_matrix_save_path)
-    
-    

+ 61 - 0
models/recall/itemCF/score.py

@@ -0,0 +1,61 @@
+import os
+
+from database import MySqlDao
+from models.rank.data.config import OrderConfig
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+
+
+class UserItemScore:
+    """Product-to-customer score table built from order quantities.
+
+    Each (customer, product) pair is scored from its summed ``sale_qty``:
+    the sums are standardized, passed through a sigmoid and multiplied by 100.
+    """
+
+    def __init__(self, city_uuid):
+        """Create the DAO and load the order data for ``city_uuid``."""
+        self._dao = MySqlDao()
+        self._load_data(city_uuid)
+
+    def _load_data(self, city_uuid):
+        """Load the order records and convert ``sale_qty`` into a score."""
+        print("item-cf: 正在加载order_info...")
+        order_data = self._dao.load_order_data(city_uuid)
+        # Take an explicit copy: assigning a column on the bare selection
+        # triggers pandas' SettingWithCopyWarning.
+        order_data = order_data[OrderConfig.FEATURE_COLUMNS].copy()
+
+        # Data cleaning: treat missing quantities as 0, sum them per
+        # (customer, product) pair and drop pairs whose total is 0.
+        order_data["sale_qty"] = order_data["sale_qty"].fillna(0)
+        order_data = order_data.groupby(["cust_code", "product_code"], as_index=False)["sale_qty"].sum()
+        order_data = order_data[order_data["sale_qty"] != 0].copy()
+
+        # Normalization: z-score, then sigmoid, then scale by 100.
+        # StandardScaler raises on zero samples, so skip it for empty data.
+        if not order_data.empty:
+            scaler = StandardScaler()
+            normalized = scaler.fit_transform(order_data["sale_qty"].values.reshape(-1, 1))
+            order_data["sale_qty"] = ((1 / (1 + np.exp(-normalized))) * 100).flatten()
+        self._order_data = order_data
+
+    def generate_product_scores(self, save_path):
+        """Write the score table to ``save_path`` as a UTF-8 CSV without index.
+
+        Rows are sorted by ``product_code`` ascending and ``score`` descending;
+        the columns are ``product_code``, ``cust_code`` and ``score``.
+        """
+        self._order_data = self._order_data.rename(columns={'sale_qty': 'score'})
+        self._order_data = self._order_data.sort_values(['product_code', 'score'], ascending=[True, False])
+        self._score_data = self._order_data[['product_code', 'cust_code', 'score']]
+        # to_csv does not create missing directories, so make sure the parent exists.
+        if isinstance(save_path, (str, os.PathLike)):
+            os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
+        self._score_data.to_csv(save_path, index=False, encoding="utf-8")
+
+
+if __name__ == "__main__":
+    save_path = "./data/itemcf/scores.csv"
+    score_utils = UserItemScore("00000000000000000000000011445301")
+    score_utils.generate_product_scores(save_path)

+ 91 - 0
models/recall/itemCF/similarity_matrix.py

@@ -0,0 +1,91 @@
+import os
+
+from database import MySqlDao
+from itertools import combinations
+from models.rank.data.config import OrderConfig
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+
+class SimilarityMatrix:
+    """Customer-to-customer similarity based on the products both ordered.
+
+    Two customers co-occur once per product that both of them ordered. The
+    similarity is ``co_occurrence / sqrt(count_i * count_j)``, where ``count``
+    is the number of cleaned order rows of a customer.
+    """
+
+    def __init__(self, city_uuid):
+        """Load the order data of ``city_uuid`` and build the co-occurrence matrix."""
+        self._dao = MySqlDao()
+        self._load_data(city_uuid)
+        self._build_co_occurrence_matrix()
+
+    def _load_data(self, city_uuid):
+        """Load the order records, reduced to one row per (customer, product)."""
+        print("item-cf: 正在加载order_info...")
+        order_data = self._dao.load_order_data(city_uuid)
+        # Take an explicit copy: assigning a column on the bare selection
+        # triggers pandas' SettingWithCopyWarning.
+        order_data = order_data[OrderConfig.FEATURE_COLUMNS].copy()
+
+        # Data cleaning: treat missing quantities as 0, sum them per
+        # (customer, product) pair and drop pairs whose total is 0.
+        order_data["sale_qty"] = order_data["sale_qty"].fillna(0)
+        order_data = order_data.groupby(["cust_code", "product_code"], as_index=False)["sale_qty"].sum()
+        self._order_data = order_data[order_data["sale_qty"] != 0]
+
+    def _build_co_occurrence_matrix(self):
+        """Count, for every pair of customers, the products both of them ordered."""
+        # Unique list of all customers.
+        self._shops = self._order_data["cust_code"].unique()
+        num_shops = len(self._shops)
+
+        # Map every customer to its row/column index.
+        self._shops_to_index = {shop: idx for idx, shop in enumerate(self._shops)}
+        # Full symmetric matrix; both triangles are filled.
+        self._co_occurrence_matrix = np.zeros((num_shops, num_shops), dtype=int)
+
+        # Customers grouped by product.
+        grouped = self._order_data.groupby("product_code")["cust_code"].apply(list)
+
+        for shop_in_product in tqdm(grouped, desc="正在构建共现矩阵..."):
+            indices = [self._shops_to_index[shop] for shop in shop_in_product]
+            # _load_data keeps one row per (customer, product), so the indices
+            # are unique and one block update adds 1 to every pair at once.
+            self._co_occurrence_matrix[np.ix_(indices, indices)] += 1
+
+        # The block updates also paired each customer with itself; undo that.
+        np.fill_diagonal(self._co_occurrence_matrix, 0)
+
+    # Keep the original (misspelled) method name working.
+    _build_co_occurace_matrix = _build_co_occurrence_matrix
+
+    def calculate_similarity_matrix(self, save_path):
+        """Compute the similarity matrix and write it to ``save_path`` as CSV.
+
+        The diagonal is set to 1 and the rows and columns are labelled with
+        the customer codes.
+        """
+        # Number of cleaned order rows per customer, in matrix order.
+        shop_counts = self._order_data.groupby("cust_code").size()
+        counts = shop_counts.reindex(self._shops).to_numpy()
+
+        # Denominator: sqrt(count_i * count_j).
+        denominator = np.sqrt(np.outer(counts, counts))
+        similarity = self._co_occurrence_matrix / denominator
+        np.fill_diagonal(similarity, 1.0)
+
+        # Save the result.
+        self._similarity_matrix = pd.DataFrame(similarity, index=self._shops, columns=self._shops)
+        # to_csv does not create missing directories, so make sure the parent exists.
+        if isinstance(save_path, (str, os.PathLike)):
+            os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
+        self._similarity_matrix.to_csv(save_path, index=True, encoding="utf-8")
+
+
+if __name__ == "__main__":
+    similarity_matrix_save_path = "./data/itemcf/similarity.csv"
+    similarity_matrix = SimilarityMatrix("00000000000000000000000011445301")
+    similarity_matrix.calculate_similarity_matrix(similarity_matrix_save_path)

+ 0 - 82
models/recall/itemCF/user_item_score.py

@@ -1,82 +0,0 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-'''
-@filename     : ShopScore.py
-@description     : 品规-商户-评分矩阵:品规(用户)对商铺(物品)的评分矩阵,将结果保存在score.csv文件中
-@time     : 2025/01/31/02
-@author     : Sherlock1011 & Min1027
-@Version     : 1.0
-'''
-
-
-from database import MySqlDao
-from decimal import Decimal
-
-# 算法封装成一个类
-class UserItemScore:
-    """TODO 1. 将结果保存到redis数据库中"""
-    def __init__(self):
-        self.weights = {
-            "MONTH6_SALE_QTY": Decimal(0.1),
-            "MONTH6_SALE_AMT": Decimal(0.1),
-            "MONTH6_GROSS_PROFIT_RATE": Decimal(0.03),
-            "MONTH6_SALE_QTY_YOY": Decimal(0.1),
-            "MONTH6_SALE_QTY_MOM": Decimal(0.1),
-            "MONTH6_SALE_AMT_YOY": Decimal(0.1),
-            "MONTH6_SALE_AMT_MOM": Decimal(0.1),
-            "ORDER_FULLORDR_RATE": Decimal(0.1),
-            "CUSTOMER_REPURCHASE_RATE": Decimal(0.1),
-            "NEW_PRODUCT_ORDER_QTY_OCC": Decimal(0.03),
-            "LISTING_RATE": Decimal(0.1),
-            "OUT_STOCK_DAYS": Decimal(0.02),
-            "RETAIL_PRICE_INDEX": Decimal(0.02)
-        }
-        self.dao = MySqlDao()
-
-    # 均值方差归一化函数
-    def standardize_column(self, column):
-        if(column.max() == column.min() and column.max() == 0):
-            return 0
-        elif (column.max() == column.min() and column.max() != 0):
-            return 1
-        else:
-            return (column - column.min()) / (column.max() - column.min())
-
-    # 按照品规分组归一化并计算评分
-    def calculate_heart_per_product(self, group):
-        for column in self.weights.keys():
-            if column == "OUT_STOCK_DAYS":
-                group[column] = 1 - self.standardize_column(group[column])
-            else:
-                group[column] = self.standardize_column(group[column])
-        group["SCORE"] = group.apply(
-            lambda row: sum(Decimal(row[col]) * weight for col, weight in self.weights.items()) * 100, axis=1
-        )
-        return group
-
-    # 主算法函数:计算品规-商铺评分矩阵
-    def score(self, order_data):
-       
-
-        # 应用分组计算
-        df_result = order_data.groupby("PRODUCT_CODE").apply(self.calculate_heart_per_product).reset_index(drop=True)
-        df_result = df_result.sort_values(by=["PRODUCT_CODE", "SCORE"], ascending=[True, False])
-
-        # 选择要保存的列
-        return df_result[['PRODUCT_CODE', 'BB_RETAIL_CUSTOMER_CODE', 'SCORE']]
- 
-if __name__ == "__main__":
-    # 创建一个 ItemCF 类的实例
-    item_cf_algorithm = UserItemScore()
-    dao = MySqlDao()
-    # 读取数据
-    order_data = dao.load_order_data()
-
-    # 调用算法
-    scores = item_cf_algorithm.score(order_data)
-    
-    scores_path = "./models/recall/itemCF/matrix/score.csv"
-    
-    # 保存评分结果到csv文件
-    scores.to_csv(scores_path, index=False, encoding="utf-8")
-