from itertools import combinations

import numpy as np
import pandas as pd
from tqdm import tqdm

from database import MySqlDao
from models.rank.data.config import OrderConfig


class SimilarityMatrix:
    """Shop-to-shop similarity derived from which products each shop sold.

    Two shops co-occur once for every product both of them have a non-zero
    aggregated ``sale_qty`` for. The similarity of shops *i* and *j* is
    ``co_occurrence[i, j] / sqrt(count_i * count_j)``, where ``count_x`` is
    the number of distinct products retained for shop *x*.
    """

    def __init__(self, data: pd.DataFrame):
        """Clean the order records and build the co-occurrence matrix.

        Args:
            data: Order records. Must contain every column listed in
                ``OrderConfig.FEATURE_COLUMNS``; the code below additionally
                relies on ``cust_code``, ``product_code`` and ``sale_qty``.
                The caller's frame is copied, never modified.
        """
        self._order_data = data.copy()
        self._load_data()
        self._build_co_occurace_matrix()

    def _load_data(self) -> None:
        """Reduce the order records to one row per (shop, product) pair.

        Missing quantities count as 0, quantities are summed per
        (``cust_code``, ``product_code``), and pairs whose total is 0 are
        dropped.
        """
        # Copy explicitly before mutating: assigning a column on a
        # column-sliced frame is the chained-assignment pattern pandas
        # warns about.
        orders = self._order_data[OrderConfig.FEATURE_COLUMNS].copy()

        # Data cleaning.
        orders["sale_qty"] = orders["sale_qty"].fillna(0)
        totals = orders.groupby(
            ["cust_code", "product_code"], as_index=False
        )["sale_qty"].sum()
        self._order_data = totals[totals["sale_qty"] != 0]

    def _build_co_occurace_matrix(self) -> None:
        """Build the symmetric shop co-occurrence matrix.

        ``self._co_occurrence_matrix[i, j]`` ends up as the number of
        products that both shop ``i`` and shop ``j`` appear under; the
        diagonal is left at 0.
        """
        # Unique shops, in order of first appearance.
        self._shops = self._order_data["cust_code"].unique()
        num_shops = len(self._shops)

        # Shop code -> row/column position in the matrix.
        self._shops_to_index = {
            shop: idx for idx, shop in enumerate(self._shops)
        }

        self._co_occurrence_matrix = np.zeros((num_shops, num_shops), dtype=int)

        # One list of shops per product.
        grouped = self._order_data.groupby("product_code")["cust_code"].apply(list)

        shop_index = self._shops_to_index
        matrix = self._co_occurrence_matrix
        for shops_in_product in tqdm(grouped, desc="正在构建共现矩阵..."):
            if len(shops_in_product) < 2:
                # A single shop forms no pair.
                continue
            idx = np.array(
                [shop_index[shop] for shop in shops_in_product], dtype=np.intp
            )
            # Each shop appears at most once per product (rows are unique
            # per (shop, product) after _load_data), so one block increment
            # is equivalent to adding 1 for every unordered pair in both
            # directions. It also touches the diagonal, which is reset below.
            matrix[np.ix_(idx, idx)] += 1

        # A shop does not co-occur with itself.
        np.fill_diagonal(matrix, 0)

    def generate_similarity_matrix(self, save_path=None) -> pd.DataFrame:
        """Compute the shop-to-shop similarity matrix.

        Args:
            save_path: Optional CSV destination. When given, the matrix is
                written there (with the shop index, UTF-8 encoded). When
                ``None`` (the default) nothing is written.

        Returns:
            A square DataFrame indexed and labelled by shop code, holding
            ``co_occurrence / sqrt(count_i * count_j)`` with the diagonal
            set to 1.0.
        """
        # Number of retained (shop, product) rows per shop, aligned with the
        # matrix row order.
        shop_counts = self._order_data.groupby("cust_code").size()
        counts = shop_counts.reindex(self._shops).to_numpy()

        # Denominator: sqrt(count_i * count_j) for every shop pair.
        denominator = np.sqrt(np.outer(counts, counts))

        similarity = self._co_occurrence_matrix / denominator

        # Every shop is fully similar to itself.
        np.fill_diagonal(similarity, 1.0)

        self._similarity_matrix = pd.DataFrame(
            similarity, index=self._shops, columns=self._shops
        )

        if save_path is not None:
            self._similarity_matrix.to_csv(save_path, index=True, encoding="utf-8")

        return self._similarity_matrix


if __name__ == "__main__":
    similarity_matrix_save_path = "./data/itemcf/similarity.csv"
    # NOTE(review): SimilarityMatrix.__init__ calls ``data.copy()`` and then
    # indexes columns, so it needs a DataFrame of order records. This string
    # argument looks like a leftover identifier from an earlier loader and
    # will fail as written -- confirm how the orders should be loaded.
    similarity_matrix = SimilarityMatrix("00000000000000000000000011445301")
    similarity_matrix.generate_similarity_matrix(similarity_matrix_save_path)