import os
from itertools import combinations

import numpy as np
import pandas as pd
from tqdm import tqdm

from database import MySqlDao

# Column names of the order table read by the functions below.
_SHOP_COLUMN = "BB_RETAIL_CUSTOMER_CODE"
_PRODUCT_COLUMN = "PRODUCT_CODE"

# Module-level DAO handle, instantiated when the module is imported.
dao = MySqlDao()


def build_co_occurence_matrix(order_data: pd.DataFrame):
    """Build the shop-by-shop co-occurrence matrix from order rows.

    Rows are grouped by ``PRODUCT_CODE``; every unordered pair of rows
    inside one group increments the matrix symmetrically for the two shops
    of that pair.

    NOTE: rows are counted as-is. If the same shop appears several times
    under one product, each appearance is paired separately (including with
    itself, which adds 2 to the diagonal cell per such pair).

    Args:
        order_data: Frame with ``BB_RETAIL_CUSTOMER_CODE`` and
            ``PRODUCT_CODE`` columns.

    Returns:
        A tuple ``(co_occurrence_matrix, shops, shops_to_index)`` where
        ``co_occurrence_matrix`` is a square integer array, ``shops`` is the
        array of unique shop codes in order of first appearance, and
        ``shops_to_index`` maps each shop code to its row/column index.
    """
    # Unique shop codes; their position defines the matrix row/column index.
    shops = order_data[_SHOP_COLUMN].unique()
    shops_to_index = {shop: idx for idx, shop in enumerate(shops)}

    # Full symmetric matrix (both [i, j] and [j, i] are written below).
    num_shops = len(shops)
    co_occurrence_matrix = np.zeros((num_shops, num_shops), dtype=int)

    # One list of shop codes per product.
    shops_per_product = order_data.groupby(_PRODUCT_COLUMN)[_SHOP_COLUMN].apply(list)

    for shops_in_product in shops_per_product:
        # Resolve codes to indices once per row instead of twice per pair.
        indices = [shops_to_index[shop] for shop in shops_in_product]
        for idx1, idx2 in combinations(indices, 2):
            co_occurrence_matrix[idx1, idx2] += 1
            co_occurrence_matrix[idx2, idx1] += 1

    return co_occurrence_matrix, shops, shops_to_index


def calculate_similarity_matrix(co_occurrence_matrix, order_data: pd.DataFrame, shops_to_index):
    """Compute the shop-to-shop similarity matrix.

    Each cell is ``co_occurrence[i, j] / sqrt(count_i * count_j)`` where
    ``count_k`` is the number of rows in ``order_data`` belonging to shop
    ``k``. The diagonal is then overwritten with ``1.0``.

    NOTE: because counts and co-occurrences are row based, duplicate
    (product, shop) rows can push off-diagonal values above 1.

    Args:
        co_occurrence_matrix: Square array indexed consistently with
            ``shops_to_index``.
        order_data: Frame with a ``BB_RETAIL_CUSTOMER_CODE`` column.
        shops_to_index: Mapping from shop code to matrix index.

    Returns:
        A float array with the same shape as ``co_occurrence_matrix``.

    Raises:
        KeyError: If a shop in ``shops_to_index`` has no rows in
            ``order_data``.
    """
    # Number of order rows per shop.
    shop_counts = order_data.groupby(_SHOP_COLUMN).size()

    # Order shops by their matrix index explicitly, so the counts line up
    # with the matrix even if the mapping was not built in index order.
    ordered_shops = sorted(shops_to_index, key=shops_to_index.get)
    counts = np.array([shop_counts.loc[shop] for shop in ordered_shops])

    # Denominator sqrt(count_i * count_j) for every pair (i, j).
    denominator = np.sqrt(np.outer(counts, counts))

    similarity_matrix = co_occurrence_matrix / denominator

    # A shop is defined to be fully similar to itself.
    np.fill_diagonal(similarity_matrix, 1.0)

    return similarity_matrix


def save_matrix(matrix, shops, save_path):
    """Write a shop-by-shop matrix to CSV with shop codes as row/column labels.

    Args:
        matrix: Square 2-D array whose rows and columns follow ``shops``.
        shops: Shop codes used as both index and column labels.
        save_path: Destination passed to ``DataFrame.to_csv``. When it is a
            filesystem path, its parent directory is created if missing.
    """
    # Only touch the filesystem for real paths; file-like targets pass through.
    if isinstance(save_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(save_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    matrix_df = pd.DataFrame(matrix, index=shops, columns=shops)
    matrix_df.to_csv(save_path, index=True, encoding="utf-8")


def calculate_similarity_and_save_results(order_data, similarity_matrix_save_path):
    """Run the full pipeline: co-occurrence -> similarity -> CSV file.

    Args:
        order_data: Frame with ``BB_RETAIL_CUSTOMER_CODE`` and
            ``PRODUCT_CODE`` columns.
        similarity_matrix_save_path: Where the similarity matrix CSV is
            written.
    """
    co_occurrence_matrix, shops, shops_to_index = build_co_occurence_matrix(order_data)
    similarity_matrix = calculate_similarity_matrix(
        co_occurrence_matrix, order_data, shops_to_index
    )
    save_matrix(similarity_matrix, shops, similarity_matrix_save_path)


if __name__ == "__main__":
    similarity_matrix_save_path = "./models/recall/itemCF/matrix/similarity.csv"

    # Load the order data from the database.
    order_data = dao.load_order_data()

    calculate_similarity_and_save_results(order_data, similarity_matrix_save_path)