| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980 |
- from dao import load_order_data_from_mysql
- import pandas as pd
- import numpy as np
- from itertools import combinations
- from dao.mysql_client import Mysql
- from tqdm import tqdm
def build_co_occurence_matrix(order_data):
    """
    Build the shop-by-shop co-occurrence matrix from order rows.

    Order rows are grouped by ``PRODUCT_CODE``. Within one product, every
    unordered pair of rows adds 1 to both ``[i, j]`` and ``[j, i]``, where
    ``i`` and ``j`` are the indices of the two rows' shops. Rows are not
    de-duplicated: if shop A has ``n`` rows and shop B has ``m`` rows for a
    product, that product contributes ``n * m`` to ``[A, B]`` and ``[B, A]``,
    and ``n * (n - 1)`` to the diagonal cell ``[A, A]``.

    Args:
        order_data: DataFrame with ``BB_RETAIL_CUSTOMER_CODE`` and
            ``PRODUCT_CODE`` columns.

    Returns:
        A tuple ``(co_occurrence_matrix, shops, shops_to_index)``:
        the symmetric integer matrix, the unique shop codes in order of
        first appearance, and a dict mapping each shop code to its
        row/column index in the matrix.
    """
    # Unique shop codes, in order of first appearance.
    shops = order_data["BB_RETAIL_CUSTOMER_CODE"].unique()
    num_shops = len(shops)

    # Shop code -> row/column index in the matrix.
    shops_to_index = {shop: idx for idx, shop in enumerate(shops)}
    co_occurrence_matrix = np.zeros((num_shops, num_shops), dtype=int)

    # One list of shop codes (one entry per order row) for each product.
    grouped = order_data.groupby("PRODUCT_CODE")["BB_RETAIL_CUSTOMER_CODE"].apply(list)

    for shops_in_product in grouped:
        row_indices = np.asarray(
            [shops_to_index[shop] for shop in shops_in_product], dtype=np.intp
        )
        # Collapse the rows to (shop index, number of rows) so the pair
        # counting is done by one outer product instead of a Python loop
        # over every pair of rows.
        shop_idx, row_counts = np.unique(row_indices, return_counts=True)

        # Off-diagonal cells receive n_i * n_j. shop_idx holds no duplicates,
        # so the fancy-indexed in-place add touches each cell exactly once.
        co_occurrence_matrix[np.ix_(shop_idx, shop_idx)] += np.outer(
            row_counts, row_counts
        )
        # The outer product put n_i ** 2 on the diagonal; pairing a shop's
        # rows with each other yields n_i * (n_i - 1), so subtract n_i.
        co_occurrence_matrix[shop_idx, shop_idx] -= row_counts

    return co_occurrence_matrix, shops, shops_to_index
def calculate_similarity_matrix(co_occurrence_matrix, order_data, shops_to_index):
    """
    Convert co-occurrence counts into a shop-to-shop similarity matrix.

    Each cell is ``co_occurrence[i, j] / sqrt(count_i * count_j)``, where
    ``count_k`` is the number of rows in ``order_data`` whose
    ``BB_RETAIL_CUSTOMER_CODE`` is shop ``k``. The diagonal is then
    overwritten with 1.0.

    Args:
        co_occurrence_matrix: Square matrix of co-occurrence counts.
        order_data: DataFrame with a ``BB_RETAIL_CUSTOMER_CODE`` column.
        shops_to_index: Dict of shop code -> matrix index. Its iteration
            order is used as the row/column order of the counts, so it must
            match the ordering of ``co_occurrence_matrix``.

    Returns:
        The similarity matrix, same shape as ``co_occurrence_matrix``.
    """
    # Number of order rows per shop.
    rows_per_shop = order_data.groupby("BB_RETAIL_CUSTOMER_CODE").size()

    # Per-shop counts laid out in the dict's iteration order.
    counts = np.array([rows_per_shop[code] for code in shops_to_index])

    # Pairwise count_i * count_j via broadcasting, then the square root.
    pairwise_product = counts[:, np.newaxis] * counts[np.newaxis, :]
    normaliser = np.sqrt(pairwise_product)

    similarity_matrix = co_occurrence_matrix / normaliser

    # A shop is defined to be fully similar to itself.
    np.fill_diagonal(similarity_matrix, 1.0)
    return similarity_matrix
def save_matrix(matrix, shops, save_path):
    """
    Write a shop-by-shop matrix to a CSV file.

    The matrix is wrapped in a DataFrame that uses ``shops`` as both the row
    index and the column labels, then written as UTF-8 CSV with the index
    included.

    Args:
        matrix: 2-D array-like with one row and one column per shop.
        shops: Shop codes used as row and column labels.
        save_path: Destination of the CSV. When it is a filesystem path,
            missing parent directories are created first.
    """
    import os  # function-scope import keeps this block self-contained

    # to_csv raises if the target directory is missing; create it up front so
    # a finished computation is not lost to a missing output folder.
    if isinstance(save_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(save_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    matrix_df = pd.DataFrame(matrix, index=shops, columns=shops)
    matrix_df.to_csv(save_path, index=True, encoding="utf-8")
-
def calculate_similarity_and_save_results(order_data, similarity_matrix_save_path):
    """
    Run the pipeline: co-occurrence matrix -> similarity matrix -> CSV.

    Args:
        order_data: Order DataFrame passed to ``build_co_occurence_matrix``
            and ``calculate_similarity_matrix``.
        similarity_matrix_save_path: Where ``save_matrix`` writes the
            similarity matrix.
    """
    # Step 1: count co-occurrences and obtain the shop ordering.
    cooc, shop_codes, index_by_shop = build_co_occurence_matrix(order_data)

    # Step 2: normalise the counts into similarities.
    similarity = calculate_similarity_matrix(cooc, order_data, index_by_shop)

    # Step 3: persist the result, labelled by shop code.
    save_matrix(similarity, shop_codes, similarity_matrix_save_path)
-
if __name__ == "__main__":
    # Output locations for the generated matrices.
    # NOTE(review): co_occurrence_save_path is not used by anything visible
    # in this script; only the similarity matrix is written.
    co_occurrence_save_path = "./models/recall/itemCF/matrix/occurrence.csv"
    similarity_matrix_save_path = "./models/recall/itemCF/matrix/similarity.csv"

    # Read the order data from the database.
    order_data = load_order_data_from_mysql()

    # Compute the shop similarity matrix and write it to disk.
    calculate_similarity_and_save_results(
        order_data,
        similarity_matrix_save_path,
    )
-
-
|