"""Build a shop co-occurrence matrix from order data and save it as CSV."""

import os
from itertools import combinations

import numpy as np
import pandas as pd

from dao.mysql_client import Mysql


def load_data_from_dataset():
    """Load the order table from the database and apply basic cleaning.

    Returns:
        The full ``mock_order`` table as returned by ``Mysql.load_data``
        (assumed to be a pandas DataFrame -- TODO confirm against the DAO),
        with duplicate rows dropped and missing values replaced by 0.
    """
    client = Mysql()
    tablename = "mock_order"
    query_text = "*"
    df = client.load_data(tablename, query_text)

    # Drop duplicate rows and fill missing values.
    df.drop_duplicates(inplace=True)
    df.fillna(0, inplace=True)
    return df


def build_co_occurence_matrix(order_data):
    """Build the symmetric shop-by-shop co-occurrence matrix.

    Two different shops co-occur once for every product code that both of
    them appear with in ``order_data``. A shop never co-occurs with itself,
    so the diagonal stays 0.

    Args:
        order_data: DataFrame with at least the columns
            ``BB_RETAIL_CUSTOMER_CODE`` (shop) and ``PRODUCT_CODE``.

    Returns:
        A tuple ``(matrix, shops)`` where ``shops`` is the array of unique
        shop codes in order of first appearance and ``matrix`` is an integer
        array of shape ``(len(shops), len(shops))`` whose row/column ``i``
        corresponds to ``shops[i]``.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Unique shops, in order of first appearance, and their matrix positions.
    shops = order_data["BB_RETAIL_CUSTOMER_CODE"].unique()
    shops_to_index = {shop: idx for idx, shop in enumerate(shops)}

    num_shops = len(shops)
    co_occurrence_matrix = np.zeros((num_shops, num_shops), dtype=int)

    # Distinct shops per product. Deduplicating here matters: a shop that
    # appears on several rows for the same product would otherwise be paired
    # with itself (polluting the diagonal) and would inflate its counts with
    # every other shop of that product.
    shops_per_product = order_data.groupby("PRODUCT_CODE")[
        "BB_RETAIL_CUSTOMER_CODE"
    ].unique()

    for shops_in_product in shops_per_product:
        indices = [shops_to_index[shop] for shop in shops_in_product]
        for idx1, idx2 in combinations(indices, 2):
            # Keep the matrix symmetric by updating both triangles.
            co_occurrence_matrix[idx1, idx2] += 1
            co_occurrence_matrix[idx2, idx1] += 1

    return co_occurrence_matrix, shops


def save_co_occurrence_matrix(matrix, shops, save_path):
    """Write the co-occurrence matrix to CSV with shop codes as labels.

    Args:
        matrix: Square 2-D array of co-occurrence counts.
        shops: Shop codes used as both the row index and the column header;
            must have one entry per matrix row/column.
        save_path: Destination passed to ``DataFrame.to_csv``. When it is a
            filesystem path, missing parent directories are created first.
    """
    # Only touch the filesystem for real paths; to_csv also accepts buffers.
    if isinstance(save_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(save_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    matrix_df = pd.DataFrame(matrix, index=shops, columns=shops)
    matrix_df.to_csv(save_path, index=True, encoding="utf-8")


def main():
    """Load the orders, build the co-occurrence matrix and save it to disk."""
    save_path = "./models/recall/itemCF/matrix/occurrence.csv"

    order_data = load_data_from_dataset()
    co_occurrence_matrix, shops = build_co_occurence_matrix(order_data)
    save_co_occurrence_matrix(co_occurrence_matrix, shops, save_path)


if __name__ == "__main__":
    main()