# calculate_co_occurrence_matrix.py

  1. import pandas as pd
  2. import numpy as np
  3. from itertools import combinations
  4. from dao.mysql_client import Mysql
  5. from tqdm import tqdm
  6. def load_data_from_dataset():
  7. """从数据库中读取数据"""
  8. client = Mysql()
  9. tablename = "mock_order"
  10. query_text = "*"
  11. df = client.load_data(tablename, query_text)
  12. # 去除重复值和填补缺失值
  13. df.drop_duplicates(inplace=True)
  14. df.fillna(0, inplace=True)
  15. return df
  16. def build_co_occurence_matrix(order_data):
  17. """
  18. 构建商户共现矩阵
  19. """
  20. # 获取所有商户的唯一列表
  21. shops = order_data["BB_RETAIL_CUSTOMER_CODE"].unique()
  22. num_shops = len(shops)
  23. # 创建商户到索引的映射
  24. shops_to_index = {shop: idx for idx, shop in enumerate(shops)}
  25. # 初始化共现矩阵(上三角部分)
  26. co_occurrence_matrix = np.zeros((num_shops, num_shops), dtype=int)
  27. # 按照品规分组
  28. grouped = order_data.groupby("PRODUCT_CODE")["BB_RETAIL_CUSTOMER_CODE"].apply(list)
  29. # 遍历每个品规的商户列表
  30. for shop_in_product in grouped:
  31. # 生成商户对
  32. shop_pairs = combinations(shop_in_product, 2)
  33. for shop1, shop2 in shop_pairs:
  34. # 获取商户索引
  35. idx1 = shops_to_index[shop1]
  36. idx2 = shops_to_index[shop2]
  37. # 更新共现矩阵
  38. co_occurrence_matrix[idx1, idx2] += 1
  39. co_occurrence_matrix[idx2, idx1] += 1
  40. return co_occurrence_matrix, shops, shops_to_index
  41. def calculate_similarity_matrix(co_occurrence_matrix, order_data, shops_to_index):
  42. """
  43. 使用向量计算商铺之间的相似度矩阵
  44. """
  45. # 计算每个商铺售卖品规的总次数
  46. shop_counts = order_data.groupby("BB_RETAIL_CUSTOMER_CODE").size()
  47. num_shops = len(shops_to_index)
  48. # 将商户售卖次数转换为数组
  49. counts = np.array([shop_counts[shop] for shop in shops_to_index.keys()])
  50. # 计算分母部分 (sqrt(count_i * count_j))
  51. denominator = np.sqrt(np.outer(counts, counts))
  52. # 计算相似度矩阵
  53. similarity_matrix = co_occurrence_matrix / denominator
  54. # 将对角线设置为1
  55. np.fill_diagonal(similarity_matrix, 1.0)
  56. return similarity_matrix
  57. def save_matrix(matrix, shops, save_path):
  58. """
  59. 保存共现矩阵
  60. """
  61. matrix_df = pd.DataFrame(matrix, index=shops, columns=shops)
  62. matrix_df.to_csv(save_path, index=True, encoding="utf-8")
  63. if __name__ == "__main__":
  64. co_occurrence_save_path = "./models/recall/itemCF/matrix/occurrence.csv"
  65. similarity_matrix_save_path = "./models/recall/itemCF/matrix/similarity.csv"
  66. order_data = load_data_from_dataset()
  67. co_occurrence_matrix, shops, shops_to_index = build_co_occurence_matrix(order_data)
  68. # save_matrix(co_occurrence_matrix, shops, co_occurrence_save_path)
  69. similarity_matrix = calculate_similarity_matrix(co_occurrence_matrix, order_data, shops_to_index)
  70. save_matrix(similarity_matrix, shops, similarity_matrix_save_path)