calculate_co_occurrence_matrix.py 2.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263
  1. import pandas as pd
  2. import numpy as np
  3. from itertools import combinations
  4. from dao.mysql_client import Mysql
  5. def load_data_from_dataset():
  6. """从数据库中读取数据"""
  7. client = Mysql()
  8. tablename = "mock_order"
  9. query_text = "*"
  10. df = client.load_data(tablename, query_text)
  11. # 去除重复值和填补缺失值
  12. df.drop_duplicates(inplace=True)
  13. df.fillna(0, inplace=True)
  14. return df
  15. def build_co_occurence_matrix(order_data):
  16. """
  17. 构建商户共现矩阵
  18. """
  19. # 获取所有商户的唯一列表
  20. shops = order_data["BB_RETAIL_CUSTOMER_CODE"].unique()
  21. num_shops = len(shops)
  22. # 创建商户到索引的映射
  23. shops_to_index = {shop: idx for idx, shop in enumerate(shops)}
  24. # 初始化共现矩阵(上三角部分)
  25. co_occurrence_matrix = np.zeros((num_shops, num_shops), dtype=int)
  26. # 按照品规分组
  27. grouped = order_data.groupby("PRODUCT_CODE")["BB_RETAIL_CUSTOMER_CODE"].apply(list)
  28. # 遍历每个品规的商户列表
  29. for shop_in_product in grouped:
  30. # 生成商户对
  31. shop_pairs = combinations(shop_in_product, 2)
  32. for shop1, shop2 in shop_pairs:
  33. # 获取商户索引
  34. idx1 = shops_to_index[shop1]
  35. idx2 = shops_to_index[shop2]
  36. # 更新共现矩阵
  37. co_occurrence_matrix[idx1, idx2] += 1
  38. co_occurrence_matrix[idx2, idx1] += 1
  39. return co_occurrence_matrix, shops
  40. def save_co_occurrence_matrix(matrix, shops, save_path):
  41. """
  42. 保存共现矩阵
  43. """
  44. matrix_df = pd.DataFrame(matrix, index=shops, columns=shops)
  45. matrix_df.to_csv(save_path, index=True, encoding="utf-8")
  46. if __name__ == "__main__":
  47. save_path = "./models/recall/itemCF/matrix/occurrence.csv"
  48. order_data = load_data_from_dataset()
  49. co_occurrence_matrix, shops = build_co_occurence_matrix(order_data)
  50. save_co_occurrence_matrix(co_occurrence_matrix, shops, save_path)