similarity_matrix.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. from database import MySqlDao
  2. from itertools import combinations
  3. from models.rank.data.config import OrderConfig
  4. import numpy as np
  5. import pandas as pd
  6. from tqdm import tqdm
  7. class SimilarityMatrix:
  8. def __init__(self, data):
  9. self._order_data = data.copy()
  10. self._load_data()
  11. self._build_co_occurace_matrix()
  12. def _load_data(self):
  13. """加载订单记录表"""
  14. self._order_data =self._order_data[OrderConfig.FEATURE_COLUMNS]
  15. # 数据清洗
  16. self._order_data["sale_qty"] = self._order_data["sale_qty"].fillna(0)
  17. self._order_data = self._order_data.groupby(["cust_code", "product_code"], as_index=False)["sale_qty"].mean()
  18. self._order_data = self._order_data[self._order_data["sale_qty"] != 0]
  19. def _build_co_occurace_matrix(self):
  20. """构建商户共现矩阵"""
  21. # 获取所有商户的唯一列表
  22. self._shops = self._order_data["cust_code"].unique()
  23. num_shops = len(self._shops)
  24. # 创建商户到索引的映射
  25. self._shops_to_index = {shop: idx for idx, shop in enumerate(self._shops)}
  26. # 初始化共现矩阵(上三角部分)
  27. self._co_occurrence_matrix = np.zeros((num_shops, num_shops), dtype=int)
  28. # 按照品规分组
  29. grouped = self._order_data.groupby("product_code")["cust_code"].apply(list)
  30. # 遍历每个品规的商户列表
  31. for shop_in_product in tqdm(grouped, desc="正在构建共现矩阵..."):
  32. # 生成商户对
  33. shop_pairs = combinations(shop_in_product, 2)
  34. for shop1, shop2 in shop_pairs:
  35. # 获取商户索引
  36. idx1 = self._shops_to_index[shop1]
  37. idx2 = self._shops_to_index[shop2]
  38. # 更新共现矩阵
  39. self._co_occurrence_matrix[idx1, idx2] += 1
  40. self._co_occurrence_matrix[idx2, idx1] += 1
  41. def generate_similarity_matrix(self):
  42. """使用向量计算商铺之间的相似度矩阵"""
  43. # 计算每个商铺售卖品规的总次数
  44. shop_counts = self._order_data.groupby("cust_code").size()
  45. # 将商户售卖次数转换为数组
  46. counts = np.array([shop_counts[shop] for shop in self._shops_to_index.keys()])
  47. # 计算分母部分 (sqrt(count_i * count_j))
  48. denominator = np.sqrt(np.outer(counts, counts))
  49. # 计算相似度矩阵
  50. self._similarity_matrix = self._co_occurrence_matrix / denominator
  51. # 将对角线设置为1
  52. np.fill_diagonal(self._similarity_matrix, 1.0)
  53. # 保存结果
  54. self._similarity_matrix = pd.DataFrame(self._similarity_matrix, index=self._shops, columns=self._shops)
  55. # self._similarity_matrix.to_csv(save_path, index=True, encoding="utf-8")
  56. return self._similarity_matrix
  57. if __name__ == "__main__":
  58. similarity_matrix_save_path = "./data/itemcf/similarity.csv"
  59. similarity_matrix = SimilarityMatrix("00000000000000000000000011445301")
  60. similarity_matrix.generate_similarity_matrix(similarity_matrix_save_path)