# Source repository: huanghongbo/BrandCultivation

import os
from itertools import combinations

import numpy as np
import pandas as pd

from dao.mysql_client import Mysql

def load_data_from_dataset():
    """Load the order table from the database and apply basic cleaning.

    Reads every column (query text ``"*"``) of the ``mock_order`` table
    through the project's ``Mysql`` client, then removes duplicated rows and
    replaces missing values with 0. Both cleaning steps mutate the loaded
    object in place.

    Returns:
        The cleaned object returned by ``Mysql.load_data`` (presumably a
        pandas DataFrame, since it is cleaned with ``drop_duplicates`` and
        ``fillna`` -- confirm against the client implementation).
    """
    orders = Mysql().load_data("mock_order", "*")

    # Drop exact duplicate rows first, then fill the remaining gaps with 0.
    orders.drop_duplicates(inplace=True)
    orders.fillna(0, inplace=True)
    return orders

def build_co_occurence_matrix(order_data):
    """Build the shop-by-shop co-occurrence matrix.

    Two shops co-occur once for every product that both of them ordered.
    Repeated order rows for the same (product, shop) combination are
    collapsed before pairing, so a shop never co-occurs with itself (the
    diagonal stays 0) and repeated orders do not inflate the counts.

    Args:
        order_data: DataFrame with at least the columns
            ``"BB_RETAIL_CUSTOMER_CODE"`` (shop identifier) and
            ``"PRODUCT_CODE"`` (product identifier).

    Returns:
        A tuple ``(co_occurrence_matrix, shops)``:

        * ``co_occurrence_matrix`` -- symmetric integer ndarray of shape
          ``(len(shops), len(shops))``; entry ``[i, j]`` is the number of
          distinct products ordered by both ``shops[i]`` and ``shops[j]``.
        * ``shops`` -- the unique shop codes in order of first appearance;
          position ``i`` labels row/column ``i`` of the matrix.
    """
    shop_col = "BB_RETAIL_CUSTOMER_CODE"
    product_col = "PRODUCT_CODE"

    # Unique shops, in order of first appearance, define the matrix axes.
    shops = order_data[shop_col].unique()
    num_shops = len(shops)

    # Map each shop code to its row/column position.
    shops_to_index = {shop: idx for idx, shop in enumerate(shops)}
    # Full symmetric matrix (both triangles are filled below).
    co_occurrence_matrix = np.zeros((num_shops, num_shops), dtype=int)

    # Keep one row per (product, shop): without this, a shop with several
    # orders of one product would be paired with itself and every real pair
    # would be counted once per combination of duplicate rows.
    unique_pairs = order_data[[product_col, shop_col]].drop_duplicates()

    # For each product, collect the distinct shops that ordered it.
    grouped = unique_pairs.groupby(product_col)[shop_col].apply(list)

    for shops_in_product in grouped:
        # Every unordered pair of distinct shops shares this product.
        for shop1, shop2 in combinations(shops_in_product, 2):
            idx1 = shops_to_index[shop1]
            idx2 = shops_to_index[shop2]
            # Update both mirrored cells to keep the matrix symmetric.
            co_occurrence_matrix[idx1, idx2] += 1
            co_occurrence_matrix[idx2, idx1] += 1
    return co_occurrence_matrix, shops

def save_co_occurrence_matrix(matrix, shops, save_path):
    """Write the co-occurrence matrix to a UTF-8 encoded CSV file.

    The matrix is wrapped in a DataFrame whose row index and column headers
    are both ``shops``, and the index is written as the first CSV column.

    Args:
        matrix: 2-D array-like of co-occurrence counts.
        shops: Labels used for both the rows and the columns of ``matrix``.
        save_path: Destination of the CSV. When it is a filesystem path
            (``str`` or ``os.PathLike``), missing parent directories are
            created first; any other target is handed to ``to_csv`` as is.
    """
    # to_csv does not create directories, so a missing output folder would
    # otherwise discard the computed matrix with an OSError.
    if isinstance(save_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(save_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    matrix_df = pd.DataFrame(matrix, index=shops, columns=shops)
    matrix_df.to_csv(save_path, index=True, encoding="utf-8")
    
if __name__ == "__main__":
    # Script entry point: load the cleaned orders, build the shop
    # co-occurrence matrix, and write it to the itemCF matrix folder.
    orders = load_data_from_dataset()
    matrix, shop_codes = build_co_occurence_matrix(orders)
    save_co_occurrence_matrix(
        matrix,
        shop_codes,
        "./models/recall/itemCF/matrix/occurrence.csv",
    )