| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990 |
- from database.dao.mysql_dao import MySqlDao
- from models.item2vec import Item2Vec
- from models.rank.data.config import OrderConfig, ProductConfig
- from models.rank.data.utils import sample_data_clear
- import numpy as np
- import pandas as pd
- from sklearn.preprocessing import StandardScaler
- from core import get_logger
- logger = get_logger("models.item2vec")
class Item2VecModel:
    """Item2Vec-backed product similarity and customer recommendation for one city.

    Combines the project's ``Item2Vec`` model with ``MySqlDao`` to (1) list the
    products most similar to a given product and (2) score the customers who
    buy those similar products.
    """

    def __init__(self, city_uuid):
        """Create the DAO and the Item2Vec model bound to ``city_uuid``."""
        self._dao = MySqlDao()
        self._city_uuid = city_uuid
        self._item2vec_model = Item2Vec(city_uuid)

    def generate_product_similarity_map(self, product_code):
        """Build the similarity table for ``product_code``.

        The product's feature columns are loaded and cleaned, passed to the
        Item2Vec model, and the model output is inner-joined with the city's
        product list on ``product_code`` so that each row also carries the
        product features and ``product_name``.

        Args:
            product_code: Code of the product to compare against.

        Returns:
            A DataFrame with one row per similar product that also exists in
            the city's product list.
        """
        logger.info(f"Generating similarity map for product {product_code}")
        product = self._dao.get_product_by_id(self._city_uuid, product_code)[ProductConfig.FEATURE_COLUMNS]
        product = sample_data_clear(product, ProductConfig)

        # NOTE(review): the model output is assumed to be convertible to a
        # DataFrame containing a "product_code" column (the merge key below).
        similarity_map = pd.DataFrame(self._item2vec_model.get_similarity_map(product))
        product_list = self._dao.load_product_data(self._city_uuid)[ProductConfig.FEATURE_COLUMNS + ["product_name"]]
        # Inner join: similar products missing from the product list are dropped.
        return similarity_map.merge(product_list, on="product_code", how="inner")

    def get_similarity_list(self, product_code, top=40):
        """Return the codes of the first ``top`` products in the similarity table.

        The table is truncated, not re-sorted, here; presumably the model
        already returns rows ordered from most to least similar (TODO confirm).

        Args:
            product_code: Code of the reference product.
            top: Maximum number of product codes to return.

        Returns:
            A list of at most ``top`` product codes.
        """
        similarity_map = self.generate_product_similarity_map(product_code)
        return similarity_map["product_code"].to_list()[:top]

    def get_recommend_cust_list(self, product_code, top=100, cust_code_list=None):
        """Score the customers to recommend ``product_code`` to.

        Steps:
            1. Take the products similar to ``product_code``.
            2. Average ``sale_qty`` per (customer, product) over their orders.
            3. For each product keep its ``top`` best-selling customers.
            4. Sum those averages per customer and keep positive totals.
            5. Add every customer from ``cust_code_list`` that is not already
               a candidate with ``sale_qty`` 0, so it is scored on the same
               scale as the others.
            6. Score = sigmoid(standardize(log1p(sale_qty))) * 100.

        Args:
            product_code: Code of the product to recommend.
            top: Number of best-selling customers kept per similar product.
            cust_code_list: Optional customer codes that must appear in the
                result even if they have no qualifying sales.

        Returns:
            A DataFrame with columns ``cust_code``, ``sale_qty`` and
            ``recommend_score``, sorted by score (descending) then
            ``cust_code`` (ascending). Empty (but with those columns) when
            there is no candidate at all.
        """
        if cust_code_list is None:
            cust_code_list = []
        logger.info(f"Getting recommend list for product {product_code}, top={top}")
        product_list = self.get_similarity_list(product_code)
        order_data = self._dao.get_order_by_product_ids(self._city_uuid, product_list)[OrderConfig.FEATURE_COLUMNS]
        # assign() builds a new frame instead of writing into a column
        # selection, which avoids pandas' chained-assignment warning.
        order_data = order_data.assign(sale_qty=order_data["sale_qty"].fillna(0))
        order_data = order_data.groupby(["cust_code", "product_code"], as_index=False)["sale_qty"].mean()
        # Per product, keep the `top` customers with the highest average sales;
        # cust_code is the tie-breaker so the selection is deterministic.
        order_data = (
            order_data
            .sort_values(["product_code", "sale_qty", "cust_code"], ascending=[True, False, True])
            .groupby("product_code")
            .head(top)
        )
        recommend_cust = (
            order_data.groupby(["cust_code"], as_index=False)["sale_qty"].sum()
            .query("sale_qty > 0")
            .sort_values(["sale_qty", "cust_code"], ascending=[False, True])
        )
        # Add requested customers that are not candidates yet with sale_qty=0
        # so they take part in the normalization. dict.fromkeys() removes
        # duplicates from the caller's list while keeping its order; duplicate
        # rows would otherwise skew the scaler and repeat customers in the output.
        existing_custs = set(recommend_cust["cust_code"].tolist())
        extra_rows = [
            {"cust_code": cust, "sale_qty": 0}
            for cust in dict.fromkeys(cust_code_list)
            if cust not in existing_custs
        ]
        if extra_rows:
            recommend_cust = pd.concat([recommend_cust, pd.DataFrame(extra_rows)], ignore_index=True)
        # StandardScaler rejects an input with zero samples, so return an
        # empty, correctly-shaped result when there is nothing to score.
        if recommend_cust.empty:
            return recommend_cust.assign(
                recommend_score=np.array([], dtype=float)
            ).reset_index(drop=True)
        # Normalize sales: log1p first compresses the long tail of the
        # power-law-like distribution, then StandardScaler + sigmoid.
        # Without the log transform the z-scores of the top customers get so
        # large that the sigmoid saturates and their scores all become 100.
        log_qty = np.log1p(recommend_cust["sale_qty"].values).reshape(-1, 1)
        normalized = StandardScaler().fit_transform(log_qty)
        recommend_cust["recommend_score"] = ((1 / (1 + np.exp(-normalized))) * 100).flatten()
        return recommend_cust.sort_values(
            ["recommend_score", "cust_code"], ascending=[False, True]
        ).reset_index(drop=True)
-
-
-
if __name__ == "__main__":
    # Manual smoke run against one hard-coded city / product pair.
    city_uuid, product_id = "00000000000000000000000011445301", "350139"

    Item2VecModel(city_uuid).get_similarity_list(product_id)
    # Ad-hoc order export, kept disabled for reference:
    # dao = MySqlDao()
    # data = dao.get_order_by_cust_and_product(city_uuid, "445300108802", "340223")[OrderConfig.FEATURE_COLUMNS]
    # data.to_csv("./data/result.csv", index=False)
|