Переглянути джерело

refactor(models): merge cust_code_list into Item2Vec candidate pool for unified scoring

Sherlock 1 тиждень тому
батько
коміт
af5d03792f
1 змінений файл: 18 додано та 10 видалено
  1. 18 10
      models/item2vec/inference.py

+ 18 - 10
models/item2vec/inference.py

@@ -35,38 +35,46 @@ class Item2VecModel:
         similarity_list = similarity_list[:top]
         return similarity_list
     
-    def get_recommend_cust_list(self, product_code, top=100):
-        """获取推荐的商户列表"""
+    def get_recommend_cust_list(self, product_code, top=100, cust_code_list=None):
+        """获取推荐的商户列表,核心商户并入候选集统一评分"""
+        if cust_code_list is None:
+            cust_code_list = []
         logger.info(f"Getting recommend list for product {product_code}, top={top}")
         product_list = self.get_similarity_list(product_code)
         order_data = self._dao.get_order_by_product_ids(self._city_uuid, product_list)[OrderConfig.FEATURE_COLUMNS]
         order_data["sale_qty"] = order_data["sale_qty"].fillna(0)
         order_data = order_data.groupby(["cust_code", "product_code"], as_index=False)["sale_qty"].mean()
-        
-        
-        # 按照卷烟分组,取每款卷烟售卖最好的前50个商户
+
+        # 按照卷烟分组,取每款卷烟售卖最好的前top个商户
         order_data = (
             order_data
             .sort_values(["product_code", "sale_qty", "cust_code"], ascending=[True, False, True])
             .groupby("product_code")
             .head(top)
         )
-        
+
         recommend_cust = (
             order_data.groupby(["cust_code"], as_index=False)["sale_qty"].sum()
             .query("sale_qty > 0")
             .sort_values(["sale_qty", "cust_code"], ascending=[False, True])
         )
-        
+
+        # 将 cust_code_list 中不在候选集的商户补入,sale_qty=0 参与归一化
+        existing_custs = set(recommend_cust["cust_code"].tolist())
+        extra_rows = [{"cust_code": c, "sale_qty": 0} for c in cust_code_list if c not in existing_custs]
+        if extra_rows:
+            extra_df = pd.DataFrame(extra_rows)
+            recommend_cust = pd.concat([recommend_cust, extra_df], ignore_index=True)
+
         # 对销量进行归一化:先 log1p 压缩幂律分布的长尾,再 StandardScaler + sigmoid
         # 不做 log 变换时,头部商户 z-score 过大会导致 sigmoid 饱和,分数全为 100
         log_qty = np.log1p(recommend_cust["sale_qty"].values).reshape(-1, 1)
         scaler = StandardScaler()
         normalized = scaler.fit_transform(log_qty)
         recommend_cust["recommend_score"] = ((1 / (1 + np.exp(-normalized))) * 100).flatten()
-        # recommend_cust = recommend_cust.rename(columns={"sale_qty": "recommend_score"})
-        # recommend_cust.to_csv("./data/item2vec_recommend.csv", index=False)
-        
+
+        recommend_cust = recommend_cust.sort_values(["recommend_score", "cust_code"], ascending=[False, True]).reset_index(drop=True)
+
         return recommend_cust