|
|
@@ -24,64 +24,50 @@ class GbdtLrModel:
|
|
|
models = joblib.load(model_path)
|
|
|
self.gbdt_model, self.lr_model, self.onehot_encoder = models["gbdt_model"], models["lr_model"], models["onehot_encoder"]
|
|
|
|
|
|
-
|
|
|
- # def get_recall_list(self, city_uuid, product_id):
|
|
|
- # """根据卷烟id获取召回的商铺列表"""
|
|
|
- # key = f"fc:{city_uuid}:{product_id}"
|
|
|
- # self.recall_cust_list = self.redis.zrange(key, 0, -1, withscores=False)
|
|
|
-
|
|
|
- # def load_recall_data(self, city_uuid, product_id):
|
|
|
- # self.product_data = self._mysql_dao.get_product_by_id(city_uuid, product_id)[ProductConfig.FEATURE_COLUMNS]
|
|
|
- # self.custs_data = self._mysql_dao.get_cust_by_ids(city_uuid, self.recall_cust_list)[CustConfig.FEATURE_COLUMNS]
|
|
|
-
|
|
|
def get_cust_and_product_data(self, city_uuid, product_id):
|
|
|
"""从商户数据库中获取指定城市所有商户的id"""
|
|
|
self.product_data = self._mysql_dao.get_product_by_id(city_uuid, product_id)[ProductConfig.FEATURE_COLUMNS]
|
|
|
self.custs_data = self._mysql_dao.load_cust_data(city_uuid)[CustConfig.FEATURE_COLUMNS]
|
|
|
|
|
|
- def generate_feats_map(self, city_uuid, product_id):
|
|
|
+ def generate_feats_map(self, product_data, cust_data):
|
|
|
"""组合卷烟、商户特征矩阵"""
|
|
|
- # self.get_recall_list(city_uuid, product_id)
|
|
|
- # self.load_recall_data(city_uuid, product_id)
|
|
|
-
|
|
|
- self.get_cust_and_product_data(city_uuid, product_id)
|
|
|
- # 做数据清洗
|
|
|
- self.product_data = sample_data_clear(self.product_data, ProductConfig)
|
|
|
- self.custs_data = sample_data_clear(self.custs_data, CustConfig)
|
|
|
-
|
|
|
# 笛卡尔积联合
|
|
|
- self.custs_data["descartes"] = 1
|
|
|
- self.product_data["descartes"] = 1
|
|
|
- self.feats_map = pd.merge(self.custs_data, self.product_data, on="descartes").drop("descartes", axis=1)
|
|
|
- self.recall_cust_list = self.feats_map["BB_RETAIL_CUSTOMER_CODE"].to_list()
|
|
|
- self.feats_map.drop('BB_RETAIL_CUSTOMER_CODE', axis=1, inplace=True)
|
|
|
- self.feats_map.drop('product_code', axis=1, inplace=True)
|
|
|
+ cust_data["descartes"] = 1
|
|
|
+ product_data["descartes"] = 1
|
|
|
+ feats_map = pd.merge(cust_data, product_data, on="descartes").drop("descartes", axis=1)
|
|
|
+ # recall_cust_list = feats_map["BB_RETAIL_CUSTOMER_CODE"].to_list()
|
|
|
+ feats_map.drop('BB_RETAIL_CUSTOMER_CODE', axis=1, inplace=True)
|
|
|
+ feats_map.drop('product_code', axis=1, inplace=True)
|
|
|
|
|
|
# onehot编码
|
|
|
- onehot_feats = {**CustConfig.ONEHOT_CAT, **ProductConfig.ONEHOT_CAT}
|
|
|
+ onehot_feats = {**CustConfig.ONEHOT_CAT, **ProductConfig.ONEHOT_CAT, **ShopConfig.ONEHOT_CAT}
|
|
|
onehot_columns = list(onehot_feats.keys())
|
|
|
- numeric_columns = self.feats_map.drop(onehot_columns, axis=1).columns
|
|
|
- self.feats_map = one_hot_embedding(self.feats_map, onehot_feats)
|
|
|
+ numeric_columns = feats_map.drop(onehot_columns, axis=1).columns
|
|
|
+ feats_map = one_hot_embedding(feats_map, onehot_feats)
|
|
|
|
|
|
# 数字特征归一化
|
|
|
- scaler = StandardScaler()
|
|
|
- self.feats_map[numeric_columns] = scaler.fit_transform(self.feats_map[numeric_columns])
|
|
|
+ if len(numeric_columns) != 0:
|
|
|
+ scaler = StandardScaler()
|
|
|
+ feats_map[numeric_columns] = scaler.fit_transform(feats_map[numeric_columns])
|
|
|
+
|
|
|
+ return feats_map
|
|
|
|
|
|
- def recommend_sort(self, city_uuid, product_id):
|
|
|
- self.generate_feats_map(city_uuid, product_id)
|
|
|
-
|
|
|
- gbdt_preds = self.gbdt_model.apply(self.feats_map)[:, :, 0]
|
|
|
+ def get_recommend_list(self, recommend_sample, recall_list):
|
|
|
+ gbdt_preds = self.gbdt_model.apply(recommend_sample)[:, :, 0]
|
|
|
gbdt_feats_encoded = self.onehot_encoder.transform(gbdt_preds)
|
|
|
scores = self.lr_model.predict_proba(gbdt_feats_encoded)[:, 1]
|
|
|
|
|
|
- self.recommend_list = []
|
|
|
- for cust_id, score in zip(self.recall_cust_list, scores):
|
|
|
- self.recommend_list.append({cust_id: float(score)})
|
|
|
-
|
|
|
- self.recommend_list = sorted(self.recommend_list, key=lambda x: list(x.values())[0], reverse=True)
|
|
|
- # for res in self.recommend_list[:200]:
|
|
|
- # print(res)
|
|
|
- return self.recommend_list
|
|
|
+ recommend_list = []
|
|
|
+ for cust_id, score in zip(recall_list, scores):
|
|
|
+        recommend_list.append({"cust_code": cust_id, "recommend_score": float(score)})
|
|
|
+
|
|
|
+ recommend_list = sorted(
|
|
|
+ [item for item in recommend_list if "recommend_score" in item],
|
|
|
+ key=lambda x: x["recommend_score"],
|
|
|
+ reverse=True
|
|
|
+ )
|
|
|
+ return recommend_list
|
|
|
|
|
|
def generate_feats_importance(self):
|
|
|
"""生成特征重要性"""
|
|
|
@@ -148,7 +134,6 @@ class GbdtLrModel:
|
|
|
batch_interaction = self._explanier.shap_interaction_values(batch_data)
|
|
|
fp[i:i+len(batch_interaction)] = batch_interaction.astype(np.float32)
|
|
|
fp.flush() # 确保数据写入磁盘
|
|
|
- print("SHAP交互值计算并存储完成")
|
|
|
|
|
|
# 分批计算均值
|
|
|
mean_interaction = np.zeros((n_features, n_features), dtype=np.float32)
|
|
|
@@ -157,7 +142,6 @@ class GbdtLrModel:
|
|
|
mean_interaction += batch.sum(axis=0) # 按批累加
|
|
|
|
|
|
mean_interaction /= n_samples # 计算最终均值
|
|
|
- print("均值计算完成")
|
|
|
|
|
|
# 构建交互矩阵DataFrame
|
|
|
interaction_df = pd.DataFrame(
|
|
|
@@ -165,7 +149,6 @@ class GbdtLrModel:
|
|
|
index=data.columns,
|
|
|
columns=data.columns
|
|
|
)
|
|
|
- print("交互矩阵构建完成")
|
|
|
|
|
|
# 分离卷烟和商户特征
|
|
|
product_feats = [
|
|
|
@@ -179,23 +162,19 @@ class GbdtLrModel:
|
|
|
for feat, categories in {**CustConfig.ONEHOT_CAT, **ShopConfig.ONEHOT_CAT}.items()
|
|
|
for item in categories
|
|
|
]
|
|
|
- print("特征分离完成")
|
|
|
|
|
|
# 提取交叉区块
|
|
|
cross_matrix = interaction_df.loc[product_feats, cust_feats]
|
|
|
- print("交叉区块提取完成")
|
|
|
|
|
|
# 转换为长格式
|
|
|
stacked = cross_matrix.stack().reset_index()
|
|
|
stacked.columns = ['product_feat', 'cust_feat', 'relation']
|
|
|
- print("转换为长格式完成")
|
|
|
|
|
|
# 过滤掉零值或NaN的配对
|
|
|
filtered = stacked[
|
|
|
(stacked['relation'].abs() > 1e-6) & # 排除极小值
|
|
|
(~stacked['relation'].isna()) # 排除NaN
|
|
|
].copy()
|
|
|
- print("过滤完成")
|
|
|
|
|
|
# 排序结果
|
|
|
results = (
|
|
|
@@ -203,7 +182,6 @@ class GbdtLrModel:
|
|
|
.sort_values('relation', ascending=False)
|
|
|
.to_dict('records')
|
|
|
)
|
|
|
- print("排序完成")
|
|
|
|
|
|
# 替换特征名称
|
|
|
feats_name_map = {
|
|
|
@@ -223,8 +201,6 @@ class GbdtLrModel:
|
|
|
cust_infos = cust_f.split("_")
|
|
|
item["cust_feat"] = f"{feats_name_map['_'.join(cust_infos[:-1])]}({cust_infos[-1]})"
|
|
|
|
|
|
- print("名称替换完成")
|
|
|
-
|
|
|
# 返回最终结果
|
|
|
return pd.DataFrame(results, columns=['product_feat', 'cust_feat', 'relation'])
|
|
|
|
|
|
@@ -258,8 +234,9 @@ if __name__ == "__main__":
|
|
|
# ])
|
|
|
# product_df.to_csv("./data/product_feats.csv", index=False)
|
|
|
data, _ = DataLoader("./data/gbdt/train_data.csv").split_dataset()
|
|
|
- # data = data["data"].sample(n=1000, replace=True, random_state=42)
|
|
|
- data = data["data"]
|
|
|
+ data = data["data"].sample(n=300, replace=True, random_state=42)
|
|
|
+ data.to_csv("./data/data.csv", index=False)
|
|
|
+ # data = data["data"]
|
|
|
result = gbdt_sort.generate_shap_interance(data)
|
|
|
print("保存结果")
|
|
|
result.to_csv("./data/feats_interaction.csv", index=False, encoding='utf-8-sig')
|