# gbdt_lr_inference.py

import gc
import os
import re

import joblib
import numpy as np
import pandas as pd
import shap
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# from dao import Redis, get_product_by_id, get_custs_by_ids, load_cust_data_from_mysql
from database import RedisDatabaseHelper, MySqlDao
from models.rank.data import DataLoader
from models.rank.data import ProductConfig, CustConfig, ImportanceFeaturesMap
from models.rank.data.utils import one_hot_embedding, sample_data_clear


def clean_column_name(col):
    """Strip special characters from a column name, consistent with one_hot_embedding."""
    return (re.sub(r'[",\\\n\r\t\b\f]', '_', col)
            .replace(' ', '_'))
def generate_feats_map(product_data, cust_data):
    """Build the combined product (cigarette) and merchant feature matrix."""
    # Cartesian join: one row per (merchant, product) pair.
    # NOTE: mutates the input frames by adding a temporary join key.
    cust_data["descartes"] = 1
    product_data["descartes"] = 1
    feats_map = pd.merge(cust_data, product_data, on="descartes").drop("descartes", axis=1)
    # recall_cust_list = feats_map["BB_RETAIL_CUSTOMER_CODE"].to_list()
    feats_map.drop('cust_code', axis=1, inplace=True)
    feats_map.drop('product_code', axis=1, inplace=True)
    # One-hot encode the categorical features
    onehot_feats = {**CustConfig.ONEHOT_CAT, **ProductConfig.ONEHOT_CAT}
    onehot_columns = list(onehot_feats.keys())
    numeric_columns = feats_map.drop(onehot_columns, axis=1).columns
    feats_map = one_hot_embedding(feats_map, onehot_feats)
    # Standardize the numeric features
    if len(numeric_columns) != 0:
        scaler = StandardScaler()
        feats_map[numeric_columns] = scaler.fit_transform(feats_map[numeric_columns])
    return feats_map
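

# A minimal usage sketch for generate_feats_map. The column names below are
# hypothetical; real inputs must carry the columns declared in CustConfig and
# ProductConfig (including the ONEHOT_CAT categoricals).
#
# cust_df = pd.DataFrame({"cust_code": ["c1", "c2"], "monthly_sales": [120.0, 80.0]})
# prod_df = pd.DataFrame({"product_code": ["p1"], "price": [18.0]})
# feats = generate_feats_map(prod_df, cust_df)  # 2 rows: one per (merchant, product) pair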
class GbdtLrModel:
    def __init__(self, model_path):
        self.load_model(model_path)
        self.redis = RedisDatabaseHelper().redis
        self._mysql_dao = MySqlDao()
        self._explainer = None  # SHAP TreeExplainer, built lazily

    def load_model(self, model_path):
        models = joblib.load(model_path)
        self.gbdt_model = models["lgbm_model"]
        self.lr_model = models["lr_model"]
        self.onehot_encoder = models["onehot_encoder"]
    def get_cust_and_product_data(self, city_uuid, product_id):
        """Fetch the product's features and those of every merchant in the given city."""
        self.product_data = self._mysql_dao.get_product_by_id(city_uuid, product_id)[ProductConfig.FEATURE_COLUMNS]
        self.custs_data = self._mysql_dao.load_cust_data(city_uuid)[CustConfig.FEATURE_COLUMNS]
    def get_recommend_list(self, recommend_sample, recall_list):
        """Score recalled merchants: GBDT leaf indices -> one-hot -> LR probability (x100)."""
        gbdt_preds = self.gbdt_model.predict(recommend_sample, pred_leaf=True)
        gbdt_feats_encoded = self.onehot_encoder.transform(gbdt_preds)
        scores = self.lr_model.predict_proba(gbdt_feats_encoded)[:, 1] * 100
        recommend_list = [
            {"cust_code": cust_id, "recommend_score": float(score)}
            for cust_id, score in zip(recall_list, scores)
        ]
        recommend_list.sort(key=lambda x: x["recommend_score"], reverse=True)
        return recommend_list
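
    # A sketch of how the pieces above could compose into the commented-out
    # `sort(city_uuid, product_id)` call in __main__. The method below is
    # hypothetical (it does not appear in this file) and assumes `custs_data`
    # contains a "cust_code" column:
    #
    # def sort(self, city_uuid, product_id):
    #     self.get_cust_and_product_data(city_uuid, product_id)
    #     recall_list = self.custs_data["cust_code"].to_list()
    #     sample = generate_feats_map(self.product_data, self.custs_data)
    #     return self.get_recommend_list(sample, recall_list)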
    def generate_feats_importance(self):
        """Aggregate GBDT importances of one-hot columns back to their source features."""
        # Per-column importances from the GBDT model
        feats_importance = self.gbdt_model.feature_importances_
        # Matching feature names
        feats_names = self.gbdt_model.feature_name_
        importance_dict = dict(zip(feats_names, feats_importance))
        onehot_feats = {**CustConfig.ONEHOT_CAT, **ProductConfig.ONEHOT_CAT}
        for feat, categories in onehot_feats.items():
            related_columns = [f"{feat}_{item}" for item in categories]
            if related_columns:
                # Sum the importances of this feature's one-hot columns
                combined_importance = sum(importance_dict[col] for col in related_columns)
                # Drop the individual one-hot columns
                for col in related_columns:
                    del importance_dict[col]
                # Record the merged importance under the original feature name
                importance_dict[feat] = combined_importance
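        # Illustrative example (hypothetical numbers): with
        # ONEHOT_CAT = {"shop_type": ["A", "B"]} and importances
        # {"shop_type_A": 10, "shop_type_B": 5}, the merge above
        # yields {"shop_type": 15}.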
        # Sort by importance, descending
        sorted_importance = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)
        # Split into merchant-side and product-side importances
        cust_features_importance = []
        product_features_importance = []
        for feat, importance in sorted_importance:
            if feat in ImportanceFeaturesMap.CUSTOM_FEATURES_MAP:
                cust_features_importance.append({ImportanceFeaturesMap.CUSTOM_FEATURES_MAP[feat]: float(importance)})
            if feat in ImportanceFeaturesMap.PRODUCT_FEATRUES_MAP:
                product_features_importance.append({ImportanceFeaturesMap.PRODUCT_FEATRUES_MAP[feat]: float(importance)})
        return cust_features_importance, product_features_importance
    def generate_shap_interaction(self, data):
        """Compute mean SHAP interaction values over `data`, batched through an on-disk memmap."""
        # Build the SHAP explainer lazily
        if self._explainer is None:
            self._explainer = shap.TreeExplainer(self.gbdt_model)
        # Basic shape information
        n_samples = len(data)
        n_features = len(data.columns)
        batch_size = 200  # tune according to available memory
        # Temporary memory-mapped file for the full interaction tensor
        # temp_dir = tempfile.mkdtemp()
        temp_dir = "./data/tmp"
        temp_file = os.path.join(temp_dir, "shap_interaction_temp.dat")
        # Make sure the temp dir exists and remove any stale temp file
        os.makedirs(temp_dir, exist_ok=True)
        if os.path.exists(temp_file):
            os.remove(temp_file)
        try:
            # Pre-allocate the memory-mapped file
            fp_shape = (n_samples, n_features, n_features)
            fp = np.memmap(temp_file, dtype=np.float32,
                           mode='w+',
                           shape=fp_shape)
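            # Disk-usage check (illustrative arithmetic): float32 interactions
            # take n_samples * n_features^2 * 4 bytes, e.g. 300 samples with
            # 200 features ~= 300 * 200 * 200 * 4 B ~= 48 MB on disk.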
            # Compute SHAP interaction values batch by batch and store them
            for i in tqdm(range(0, n_samples, batch_size), desc="Computing SHAP interaction values"):
                batch_data = data.iloc[i:i + batch_size]
                batch_interaction = self._explainer.shap_interaction_values(batch_data)
                fp[i:i + len(batch_interaction)] = batch_interaction.astype(np.float32)
                fp.flush()  # make sure the batch is written to disk
            # Accumulate the mean interaction matrix batch by batch
            mean_interaction = np.zeros((n_features, n_features), dtype=np.float32)
            for i in tqdm(range(0, n_samples, batch_size), desc="Computing the mean"):
                batch = fp[i:i + batch_size]  # read one batch from the memmap
                mean_interaction += batch.sum(axis=0)  # accumulate per batch
            mean_interaction /= n_samples  # final mean over all samples
            # Build the interaction matrix as a DataFrame
            interaction_df = pd.DataFrame(
                mean_interaction,
                index=data.columns,
                columns=data.columns
            )
            # Separate product (cigarette) and merchant features (with column-name cleaning)
            product_feats = [
                clean_column_name(f"{feat}_{item}")
                for feat, categories in ProductConfig.ONEHOT_CAT.items()
                for item in categories
            ]
            cust_feats = [
                clean_column_name(f"{feat}_{item}")
                for feat, categories in CustConfig.ONEHOT_CAT.items()
                for item in categories
            ]
            # Extract the product-x-merchant block of the interaction matrix
            cross_matrix = interaction_df.loc[product_feats, cust_feats]
            # Convert to long format
            stacked = cross_matrix.stack().reset_index()
            stacked.columns = ['product_feat', 'cust_feat', 'relation']
            # Filter out zero-valued or NaN pairs
            filtered = stacked[
                (stacked['relation'].abs() > 1e-6) &  # drop near-zero values
                (~stacked['relation'].isna())         # drop NaN
            ].copy()
            # Sort by interaction strength
            results = (
                filtered
                .sort_values('relation', ascending=False)
                .to_dict('records')
            )
            # Map raw one-hot column names back to readable feature names
            feats_name_map = {
                **ImportanceFeaturesMap.CUSTOM_FEATURES_MAP,
                **ImportanceFeaturesMap.PRODUCT_FEATRUES_MAP
            }

            def map_feat_name(raw_name):
                # "feat_value" -> "readable_feat_name(value)"; unchanged if no prefix matches
                for key, name in feats_name_map.items():
                    if raw_name.startswith(key + "_"):
                        return f"{name}({raw_name[len(key) + 1:]})"
                return raw_name

            for item in results:
                item["product_feat"] = map_feat_name(item["product_feat"])
                item["cust_feat"] = map_feat_name(item["cust_feat"])
            # Return the final result
            return pd.DataFrame(results, columns=['product_feat', 'cust_feat', 'relation'])
        finally:
            # Clean up the temporary memmap file
            try:
                del fp  # the memmap object must be released first
                gc.collect()
                os.remove(temp_file)
                os.rmdir(temp_dir)
            except Exception as e:
                print(f"Error while cleaning up temp files: {e}")
if __name__ == "__main__":
    model_path = "./models/rank/weights/00000000000000000000000011445301/gbdtlr_model.pkl"
    city_uuid = "00000000000000000000000011445301"
    product_id = "110102"
    gbdt_sort = GbdtLrModel(model_path)
    # gbdt_sort.sort(city_uuid, product_id)
    # cust_features_importance, product_features_importance = gbdt_sort.generate_feats_importance()
    # cust_df = pd.DataFrame([
    #     {"Features": list(item.keys())[0], "Importance": list(item.values())[0]}
    #     for item in cust_features_importance
    # ])
    # cust_df.to_csv("./data/cust_feats.csv", index=False)
    # product_df = pd.DataFrame([
    #     {"Features": list(item.keys())[0], "Importance": list(item.values())[0]}
    #     for item in product_features_importance
    # ])
    # product_df.to_csv("./data/product_feats.csv", index=False)
    data, _ = DataLoader("./data/gbdt/train_data.csv").split_dataset()
    data = data["data"].sample(n=300, replace=True, random_state=42)
    data.to_csv("./data/data.csv", index=False)
    # data = data["data"]
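    # Hypothetical follow-up (not part of the original script): feed the
    # sampled frame to the SHAP interaction routine and persist the result;
    # assumes `data` carries exactly the GBDT model's training columns.
    # interaction_df = gbdt_sort.generate_shap_interaction(data)
    # interaction_df.to_csv("./data/shap_interaction.csv", index=False)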