| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137 |
- import joblib
- # from dao import Redis, get_product_by_id, get_custs_by_ids, load_cust_data_from_mysql
- from database import RedisDatabaseHelper, MySqlDao
- from models.rank.data import ProductConfig, CustConfig, ImportanceFeaturesMap
- from models.rank.data.utils import one_hot_embedding, sample_data_clear
- import pandas as pd
- from sklearn.preprocessing import StandardScaler
- import os
class GbdtLrModel:
    """Score and rank retail customers for one product with a GBDT + LR stack.

    The model bundle (loaded with joblib) holds three objects: a GBDT model
    whose leaf indices are one-hot encoded by ``onehot_encoder`` and then fed
    to a logistic-regression model that produces the final score.

    NOTE: an earlier revision restricted scoring to a recall list read from
    the Redis sorted set ``fc:{city_uuid}:{product_id}``; the current code
    scores every customer loaded for the city instead.
    """

    def __init__(self, model_path):
        """Load the model bundle and create the Redis / MySQL helpers.

        Args:
            model_path: Path of the joblib file containing the model bundle.
        """
        self.load_model(model_path)
        self.redis = RedisDatabaseHelper().redis
        self._mysql_dao = MySqlDao()

    def load_model(self, model_path):
        """Load the GBDT model, LR model and leaf one-hot encoder from disk.

        The model name is the file's base name up to its first dot
        (e.g. ``shopping_model`` for ``.../shopping_model.pkl``); it selects
        the model-specific feature maps in ``generate_feats_importance``.

        Args:
            model_path: Path of the joblib file containing a dict with the
                keys ``gbdt_model``, ``lr_model`` and ``onehot_encoder``.

        Raises:
            KeyError: If the bundle lacks one of the expected keys.
        """
        self._modelname = os.path.basename(model_path).split(".")[0]
        # NOTE(review): joblib.load unpickles arbitrary objects, so only
        # model files from a trusted location must ever be passed here.
        models = joblib.load(model_path)
        self.gbdt_model = models["gbdt_model"]
        self.lr_model = models["lr_model"]
        self.onehot_encoder = models["onehot_encoder"]

    def get_cust_and_product_data(self, city_uuid, product_id):
        """Load the feature columns of the product and of the city's customers.

        Stores the results in ``self.product_data`` and ``self.custs_data``.

        Args:
            city_uuid: Identifier of the city whose data is queried.
            product_id: Identifier of the product to score customers for.
        """
        self.product_data = self._mysql_dao.get_product_by_id(
            city_uuid, product_id
        )[ProductConfig.FEATURE_COLUMNS]
        self.custs_data = self._mysql_dao.load_cust_data(
            city_uuid
        )[CustConfig.FEATURE_COLUMNS]

    def generate_feats_map(self, city_uuid, product_id):
        """Build the customer x product feature matrix used for scoring.

        Populates ``self.feats_map`` (model input) and
        ``self.recall_cust_list`` (customer codes, row-aligned with
        ``self.feats_map``).

        Args:
            city_uuid: Identifier of the city whose data is queried.
            product_id: Identifier of the product to score customers for.
        """
        self.get_cust_and_product_data(city_uuid, product_id)

        # Data cleaning.
        self.product_data = sample_data_clear(self.product_data, ProductConfig)
        self.custs_data = sample_data_clear(self.custs_data, CustConfig)

        # Cartesian product of customers and product rows. A cross merge
        # avoids writing a temporary join-key column into the source frames.
        feats = pd.merge(self.custs_data, self.product_data, how="cross")
        self.recall_cust_list = feats["BB_RETAIL_CUSTOMER_CODE"].to_list()
        # Identifier columns are not model features.
        self.feats_map = feats.drop(
            columns=["BB_RETAIL_CUSTOMER_CODE", "product_code"]
        )

        # One-hot encoding of the categorical features.
        onehot_feats = {**CustConfig.ONEHOT_CAT, **ProductConfig.ONEHOT_CAT}
        # Everything that is not one-hot encoded is treated as numeric; the
        # column list must be captured before the encoding rewrites the frame.
        numeric_columns = self.feats_map.drop(columns=list(onehot_feats)).columns
        self.feats_map = one_hot_embedding(self.feats_map, onehot_feats)

        # Standardise the numeric features.
        # NOTE(review): the scaler is re-fitted on the inference data on every
        # call; confirm this matches how features were scaled during training.
        scaler = StandardScaler()
        self.feats_map[numeric_columns] = scaler.fit_transform(
            self.feats_map[numeric_columns]
        )

    @staticmethod
    def _rank_customers(cust_ids, scores):
        """Pair customer ids with scores and order them by descending score.

        Ties keep the input order. Returns a list of single-entry dicts
        ``{cust_id: score}`` with scores converted to built-in floats.
        """
        scored = [(cust_id, float(score)) for cust_id, score in zip(cust_ids, scores)]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return [{cust_id: score} for cust_id, score in scored]

    def sort(self, city_uuid, product_id):
        """Score every customer of the city for the product and rank them.

        Args:
            city_uuid: Identifier of the city whose data is queried.
            product_id: Identifier of the product to score customers for.

        Returns:
            A list of ``{customer_code: score}`` dicts sorted by descending
            score; the same list is kept in ``self.recommend_list``.
        """
        self.generate_feats_map(city_uuid, product_id)

        # apply() yields a 3-D array of leaf indices; slot 0 of the last axis
        # is kept (presumably a sklearn gradient-boosting classifier - confirm).
        leaf_indices = self.gbdt_model.apply(self.feats_map)[:, :, 0]
        encoded_leaves = self.onehot_encoder.transform(leaf_indices)
        # Column 1 of predict_proba is used as the ranking score.
        scores = self.lr_model.predict_proba(encoded_leaves)[:, 1]

        self.recommend_list = self._rank_customers(self.recall_cust_list, scores)
        return self.recommend_list

    @staticmethod
    def _merge_onehot_importance(importance_dict, onehot_feats):
        """Collapse per-category importances back onto their source feature.

        Every column whose name starts with a one-hot feature name is
        attributed to that feature. When several feature names match, the
        longest one wins, so a feature whose name is a prefix of another
        feature cannot swallow the other's columns.

        Args:
            importance_dict: Mapping of model column name to importance.
            onehot_feats: Iterable of one-hot encoded feature names.

        Returns:
            A new dict: untouched columns first (original order), followed by
            one summed entry per one-hot feature that matched any column.
        """
        owner = {}
        for col in importance_dict:
            candidates = [feat for feat in onehot_feats if col.startswith(feat)]
            if candidates:
                owner[col] = max(candidates, key=len)

        merged = dict(importance_dict)
        for feat in onehot_feats:
            related_columns = [col for col in importance_dict if owner.get(col) == feat]
            if related_columns:
                # Remove the category columns and add their combined weight.
                combined_importance = sum(merged.pop(col) for col in related_columns)
                merged[feat] = combined_importance
        return merged

    def generate_feats_importance(self):
        """Report GBDT feature importances grouped by feature family.

        One-hot category columns are merged back into their source feature
        before the importances are sorted in descending order and mapped to
        display names via ``ImportanceFeaturesMap``.

        Returns:
            A tuple ``(customer, product, order)`` of lists; each list holds
            single-entry dicts ``{display_name: importance}`` in descending
            importance order.
        """
        importance_dict = dict(
            zip(self.gbdt_model.feature_names_in_, self.gbdt_model.feature_importances_)
        )

        onehot_feats = {**CustConfig.ONEHOT_CAT, **ProductConfig.ONEHOT_CAT}
        importance_dict = self._merge_onehot_importance(importance_dict, onehot_feats)

        sorted_importance = sorted(
            importance_dict.items(), key=lambda item: item[1], reverse=True
        )

        cust_features_importance = []
        product_features_importance = []
        order_features_importance = []

        cust_map = ImportanceFeaturesMap.CUSTOM_FEATRUES_MAP
        product_map = ImportanceFeaturesMap.PRODUCT_FEATRUES_MAP
        order_map = ImportanceFeaturesMap.ORDER_FEATURE_MAP

        for feat, importance in sorted_importance:
            value = float(importance)
            # The checks are independent on purpose: a feature listed in
            # several maps is reported in every matching group.
            if feat in cust_map:
                cust_features_importance.append({cust_map[feat]: value})
            if feat in product_map:
                product_features_importance.append({product_map[feat]: value})
            if feat in order_map:
                order_features_importance.append({order_map[feat]: value})

            # POS features are reported (with the order group) only for the
            # model named 'pos_model'.
            if (
                self._modelname == 'pos_model'
                and feat in ImportanceFeaturesMap.POS_FEATURE_MAP
            ):
                order_features_importance.append(
                    {ImportanceFeaturesMap.POS_FEATURE_MAP[feat]: value}
                )

            # Shopping-district features are reported (with the customer
            # group) only for the model named 'shopping_model'.
            if (
                self._modelname == 'shopping_model'
                and feat in ImportanceFeaturesMap.SHOPING_FEATURES_MAP
            ):
                cust_features_importance.append(
                    {ImportanceFeaturesMap.SHOPING_FEATURES_MAP[feat]: value}
                )
        return cust_features_importance, product_features_importance, order_features_importance
-
if __name__ == "__main__":
    # Manual smoke run: load one city's shopping model and print the three
    # feature-importance groups (customer, product, order).
    city_uuid = "00000000000000000000000011445301"
    product_id = "110102"
    model_path = "./models/rank/weights/00000000000000000000000011445301/shopping_model.pkl"

    gbdt_sort = GbdtLrModel(model_path)
    # Ranking is left disabled here; enable to exercise the full pipeline.
    # gbdt_sort.sort(city_uuid, product_id)

    for importance_group in gbdt_sort.generate_feats_importance():
        print(importance_group)
|