"""GBDT + LR ranking model: scores every customer of a city for one product."""

import os

import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler

# from dao import Redis, get_product_by_id, get_custs_by_ids, load_cust_data_from_mysql
from database import RedisDatabaseHelper, MySqlDao
from models.rank.data import ProductConfig, CustConfig, ImportanceFeaturesMap
from models.rank.data.utils import one_hot_embedding, sample_data_clear


class GbdtLrModel:
    """Wrapper around a persisted GBDT -> one-hot -> logistic-regression pipeline.

    The persisted artifact is a mapping with the keys ``"gbdt_model"``,
    ``"lr_model"`` and ``"onehot_encoder"``.
    """

    def __init__(self, model_path: str) -> None:
        """Load the model artifact and open the Redis / MySQL helpers.

        Args:
            model_path: Path of the joblib file holding the three sub-models.
        """
        self.load_model(model_path)
        self.redis = RedisDatabaseHelper().redis
        self._mysql_dao = MySqlDao()

    def load_model(self, model_path: str) -> None:
        """Load the sub-models from ``model_path`` and remember the model name.

        The model name is the file's base name up to its first dot
        (e.g. ``"shopping_model"`` for ``shopping_model.pkl``); it selects the
        extra feature maps used by :meth:`generate_feats_importance`.

        Args:
            model_path: Path of the joblib file to load.
        """
        self._modelname = os.path.basename(model_path).split(".")[0]
        # NOTE(review): joblib.load unpickles arbitrary objects; only load
        # model files that come from a trusted location.
        models = joblib.load(model_path)
        self.gbdt_model = models["gbdt_model"]
        self.lr_model = models["lr_model"]
        self.onehot_encoder = models["onehot_encoder"]

    # def get_recall_list(self, city_uuid, product_id):
    #     """Fetch the recalled shop list for a product id."""
    #     key = f"fc:{city_uuid}:{product_id}"
    #     self.recall_cust_list = self.redis.zrange(key, 0, -1, withscores=False)

    # def load_recall_data(self, city_uuid, product_id):
    #     self.product_data = self._mysql_dao.get_product_by_id(city_uuid, product_id)[ProductConfig.FEATURE_COLUMNS]
    #     self.custs_data = self._mysql_dao.get_cust_by_ids(city_uuid, self.recall_cust_list)[CustConfig.FEATURE_COLUMNS]

    def get_cust_and_product_data(self, city_uuid, product_id) -> None:
        """Load the product's features and the features of all customers of a city.

        Stores the results on ``self.product_data`` and ``self.custs_data``,
        restricted to the configured feature columns.

        Args:
            city_uuid: Identifier of the city passed to the MySQL DAO.
            product_id: Identifier of the product to score.
        """
        product_frame = self._mysql_dao.get_product_by_id(city_uuid, product_id)
        self.product_data = product_frame[ProductConfig.FEATURE_COLUMNS]
        customer_frame = self._mysql_dao.load_cust_data(city_uuid)
        self.custs_data = customer_frame[CustConfig.FEATURE_COLUMNS]

    def generate_feats_map(self, city_uuid, product_id) -> None:
        """Build the customer x product feature matrix used for scoring.

        Sets ``self.feats_map`` (model input) and ``self.recall_cust_list``
        (customer codes, row-aligned with ``self.feats_map``). When the join
        yields no rows, encoding and scaling are skipped and ``self.feats_map``
        is left as the empty joined frame.

        Args:
            city_uuid: Identifier of the city passed to the MySQL DAO.
            product_id: Identifier of the product to score.
        """
        # self.get_recall_list(city_uuid, product_id)
        # self.load_recall_data(city_uuid, product_id)
        self.get_cust_and_product_data(city_uuid, product_id)

        # Clean the raw data.
        self.product_data = sample_data_clear(self.product_data, ProductConfig)
        self.custs_data = sample_data_clear(self.custs_data, CustConfig)

        # Cartesian product via a constant join key. ``assign`` works on
        # copies, so the helper column never leaks into the stored frames and
        # no write happens on a frame that may be a slice of another one.
        joined = pd.merge(
            self.custs_data.assign(descartes=1),
            self.product_data.assign(descartes=1),
            on="descartes",
        ).drop("descartes", axis=1)

        self.recall_cust_list = joined["BB_RETAIL_CUSTOMER_CODE"].to_list()
        # Identifier columns are not model features.
        joined = joined.drop("BB_RETAIL_CUSTOMER_CODE", axis=1)
        self.feats_map = joined.drop("product_code", axis=1)

        if not self.recall_cust_list:
            # Nothing to score; StandardScaler cannot be fit on zero rows.
            return

        # One-hot encode the categorical features.
        onehot_feats = {**CustConfig.ONEHOT_CAT, **ProductConfig.ONEHOT_CAT}
        onehot_columns = list(onehot_feats.keys())
        numeric_columns = self.feats_map.drop(onehot_columns, axis=1).columns
        self.feats_map = one_hot_embedding(self.feats_map, onehot_feats)

        # Standardize the numeric features.
        # NOTE(review): the scaler is fit on the inference batch itself;
        # confirm this matches how the training data was normalized.
        scaler = StandardScaler()
        self.feats_map[numeric_columns] = scaler.fit_transform(self.feats_map[numeric_columns])

    def sort(self, city_uuid, product_id) -> list:
        """Score all customers of a city for a product, best first.

        Args:
            city_uuid: Identifier of the city passed to the MySQL DAO.
            product_id: Identifier of the product to score.

        Returns:
            A list of single-entry dicts ``{customer_code: score}`` ordered by
            descending score (also stored on ``self.recommend_list``). Empty
            when there is no customer/product row to score.
        """
        self.generate_feats_map(city_uuid, product_id)
        if not self.recall_cust_list:
            self.recommend_list = []
            return self.recommend_list

        # Leaf index of every tree for every sample, then one-hot encoded as
        # the input of the logistic regression.
        leaf_indices = self.gbdt_model.apply(self.feats_map)[:, :, 0]
        leaf_encoded = self.onehot_encoder.transform(leaf_indices)
        scores = self.lr_model.predict_proba(leaf_encoded)[:, 1]

        # ``sorted`` is stable, so customers with equal scores keep their
        # original relative order.
        ranked = sorted(
            zip(self.recall_cust_list, scores),
            key=lambda pair: pair[1],
            reverse=True,
        )
        self.recommend_list = [{cust_id: float(score)} for cust_id, score in ranked]
        # for res in self.recommend_list[:200]:
        #     print(res)
        return self.recommend_list

    def generate_feats_importance(self) -> tuple:
        """Group the GBDT feature importances by business category.

        Importances of one-hot expanded columns are summed back onto their
        source feature before grouping.

        Returns:
            A tuple ``(cust, product, order)`` of lists; each list holds
            single-entry dicts ``{display_name: importance}`` ordered by
            descending importance.
        """
        # Raw importance per model input column.
        feats_importance = self.gbdt_model.feature_importances_
        feats_names = self.gbdt_model.feature_names_in_
        importance_dict = dict(zip(feats_names, feats_importance))

        onehot_feats = {**CustConfig.ONEHOT_CAT, **ProductConfig.ONEHOT_CAT}

        # Merge longest names first and only over columns not yet claimed, so
        # a feature whose name is a prefix of another one neither steals its
        # columns nor trips over columns that were already merged.
        # NOTE(review): matching is still prefix based; a non-one-hot column
        # whose name starts with a one-hot feature name would be merged too.
        merged_importance = {}
        for feat in sorted(onehot_feats, key=len, reverse=True):
            related_columns = [
                col for col in feats_names
                if col in importance_dict and col.startswith(feat)
            ]
            if related_columns:
                merged_importance[feat] = sum(
                    importance_dict.pop(col) for col in related_columns
                )

        # Re-insert in the configured order so ties sort as before.
        for feat in onehot_feats:
            if feat in merged_importance:
                importance_dict[feat] = merged_importance[feat]

        # Highest importance first.
        sorted_importance = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)

        cust_map = ImportanceFeaturesMap.CUSTOM_FEATRUES_MAP
        product_map = ImportanceFeaturesMap.PRODUCT_FEATRUES_MAP
        order_map = ImportanceFeaturesMap.ORDER_FEATURE_MAP

        cust_features_importance = []
        product_features_importance = []
        order_features_importance = []

        for feat, importance in sorted_importance:
            score = float(importance)
            if feat in cust_map:
                cust_features_importance.append({cust_map[feat]: score})
            if feat in product_map:
                product_features_importance.append({product_map[feat]: score})
            if feat in order_map:
                order_features_importance.append({order_map[feat]: score})

            # POS features are reported with the order features (pos_model only).
            if self._modelname == 'pos_model' and feat in ImportanceFeaturesMap.POS_FEATURE_MAP:
                order_features_importance.append({ImportanceFeaturesMap.POS_FEATURE_MAP[feat]: score})

            # Shopping-district features are reported with the customer
            # features (shopping_model only).
            if self._modelname == 'shopping_model' and feat in ImportanceFeaturesMap.SHOPING_FEATURES_MAP:
                cust_features_importance.append({ImportanceFeaturesMap.SHOPING_FEATURES_MAP[feat]: score})

        return cust_features_importance, product_features_importance, order_features_importance


if __name__ == "__main__":
    model_path = "./models/rank/weights/00000000000000000000000011445301/shopping_model.pkl"
    city_uuid = "00000000000000000000000011445301"
    product_id = "110102"
    gbdt_sort = GbdtLrModel(model_path)
    # gbdt_sort.sort(city_uuid, product_id)
    importances = gbdt_sort.generate_feats_importance()
    for importance in importances:
        print(importance)