from database.dao.mysql_dao import MySqlDao
from models.rank.data.config import ProductConfig
from models.rank.data.utils import sample_data_clear


class Item2VecDataProcess:
    """Load and clean product features for one city and turn them into tokens.

    On construction the product codes that occur in the city's order data
    are collected, the matching product records are fetched, and the
    records are reduced to ``ProductConfig.FEATURE_COLUMNS`` and cleaned.
    """

    def __init__(self, city_uuid):
        """Fetch and preprocess the product data for ``city_uuid``.

        Args:
            city_uuid: Identifier handed unchanged to the DAO queries.
        """
        self._mysql_dao = MySqlDao()
        print("item2vec: 正在加载product_info...")
        # Only products that occur in the order data are loaded; the
        # alternative `self._mysql_dao.load_product_data(city_uuid)` call
        # is not used here.
        order_data = self._mysql_dao.load_order_data(city_uuid)
        product_ids = order_data["product_code"].unique().tolist()
        self._product_data = self._mysql_dao.get_product_by_ids(
            city_uuid, product_ids
        )
        self._data_process()

    def _data_process(self):
        """Preprocess the loaded product data in place."""
        # Keep only the configured feature columns.
        columns = ProductConfig.FEATURE_COLUMNS
        self._product_data = self._product_data[columns]
        # Clean the data with the shared helper.
        self._product_data = sample_data_clear(self._product_data, ProductConfig)

    def item_to_token(self, row):
        """Build the token "sentence" for one product from its features.

        Args:
            row: Mapping-like record indexed by column name; every feature
                value is expected to provide ``.strip()`` (presumably a
                string -- TODO confirm against ``sample_data_clear``).

        Returns:
            A dict with ``"product_code"`` (the row's product code) and
            ``"token"`` (the stripped value of every feature column other
            than ``product_code``, in ``FEATURE_COLUMNS`` order).
        """
        features = [
            f"{row[name].strip()}"
            for name in ProductConfig.FEATURE_COLUMNS
            if name != 'product_code'
        ]
        return {"product_code": row['product_code'], "token": features}

    def generate_tokens(self):
        """Apply :meth:`item_to_token` to every product row.

        Returns:
            A list holding one ``item_to_token`` result per row.
        """
        per_row = self._product_data.apply(self.item_to_token, axis=1)
        return per_row.tolist()


if __name__ == "__main__":
    city_uuid = "00000000000000000000000011445301"
    processor = Item2VecDataProcess(city_uuid)
    processor.generate_tokens()