from database.dao.mysql_dao import MySqlDao
from models.rank.data.config import ProductConfig
from models.rank.data.utils import sample_data_clear

# Feature column that is skipped when building tokens.
_SKIPPED_COLUMN = "product_code"

# Feature columns whose values get '-' rewritten to '_' inside the token.
# NOTE(review): presumably these hold range-like strings such as "10-20";
# confirm against sample_data_clear / the DAO output.
_HYPHENATED_COLUMNS = frozenset({"direct_retail_price", "tbc_total_length"})


class Item2VecDataProcess:
    """Turn one city's product table into item2vec "sentences".

    On construction the product data for ``city_uuid`` is loaded through
    ``MySqlDao``, restricted to ``ProductConfig.FEATURE_COLUMNS`` and passed
    through ``sample_data_clear``. ``generate_sentence`` then converts every
    row into a list of ``"<column>_<value>"`` tokens.
    """

    def __init__(self, city_uuid: str) -> None:
        """Load and preprocess the product data for ``city_uuid``.

        Args:
            city_uuid: Identifier forwarded to
                ``MySqlDao.load_product_data``.
        """
        self._mysql_dao = MySqlDao()
        print("item2vec: 正在加载product_info...")
        # NOTE(review): the result is indexed by a column list and later used
        # with ``.apply(..., axis=1)``, so it is presumably a pandas
        # DataFrame; confirm against the DAO.
        self._product_data = self._mysql_dao.load_product_data(city_uuid)
        self._data_process()

    def _data_process(self) -> None:
        """Preprocess the loaded product data in place."""
        # Keep only the configured feature columns.
        self._product_data = self._product_data[ProductConfig.FEATURE_COLUMNS]
        # Data cleaning, delegated to the shared helper.
        self._product_data = sample_data_clear(self._product_data, ProductConfig)

    def tokenize_features(self, row) -> list:
        """Build the token list ("sentence") for a single product row.

        Every column in ``ProductConfig.FEATURE_COLUMNS`` except
        ``product_code`` contributes one ``"<column>_<value>"`` token, in
        configuration order. For ``direct_retail_price`` and
        ``tbc_total_length`` any ``'-'`` in the value is replaced by ``'_'``.

        Args:
            row: Mapping-like row supporting ``row[column_name]``.

        Returns:
            The list of token strings for this row.
        """
        tokens = []
        for col in ProductConfig.FEATURE_COLUMNS:
            if col == _SKIPPED_COLUMN:
                continue
            value = row[col]
            if col in _HYPHENATED_COLUMNS:
                # Coerce to str first: a non-string cell (e.g. a numeric
                # price) has no ``.replace`` and would raise AttributeError.
                # For values that are already strings this is a no-op.
                value = str(value).replace("-", "_")
            tokens.append(f"{col}_{value}")
        return tokens

    def generate_sentence(self) -> list:
        """Return one token list per row of the preprocessed product data."""
        return self._product_data.apply(self.tokenize_features, axis=1).tolist()


if __name__ == "__main__":
    city_uuid = "00000000000000000000000011445301"
    processor = Item2VecDataProcess(city_uuid)
    processor.generate_sentence()