| 12345678910111213141516171819202122232425262728293031323334353637383940 |
- from database.dao.mysql_dao import MySqlDao
- from models.rank.data.config import ProductConfig
- from models.rank.data.utils import sample_data_clear
class Item2VecDataProcess:
    """Build per-product token lists ("sentences") for item2vec training.

    On construction the product table of one city is loaded through
    ``MySqlDao``, narrowed to ``ProductConfig.FEATURE_COLUMNS`` and passed
    through ``sample_data_clear``. ``generate_tokens`` then converts every
    remaining row into a ``{"product_code": ..., "token": [...]}`` mapping.
    """

    def __init__(self, city_uuid):
        """Load and preprocess the product data for ``city_uuid``.

        Args:
            city_uuid: Identifier forwarded to ``MySqlDao.load_product_data``.
        """
        self._mysql_dao = MySqlDao()
        print("item2vec: 正在加载product_info...")
        # Presumably a pandas DataFrame: it is indexed with a column list below
        # and consumed through ``apply(axis=1)`` in ``generate_tokens``.
        self._product_data = self._mysql_dao.load_product_data(city_uuid)
        self._data_process()

    def _data_process(self) -> None:
        """Preprocess the loaded product data in place."""
        # Keep only the configured feature columns.
        self._product_data = self._product_data[ProductConfig.FEATURE_COLUMNS]
        # Clean the data; the exact rules live in ``sample_data_clear``.
        self._product_data = sample_data_clear(self._product_data, ProductConfig)

    def item_to_token(self, row) -> dict:
        """Build the token "sentence" for a single product row.

        Every column in ``ProductConfig.FEATURE_COLUMNS`` except
        ``product_code`` contributes one whitespace-stripped token, in
        configuration order.

        Args:
            row: Mapping-like row (e.g. a pandas Series) holding
                ``product_code`` and every feature column.

        Returns:
            ``{"product_code": <row's product code>, "token": [<tokens>]}``.

        Raises:
            AttributeError: If a feature value has no ``strip`` method
                (values are presumably strings after cleaning -- TODO confirm).
        """
        # str() keeps every token a plain string, as the original formatting did.
        tokens = [
            str(row[col].strip())
            for col in ProductConfig.FEATURE_COLUMNS
            if col != 'product_code'
        ]
        return {"product_code": row['product_code'], "token": tokens}

    def generate_tokens(self) -> list:
        """Return one token mapping per product row.

        Returns:
            A list of the dicts produced by ``item_to_token``; an empty list
            when there is no product data.
        """
        # On an empty frame ``DataFrame.apply(axis=1)`` hands back a DataFrame
        # rather than a Series, and a DataFrame has no ``tolist``; short-circuit
        # so callers get an empty list instead of an AttributeError.
        if self._product_data.empty:
            return []
        per_row = self._product_data.apply(self.item_to_token, axis=1)
        return per_row.tolist()
-
if __name__ == "__main__":
    # Manual smoke run for one hard-coded city: load, preprocess and tokenize
    # its products. The resulting token list is not used further.
    demo_city_uuid = "00000000000000000000000011445301"
    Item2VecDataProcess(demo_city_uuid).generate_tokens()
|