# preprocess.py 1.6 KB
  1. from database.dao.mysql_dao import MySqlDao
  2. from models.rank.data.config import ProductConfig
  3. from models.rank.data.utils import sample_data_clear
  4. class Item2VecDataProcess:
  5. def __init__(self, city_uuid):
  6. self._mysql_dao = MySqlDao()
  7. print("item2vec: 正在加载product_info...")
  8. # self._product_data = self._mysql_dao.load_product_data(city_uuid)
  9. product_ids = self._mysql_dao.load_order_data(city_uuid)["product_code"].unique().tolist()
  10. self._product_data = self._mysql_dao.get_product_by_ids(city_uuid, product_ids)
  11. self._data_process()
  12. def _data_process(self):
  13. """数据预处理"""
  14. # 获取指定的特征
  15. self._product_data = self._product_data[ProductConfig.FEATURE_COLUMNS]
  16. # 数据清洗
  17. self._product_data = sample_data_clear(self._product_data, ProductConfig)
  18. def item_to_token(self, row):
  19. """根据每款烟的特征生成sentence"""
  20. token = []
  21. for col in ProductConfig.FEATURE_COLUMNS:
  22. if col == 'product_code':
  23. continue
  24. else:
  25. token.append(f"{row[col].strip()}")
  26. token_map = {"product_code": row['product_code'], "token": token}
  27. return token_map
  28. def generate_tokens(self):
  29. tokens = self._product_data.apply(self.item_to_token, axis=1).tolist()
  30. return tokens
  31. if __name__ == "__main__":
  32. city_uuid = "00000000000000000000000011445301"
  33. processor = Item2VecDataProcess(city_uuid)
  34. processor.generate_tokens()