# dataloader.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from models.rank.data.config import CustConfig, ProductConfig
  6. class DataLoader:
  7. def __init__(self,path):
  8. self._gbdt_data_path = path
  9. self._load_data()
  10. def _load_data(self):
  11. self._gbdt_data = pd.read_csv(self._gbdt_data_path, encoding="utf-8")
  12. self._gbdt_data.drop('BB_RETAIL_CUSTOMER_CODE', axis=1, inplace=True)
  13. self._gbdt_data.drop('product_code', axis=1, inplace=True)
  14. self._onehot_columns = CustConfig.ONEHOT + ProductConfig.ONEHOT
  15. self._numeric_columns = self._gbdt_data.drop(self._onehot_columns + ["label"], axis=1).columns
  16. # 将类别数据进行one-hot编码
  17. self.one_hot_embedding(self._onehot_columns)
  18. def one_hot_embedding(self, onehot_columns):
  19. """对指定的特征进行onehot编码"""
  20. self._gbdt_data = pd.get_dummies(self._gbdt_data, columns=onehot_columns, drop_first=False)
  21. def split_dataset(self):
  22. """数据集划分,将数据集划分为训练集、验证集、测试集"""
  23. # 1. 分离特征和标签
  24. features = self._gbdt_data.drop("label", axis=1)
  25. labels = self._gbdt_data["label"]
  26. # 2. 划分数据集,80%训练集、20%的测试集
  27. X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42, shuffle=True)
  28. # 3. 数据标准化(仅对特征进行标准化)
  29. scaler = StandardScaler()
  30. X_train[self._numeric_columns] = scaler.fit_transform(X_train[self._numeric_columns])
  31. X_test[self._numeric_columns] = scaler.fit_transform(X_test[self._numeric_columns])
  32. train_dataset = {"data": X_train, "label": y_train}
  33. test_dataset = {"data": X_test, "label": y_test}
  34. return train_dataset, test_dataset
  35. if __name__ == '__main__':
  36. path = './models/rank/data/gbdt_data.csv'
  37. dataloader = DataLoader(path)
  38. dataloader.split_dataset()