dataloader.py 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. import pandas as pd
  2. from models.rank.data.config import CustConfig, ProductConfig
  3. from sklearn.model_selection import train_test_split
  4. from sklearn.preprocessing import StandardScaler,MinMaxScaler
  5. from models.rank.data.utils import one_hot_embedding
  6. class DataLoader:
  7. def __init__(self,path):
  8. self._gbdt_data_path = path
  9. self._load_data()
  10. def _load_data(self):
  11. self._gbdt_data = pd.read_csv(self._gbdt_data_path, encoding="utf-8")
  12. self._gbdt_data.drop('cust_code', axis=1, inplace=True)
  13. self._gbdt_data.drop('product_code', axis=1, inplace=True)
  14. # 随机降采样数据
  15. sampled_data, _ = train_test_split(
  16. self._gbdt_data,
  17. test_size=0.7,
  18. random_state=42
  19. )
  20. self._gbdt_data = sampled_data
  21. self._onehot_feats = {**CustConfig.ONEHOT_CAT, **ProductConfig.ONEHOT_CAT}
  22. self._onehot_columns = list(self._onehot_feats.keys())
  23. self._numeric_columns = self._gbdt_data.drop(self._onehot_columns + ["label"], axis=1).columns
  24. # 将类别数据进行one-hot编码
  25. self._gbdt_data = one_hot_embedding(self._gbdt_data, self._onehot_feats)
  26. def split_dataset(self):
  27. """数据集划分,将数据集划分为训练集、验证集、测试集"""
  28. # 1. 分离特征和标签
  29. features = self._gbdt_data.drop("label", axis=1)
  30. labels = self._gbdt_data["label"]
  31. # 2. 划分数据集,80%训练集、20%的测试集
  32. X_train, X_test, y_train, y_test = train_test_split(
  33. features, labels,
  34. test_size=0.2,
  35. random_state=42,
  36. shuffle=True,
  37. stratify=labels,
  38. )
  39. # 3. 数据标准化(仅对特征进行标准化)
  40. if len(self._numeric_columns) != 0:
  41. scaler = StandardScaler()
  42. X_train[self._numeric_columns] = scaler.fit_transform(X_train[self._numeric_columns])
  43. X_test[self._numeric_columns] = scaler.fit_transform(X_test[self._numeric_columns])
  44. train_dataset = {"data": X_train, "label": y_train}
  45. test_dataset = {"data": X_test, "label": y_test}
  46. return train_dataset, test_dataset
  47. if __name__ == '__main__':
  48. path = './data/train_data.csv'
  49. dataloader = DataLoader(path)
  50. train_dataset, test_dataset = dataloader.split_dataset()
  51. # 打印训练集和测试集的正负样本分布
  52. print("训练集正负样本分布:")
  53. print(train_dataset["label"].value_counts(normalize=True))
  54. print("测试集正负样本分布:")
  55. print(test_dataset["label"].value_counts(normalize=True))