utils.py 1.4 KB

123456789101112131415161718192021222324252627282930313233343536
  1. import pandas as pd
  2. import re
  3. def one_hot_embedding(dataframe, onehout_feat):
  4. """对数据的指定特征做embedding编码"""
  5. # 先将指定的特征进行Categorical处理
  6. for feat, categories in onehout_feat.items():
  7. dataframe[feat] = pd.Categorical(dataframe[feat], categories=categories, ordered=False)
  8. dataframe = pd.get_dummies(
  9. dataframe,
  10. columns=list(onehout_feat.keys()),
  11. prefix_sep="_",
  12. dtype=int,
  13. )
  14. # 清理列名中的特殊字符,避免 LightGBM 报错
  15. # 将 JSON 特殊字符替换为下划线或删除
  16. dataframe.columns = [
  17. re.sub(r'[",\\\n\r\t\b\f]', '_', col) # 替换 JSON 特殊字符
  18. .replace(' ', '_') # 替换空格
  19. for col in dataframe.columns
  20. ]
  21. return dataframe
  22. def sample_data_clear(data, config):
  23. for feature, rules, in config.CLEANING_RULES.items():
  24. if rules["type"] == "num":
  25. data[feature] = pd.to_numeric(data[feature], errors="coerce")
  26. if rules["method"] == "fillna":
  27. if rules["opt"] == "fill":
  28. data[feature] = data[feature].fillna(rules["value"]).infer_objects(copy=False)
  29. elif rules["opt"] == "mean":
  30. data[feature] = data[feature].fillna(data[feature].mean()).infer_objects(copy=False)
  31. data[feature] = data[feature].infer_objects(copy=False)
  32. return data