| 123456789101112131415161718192021222324252627282930313233343536 |
- import pandas as pd
- import re
def one_hot_embedding(dataframe: pd.DataFrame, onehout_feat: dict) -> pd.DataFrame:
    """One-hot encode the selected columns and sanitize all column names.

    Args:
        dataframe: Input frame. The columns named in ``onehout_feat`` are
            replaced in place by unordered categoricals before encoding, so
            the caller's frame is modified for those columns.
        onehout_feat: Mapping of column name -> list of categories for that
            column. Because the columns are made categorical first, every
            listed category gets its own dummy column even when it does not
            occur in the data.

    Returns:
        The frame produced by ``pd.get_dummies`` (integer 0/1 dummy columns
        named ``<column>_<category>``), with every column name sanitized.
    """
    # Pin each feature to its declared category list so the set of generated
    # dummy columns does not depend on which values happen to be present.
    for feat, categories in onehout_feat.items():
        dataframe[feat] = pd.Categorical(
            dataframe[feat], categories=categories, ordered=False
        )

    encoded = pd.get_dummies(
        dataframe,
        columns=list(onehout_feat),
        prefix_sep="_",
        dtype=int,
    )

    # LightGBM rejects feature names containing JSON special characters
    # (" , : [ ] { }). Replace those, plus backslash, control characters and
    # spaces, with underscores. Inside a character class ``\b`` is backspace.
    unsafe_chars = re.compile(r'[",:\[\]{}\\\n\r\t\b\f ]')
    encoded.columns = [unsafe_chars.sub("_", col) for col in encoded.columns]
    return encoded
def sample_data_clear(data: pd.DataFrame, config) -> pd.DataFrame:
    """Clean the columns of ``data`` according to ``config.CLEANING_RULES``.

    ``config.CLEANING_RULES`` maps a column name to a rule dict. The keys
    read here are:

    * ``"type"``: when ``"num"``, the column is converted with
      ``pd.to_numeric(errors="coerce")`` so unparseable values become NaN.
    * ``"method"``: when ``"fillna"``, missing values are filled according
      to ``"opt"``:
        - ``"fill"`` uses the constant in ``rules["value"]``;
        - ``"mean"`` uses the column mean.

    Keys that are absent are treated as "do nothing" for that step, so a
    rule may declare only a type. The frame is modified in place and also
    returned.

    Args:
        data: Frame to clean; must contain every column named in the rules.
        config: Object exposing a ``CLEANING_RULES`` mapping as described.

    Returns:
        The same ``data`` object, after cleaning.
    """
    for feature, rules in config.CLEANING_RULES.items():
        column = data[feature]

        if rules.get("type") == "num":
            column = pd.to_numeric(column, errors="coerce")

        if rules.get("method") == "fillna":
            opt = rules.get("opt")
            if opt == "fill":
                column = column.fillna(rules["value"])
            elif opt == "mean":
                column = column.fillna(column.mean())

        # One dtype inference per column is enough: it lets an object column
        # settle on a concrete dtype after filling, and is a no-op otherwise.
        data[feature] = column.infer_objects(copy=False)
    return data
|