import re

import pandas as pd

# Characters replaced by "_" in generated column names: the JSON-special
# characters LightGBM refuses in feature names (" , : [ ] { }), plus backslash,
# newline, carriage return, tab, backspace, form feed and the space character.
_UNSAFE_COLUMN_CHARS = re.compile(r'[",:\[\]{}\\\n\r\t\b\f ]')


def one_hot_embedding(dataframe, onehout_feat):
    """One-hot encode the specified features and sanitize the column names.

    Args:
        dataframe: Input ``pandas.DataFrame``. It is left untouched; the
            encoding is performed on a copy.
        onehout_feat: Mapping of column name -> list of categories to encode.
            Values not present in the list become missing and therefore get
            all-zero indicator columns.

    Returns:
        A new DataFrame in which every column named in ``onehout_feat`` is
        replaced by integer indicator columns named ``<column>_<category>``,
        with unsafe characters in string column names replaced by ``_``.
    """
    # Work on a copy so the caller's columns are not silently converted to
    # Categorical (which would also blank out values outside ``categories``).
    encoded = dataframe.copy()

    # Fixing the category list up front makes get_dummies emit one column per
    # configured category, even for categories absent from this data.
    for feat, categories in onehout_feat.items():
        encoded[feat] = pd.Categorical(
            encoded[feat], categories=categories, ordered=False
        )

    encoded = pd.get_dummies(
        encoded,
        columns=list(onehout_feat),
        prefix_sep="_",
        dtype=int,
    )

    # Clean special characters out of the column names to avoid LightGBM
    # errors. Non-string labels cannot contain them and are kept as they are.
    encoded.columns = [
        _UNSAFE_COLUMN_CHARS.sub("_", col) if isinstance(col, str) else col
        for col in encoded.columns
    ]
    return encoded


def sample_data_clear(data, config):
    """Clean the columns of ``data`` according to ``config.CLEANING_RULES``.

    ``config.CLEANING_RULES`` maps a column name to a rule dict. The keys read
    here are:

    * ``"type"``: when ``"num"``, the column is converted with
      ``pd.to_numeric`` and unparseable values become NaN.
    * ``"method"``: when ``"fillna"``, missing values are filled according to
      ``"opt"``: ``"fill"`` uses ``rules["value"]``, ``"mean"`` uses the
      column mean.

    ``"type"``, ``"method"`` and ``"opt"`` are optional; an absent key simply
    skips the corresponding step.

    Args:
        data: ``pandas.DataFrame`` to clean. It is modified in place.
        config: Object exposing a ``CLEANING_RULES`` mapping as described above.

    Returns:
        The same ``data`` object, after cleaning.

    Raises:
        KeyError: If a configured column is missing from ``data``, or a
            ``"fill"`` rule has no ``"value"``.
    """
    for feature, rules in config.CLEANING_RULES.items():
        column = data[feature]

        if rules.get("type") == "num":
            column = pd.to_numeric(column, errors="coerce")

        if rules.get("method") == "fillna":
            opt = rules.get("opt")
            if opt == "fill":
                column = column.fillna(rules["value"])
            elif opt == "mean":
                column = column.fillna(column.mean())

        # One dtype inference at the end is sufficient. The result is assigned
        # back to the frame, so the ``copy`` keyword is not needed.
        data[feature] = column.infer_objects()
    return data