import os

import pandas as pd

from database import MySqlDao, CustConfig, OrderConfig


class DataProcess:
    """Load, clean and join customer and order data for a single city.

    The customer and order tables are fetched through ``MySqlDao`` when the
    object is constructed; ``data_process`` then cleans both tables and
    returns their inner join on ``cust_code``.
    """

    # Order columns whose missing values are replaced with 0.
    _STABILITY_COLUMNS = (
        "order_number_stability",
        "order_quantity_stability",
        "order_ratio_stability",
        "real_demand_stability",
    )

    def __init__(self, city_uuid, save_dir):
        """Fetch the customer and order tables for ``city_uuid``.

        Args:
            city_uuid: City identifier forwarded to both ``load_data`` calls.
            save_dir: Output directory. It is only stored on the instance;
                no method of this class reads it.
        """
        self._mysql_dao = MySqlDao()
        self.save_dir = save_dir

        print("正在加载cust_info...")
        # presumably returns a pandas DataFrame -- confirm against the DAO
        self._cust_data = self._mysql_dao.cust_table_dao.load_data(
            CustConfig.FEATURES_COLUMNS, city_uuid
        )

        print("正在加载analysis_info...")
        self._order_data = self._mysql_dao.order_table_dao.load_data(
            OrderConfig.FEATURE_COLUMNS, city_uuid
        )

    def data_process(self):
        """Clean both tables and return the merged training data.

        Returns:
            The inner join of the cleaned order and customer tables on
            ``cust_code``.
        """
        self._clean_cust_data()
        self._clean_order_data()
        return self._generate_train_data()

    def _clean_cust_data(self):
        """Clean the customer table according to ``CustConfig.CLEANING_RULES``.

        ``cust_code`` is cast to ``str``. Every rule is then applied to the
        column it names:

        * ``type == "num"``: coerce the column to numeric; values that
          cannot be parsed become NaN.
        * ``method == "fillna"``: fill missing values according to ``opt``:
          ``"fill"`` uses the literal ``rules["value"]``, ``"replace"`` uses
          the column named by ``rules["value"]``, and ``"mean"`` uses the
          column mean. Any other ``opt`` leaves the column unfilled.
        """
        customers = self._cust_data
        customers["cust_code"] = customers["cust_code"].astype(str)

        for feature, rules in CustConfig.CLEANING_RULES.items():
            if rules["type"] == "num":
                # Numeric strings must become numbers before any filling.
                customers[feature] = pd.to_numeric(
                    customers[feature], errors="coerce"
                )

            column = customers[feature]
            if rules["method"] == "fillna":
                opt = rules["opt"]
                if opt == "fill":
                    column = column.fillna(rules["value"])
                elif opt == "replace":
                    column = column.fillna(customers[rules["value"]])
                elif opt == "mean":
                    column = column.fillna(column.mean())

            # A single dtype inference per feature is enough: repeating
            # infer_objects on an already inferred column changes nothing.
            customers[feature] = column.infer_objects(copy=False)

    def _clean_order_data(self):
        """Clean the order table.

        The key columns ``cust_code`` and ``product_code`` are cast to
        ``str``, missing values in the stability columns are replaced with
        0, and object columns are re-inferred to more specific dtypes.
        """
        orders = self._order_data

        for key_column in ("cust_code", "product_code"):
            orders[key_column] = orders[key_column].astype(str)

        for stability_column in self._STABILITY_COLUMNS:
            orders[stability_column] = orders[stability_column].fillna(0)

        self._order_data = orders.infer_objects(copy=False)

    def _generate_train_data(self):
        """Return the order table inner-joined with the customer table."""
        return self._order_data.merge(
            self._cust_data, on="cust_code", how="inner"
        )


def main():
    """Build the training data for one city and write one CSV per price tier.

    Each file is written to ``./data/<city_uuid>/`` and contains the rows of
    one ``price_tier`` group with the ``price_tier`` column removed.
    """
    city_uuid = "00000000000000000000000011440601"
    save_dir = os.path.join("./data", city_uuid)

    train_data = DataProcess(city_uuid, save_dir).data_process()

    # NOTE(review): groupby drops rows whose key is missing by default, so
    # rows without a price_tier are not written to any file -- confirm this
    # is intended.
    grouped = train_data.groupby('price_tier')
    os.makedirs(save_dir, exist_ok=True)

    for price_tier, group_df in grouped:
        file_name = f"价位段_{price_tier!s}.csv"
        tier_rows = group_df.drop('price_tier', axis=1)
        tier_rows.to_csv(os.path.join(save_dir, file_name), index=False)


if __name__ == '__main__':
    main()