| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677 |
- from database import MySqlDao, CustConfig, OrderConfig
- import os
- import pandas as pd
class DataProcess():
    """Load customer and order data for one city and merge them for training.

    The constructor pulls both tables through ``MySqlDao``; ``data_process``
    then cleans each table in place and returns their inner join on
    ``cust_code``.
    """

    # Order-table columns whose missing values are replaced with 0.
    _ORDER_STABILITY_COLUMNS = (
        "order_number_stability",
        "order_quantity_stability",
        "order_ratio_stability",
        "real_demand_stability",
    )

    def __init__(self, city_uuid, save_dir):
        """Fetch the customer and order tables for ``city_uuid``.

        Args:
            city_uuid: City identifier forwarded to both ``load_data`` calls.
            save_dir: Output directory; only stored on the instance here.
        """
        self._mysql_dao = MySqlDao()
        self.save_dir = save_dir

        # Both loaders presumably return pandas DataFrames -- the cleaning
        # methods below rely on DataFrame/Series APIs.
        print("正在加载cust_info...")
        self._cust_data = self._mysql_dao.cust_table_dao.load_data(
            CustConfig.FEATURES_COLUMNS, city_uuid
        )
        print("正在加载analysis_info...")
        self._order_data = self._mysql_dao.order_table_dao.load_data(
            OrderConfig.FEATURE_COLUMNS, city_uuid
        )

    def data_process(self) -> pd.DataFrame:
        """Clean both tables and return the merged training data."""
        self._clean_cust_data()
        self._clean_order_data()
        return self._generate_train_data()

    def _clean_cust_data(self) -> None:
        """Clean the customer table according to ``CustConfig.CLEANING_RULES``.

        Each rule is a mapping read with the keys ``type``, ``method``,
        ``opt`` and ``value``:

        * ``type == "num"``: coerce the column to numeric, turning
          unparseable entries into NaN.
        * ``method == "fillna"``: fill missing values depending on ``opt``:
          ``"fill"`` uses the constant ``value``, ``"replace"`` uses the
          column named by ``value``, ``"mean"`` uses the column mean.
          Any other ``opt`` leaves the missing values untouched.
        """
        self._cust_data["cust_code"] = self._cust_data["cust_code"].astype(str)

        for feature, rules in CustConfig.CLEANING_RULES.items():
            if rules["type"] == "num":
                # Numeric strings must become numbers before filling.
                self._cust_data[feature] = pd.to_numeric(
                    self._cust_data[feature], errors="coerce"
                )

            if rules["method"] == "fillna":
                column = self._cust_data[feature]
                opt = rules["opt"]
                if opt == "fill":
                    column = column.fillna(rules["value"])
                elif opt == "replace":
                    column = column.fillna(self._cust_data[rules["value"]])
                elif opt == "mean":
                    column = column.fillna(column.mean())
                # NOTE(review): the original indentation was lost; the final
                # infer_objects is assumed to belong to this fillna branch.
                # A single call suffices because infer_objects is idempotent.
                self._cust_data[feature] = column.infer_objects(copy=False)

    def _clean_order_data(self) -> None:
        """Clean the order table: stringify the keys, zero-fill stability gaps."""
        for key in ("cust_code", "product_code"):
            self._order_data[key] = self._order_data[key].astype(str)

        for column in self._ORDER_STABILITY_COLUMNS:
            self._order_data[column] = self._order_data[column].fillna(0)

        self._order_data = self._order_data.infer_objects(copy=False)

    def _generate_train_data(self) -> pd.DataFrame:
        """Return the inner join of order and customer data on ``cust_code``."""
        return self._order_data.merge(self._cust_data, on="cust_code", how="inner")
-
-
if __name__ == '__main__':
    # Build the training table for one hard-coded city, then write one CSV
    # per price tier into ./data/<city_uuid>/.
    city_uuid = "00000000000000000000000011440601"
    save_dir = os.path.join("./data", city_uuid)
    train_data = DataProcess(city_uuid, save_dir).data_process()

    os.makedirs(save_dir, exist_ok=True)
    for price_tier, group_df in train_data.groupby('price_tier'):
        # The grouping column is encoded in the file name, so drop it from
        # the saved rows.
        target_path = os.path.join(save_dir, f"价位段_{price_tier!s}.csv")
        group_df.drop(columns='price_tier').to_csv(target_path, index=False)
|