|
@@ -3,6 +3,7 @@ from models.rank.data.config import CustConfig, ProductConfig, OrderConfig
|
|
|
import os
|
|
import os
|
|
|
import pandas as pd
|
|
import pandas as pd
|
|
|
from sklearn.preprocessing import MinMaxScaler
|
|
from sklearn.preprocessing import MinMaxScaler
|
|
|
|
|
+from sklearn.utils import shuffle
|
|
|
import numpy as np
|
|
import numpy as np
|
|
|
|
|
|
|
|
class DataProcess():
|
|
class DataProcess():
|
|
@@ -40,7 +41,7 @@ class DataProcess():
|
|
|
self._calculate_score()
|
|
self._calculate_score()
|
|
|
|
|
|
|
|
# 4. 根据中位数打标签
|
|
# 4. 根据中位数打标签
|
|
|
- self.labeled_data_by_score()
|
|
|
|
|
|
|
+ self.labeled_data()
|
|
|
|
|
|
|
|
# 5. 选取训练样本
|
|
# 5. 选取训练样本
|
|
|
self._generate_train_data()
|
|
self._generate_train_data()
|
|
@@ -61,6 +62,7 @@ class DataProcess():
|
|
|
self._cust_data[feature] = self._cust_data[feature].fillna(self._cust_data[rules["value"]])
|
|
self._cust_data[feature] = self._cust_data[feature].fillna(self._cust_data[rules["value"]])
|
|
|
elif rules["opt"] == "mean":
|
|
elif rules["opt"] == "mean":
|
|
|
self._cust_data[feature] = self._cust_data[feature].fillna(self._cust_data[feature].mean())
|
|
self._cust_data[feature] = self._cust_data[feature].fillna(self._cust_data[feature].mean())
|
|
|
|
|
+ self._cust_data[feature] = self._cust_data[feature].infer_objects(copy=False)
|
|
|
|
|
|
|
|
def _clean_product_data(self):
|
|
def _clean_product_data(self):
|
|
|
"""卷烟信息表数据清洗"""
|
|
"""卷烟信息表数据清洗"""
|
|
@@ -73,6 +75,7 @@ class DataProcess():
|
|
|
self._product_data[feature] = self._product_data[feature].fillna(rules["value"])
|
|
self._product_data[feature] = self._product_data[feature].fillna(rules["value"])
|
|
|
elif rules["opt"] == "mean":
|
|
elif rules["opt"] == "mean":
|
|
|
self._product_data[feature] = self._product_data[feature].fillna(self._product_data[feature].mean())
|
|
self._product_data[feature] = self._product_data[feature].fillna(self._product_data[feature].mean())
|
|
|
|
|
+ self._product_data[feature] = self._product_data[feature].infer_objects(copy=False)
|
|
|
|
|
|
|
|
def _clean_order_data(self):
|
|
def _clean_order_data(self):
|
|
|
pass
|
|
pass
|
|
@@ -87,7 +90,7 @@ class DataProcess():
|
|
|
self._order_score["score"] = sum(self._order_score[feat] * weight
|
|
self._order_score["score"] = sum(self._order_score[feat] * weight
|
|
|
for feat, weight in OrderConfig.WEIGHTS.items())
|
|
for feat, weight in OrderConfig.WEIGHTS.items())
|
|
|
|
|
|
|
|
- def labeled_data_by_score(self):
|
|
|
|
|
|
|
+ def labeled_data(self):
|
|
|
"""通过计算分数打标签"""
|
|
"""通过计算分数打标签"""
|
|
|
# 按品规分组计算中位数
|
|
# 按品规分组计算中位数
|
|
|
product_medians = self._order_score.groupby("PRODUCT_CODE")["score"].median().reset_index()
|
|
product_medians = self._order_score.groupby("PRODUCT_CODE")["score"].median().reset_index()
|
|
@@ -102,7 +105,20 @@ class DataProcess():
|
|
|
)
|
|
)
|
|
|
self._order_score = self._order_score.sort_values("score", ascending=False)
|
|
self._order_score = self._order_score.sort_values("score", ascending=False)
|
|
|
self._order_score = self._order_score[["BB_RETAIL_CUSTOMER_CODE", "PRODUCT_CODE", "label"]]
|
|
self._order_score = self._order_score[["BB_RETAIL_CUSTOMER_CODE", "PRODUCT_CODE", "label"]]
|
|
|
- self._order_score.to_csv("./models/rank/data/train.csv")
|
|
|
|
|
|
|
+ self._order_score.rename(columns={"PRODUCT_CODE": "product_code"}, inplace=True)
|
|
|
|
|
+
|
|
|
|
|
+ def _generate_train_data(self):
|
|
|
|
|
+ cust_feats = self._cust_data.set_index("BB_RETAIL_CUSTOMER_CODE")
|
|
|
|
|
+ product_feats = self._product_data.set_index("product_code")
|
|
|
|
|
+
|
|
|
|
|
+ self._train_data = self._order_score.copy()
|
|
|
|
|
+
|
|
|
|
|
+ self._train_data = self._train_data.join(cust_feats, on="BB_RETAIL_CUSTOMER_CODE", how="left")
|
|
|
|
|
+ self._train_data = self._train_data.join(product_feats, on="product_code", how="left")
|
|
|
|
|
+
|
|
|
|
|
+ self._train_data = shuffle(self._train_data, random_state=42)
|
|
|
|
|
+
|
|
|
|
|
+ self._train_data.to_csv(self._save_res_path, index=False)
|
|
|
|
|
|
|
|
def _descartes(self):
|
|
def _descartes(self):
|
|
|
"""将零售户信息与卷烟信息进行笛卡尔积连接"""
|
|
"""将零售户信息与卷烟信息进行笛卡尔积连接"""
|
|
@@ -111,7 +127,7 @@ class DataProcess():
|
|
|
|
|
|
|
|
self._descartes_data = pd.merge(self._cust_data, self._product_data, on="descartes").drop("descartes", axis=1)
|
|
self._descartes_data = pd.merge(self._cust_data, self._product_data, on="descartes").drop("descartes", axis=1)
|
|
|
|
|
|
|
|
- def _labeled_data(self):
|
|
|
|
|
|
|
+ def _labeled_data_from_descartes(self):
|
|
|
"""根据order表信息给descartes_data数据打标签"""
|
|
"""根据order表信息给descartes_data数据打标签"""
|
|
|
# 获取order表中的正样本组合
|
|
# 获取order表中的正样本组合
|
|
|
order_combinations = self._order_data[["BB_RETAIL_CUSTOMER_CODE", "PRODUCT_CODE"]].drop_duplicates()
|
|
order_combinations = self._order_data[["BB_RETAIL_CUSTOMER_CODE", "PRODUCT_CODE"]].drop_duplicates()
|
|
@@ -121,7 +137,7 @@ class DataProcess():
|
|
|
self._descartes_data['label'] = self._descartes_data.apply(
|
|
self._descartes_data['label'] = self._descartes_data.apply(
|
|
|
lambda row: 1 if (row['BB_RETAIL_CUSTOMER_CODE'], row['product_code']) in order_set else 0, axis=1)
|
|
lambda row: 1 if (row['BB_RETAIL_CUSTOMER_CODE'], row['product_code']) in order_set else 0, axis=1)
|
|
|
|
|
|
|
|
- def _generate_train_data(self):
|
|
|
|
|
|
|
+ def _generate_train_data_from_descartes(self):
|
|
|
"""从descartes_data中生成训练数据"""
|
|
"""从descartes_data中生成训练数据"""
|
|
|
positive_samples = self._descartes_data[self._descartes_data["label"] == 1]
|
|
positive_samples = self._descartes_data[self._descartes_data["label"] == 1]
|
|
|
negative_samples = self._descartes_data[self._descartes_data["label"] == 0]
|
|
negative_samples = self._descartes_data[self._descartes_data["label"] == 0]
|