|
@@ -1,48 +1,56 @@
|
|
|
-from database import RedisDatabaseHelper
|
|
|
|
|
|
|
+from database import RedisDatabaseHelper, MySqlDao
|
|
|
import pandas as pd
|
|
import pandas as pd
|
|
|
|
|
+from models import UserItemScore, SimilarityMatrix
|
|
|
import numpy as np
|
|
import numpy as np
|
|
|
from tqdm import tqdm
|
|
from tqdm import tqdm
|
|
|
from scipy.sparse import csr_matrix
|
|
from scipy.sparse import csr_matrix
|
|
|
from joblib import Parallel, delayed
|
|
from joblib import Parallel, delayed
|
|
|
-import joblib
|
|
|
|
|
|
|
|
|
|
class ItemCFModel:
|
|
class ItemCFModel:
|
|
|
def __init__(self):
|
|
def __init__(self):
|
|
|
self._recommendations = {}
|
|
self._recommendations = {}
|
|
|
|
|
+ self._dao = MySqlDao()
|
|
|
|
|
|
|
|
- def train(self, score_path, similatity_path, city_uuid, n=100, k=10, top_n=100, n_jobs=4):
|
|
|
|
|
- self._score_df = pd.read_csv(score_path)
|
|
|
|
|
- self._similarity_df = pd.read_csv(similatity_path, index_col=0)
|
|
|
|
|
- self._similarity_matrix = csr_matrix(self._similarity_df.values)
|
|
|
|
|
- self._shop_index = {shop: idx for idx, shop in enumerate(self._similarity_df.index)}
|
|
|
|
|
- self._index_shop = {idx: shop for idx, shop in enumerate(self._similarity_df.index)}
|
|
|
|
|
|
|
+ def train(self, city_uuid, n=300, k=100, top_n=300, n_jobs=4):
|
|
|
|
|
+ # self._score_df = pd.read_csv(score_path)
|
|
|
|
|
+ # self._similarity_df = pd.read_csv(similatity_path, index_col=0)
|
|
|
|
|
+ print("itemcf: 正在加载order_info...")
|
|
|
|
|
+ self._order_data = self._dao.load_order_data(city_uuid)
|
|
|
|
|
+ print("正在计算品规培育分数...")
|
|
|
|
|
+ self._score_df = UserItemScore(self._order_data).generate_product_scores()
|
|
|
|
|
+ print("正在计算商户相似度矩阵...")
|
|
|
|
|
+ self._similarity_df = SimilarityMatrix(self._order_data).generate_similarity_matrix()
|
|
|
|
|
+
|
|
|
|
|
+ similarity_matrix = csr_matrix(self._similarity_df.values)
|
|
|
|
|
+ shop_index = {shop: idx for idx, shop in enumerate(self._similarity_df.index)}
|
|
|
|
|
+ index_shop = {idx: shop for idx, shop in enumerate(self._similarity_df.index)}
|
|
|
|
|
|
|
|
def process_product(product_code, scores):
|
|
def process_product(product_code, scores):
|
|
|
# 获取热度最高的n个商户
|
|
# 获取热度最高的n个商户
|
|
|
top_n_shops = scores.nlargest(n, "score")["cust_code"].values
|
|
top_n_shops = scores.nlargest(n, "score")["cust_code"].values
|
|
|
- top_n_indices = [self._shop_index[shop] for shop in top_n_shops]
|
|
|
|
|
|
|
+ top_n_indices = [shop_index[shop] for shop in top_n_shops]
|
|
|
|
|
|
|
|
# 找到每个商户最相似的k个商户
|
|
# 找到每个商户最相似的k个商户
|
|
|
similar_shops = {}
|
|
similar_shops = {}
|
|
|
for shop_idx in top_n_indices:
|
|
for shop_idx in top_n_indices:
|
|
|
- similarities = self._similarity_matrix[shop_idx].toarray().flatten()
|
|
|
|
|
|
|
+ similarities = similarity_matrix[shop_idx].toarray().flatten()
|
|
|
similar_indices = np.argpartition(similarities, -k-1)[-k-1:]
|
|
similar_indices = np.argpartition(similarities, -k-1)[-k-1:]
|
|
|
similar_indices = similar_indices[similar_indices != shop_idx][:k]
|
|
similar_indices = similar_indices[similar_indices != shop_idx][:k]
|
|
|
- similar_shops[self._index_shop[shop_idx]] = [self._index_shop[idx] for idx in similar_indices]
|
|
|
|
|
|
|
+ similar_shops[index_shop[shop_idx]] = [index_shop[idx] for idx in similar_indices]
|
|
|
|
|
|
|
|
# 生成候选商户列表
|
|
# 生成候选商户列表
|
|
|
- candidate_shops = list(set([m for sublist in similar_shops.values() for m in sublist]))
|
|
|
|
|
- candidate_indices = [self._shop_index[shop] for shop in candidate_shops]
|
|
|
|
|
|
|
+ candidate_shops = list(set(top_n_shops).union(set([m for sublist in similar_shops.values() for m in sublist])))
|
|
|
|
|
+ candidate_indices = [shop_index[shop] for shop in candidate_shops]
|
|
|
|
|
|
|
|
# 计算每个候选商户的兴趣得分
|
|
# 计算每个候选商户的兴趣得分
|
|
|
interest_scores = {}
|
|
interest_scores = {}
|
|
|
for candidate_idx in candidate_indices:
|
|
for candidate_idx in candidate_indices:
|
|
|
interest_score = 0
|
|
interest_score = 0
|
|
|
for shop_idx in top_n_indices:
|
|
for shop_idx in top_n_indices:
|
|
|
- if self._index_shop[candidate_idx] in similar_shops[self._index_shop[shop_idx]]:
|
|
|
|
|
- shop_score = scores[scores["cust_code"]==self._index_shop[shop_idx]]["score"].values[0]
|
|
|
|
|
- interest_score += shop_score * self._similarity_matrix[shop_idx, candidate_idx]
|
|
|
|
|
- interest_scores[self._index_shop[candidate_idx]] = interest_score
|
|
|
|
|
|
|
+ if index_shop[candidate_idx] in similar_shops[index_shop[shop_idx]]:
|
|
|
|
|
+ shop_score = scores[scores["cust_code"]==index_shop[shop_idx]]["score"].values[0]
|
|
|
|
|
+ interest_score += shop_score * similarity_matrix[shop_idx, candidate_idx]
|
|
|
|
|
+ interest_scores[index_shop[candidate_idx]] = interest_score
|
|
|
|
|
|
|
|
# 将候选商户的兴趣得分转换为字典列表,并按照从大到小排列
|
|
# 将候选商户的兴趣得分转换为字典列表,并按照从大到小排列
|
|
|
sorted_candidates = sorted([{shop_id: s} for shop_id, s in interest_scores.items()],
|
|
sorted_candidates = sorted([{shop_id: s} for shop_id, s in interest_scores.items()],
|
|
@@ -53,7 +61,7 @@ class ItemCFModel:
|
|
|
# 并行处理每个品规
|
|
# 并行处理每个品规
|
|
|
results = Parallel(n_jobs=n_jobs)(delayed(process_product)(product_code, scores)
|
|
results = Parallel(n_jobs=n_jobs)(delayed(process_product)(product_code, scores)
|
|
|
for product_code, scores in tqdm(self._score_df.groupby("product_code"), desc="train:正在计算候选得分"))
|
|
for product_code, scores in tqdm(self._score_df.groupby("product_code"), desc="train:正在计算候选得分"))
|
|
|
- print(len(results))
|
|
|
|
|
|
|
+
|
|
|
# 存储结果
|
|
# 存储结果
|
|
|
self._recommendations = {product_code: sorted_candidates for product_code, sorted_candidates in results}
|
|
self._recommendations = {product_code: sorted_candidates for product_code, sorted_candidates in results}
|
|
|
self.to_redis_zset(city_uuid)
|
|
self.to_redis_zset(city_uuid)
|
|
@@ -88,14 +96,4 @@ if __name__ == "__main__":
|
|
|
score_path = "./data/itemcf/scores.csv"
|
|
score_path = "./data/itemcf/scores.csv"
|
|
|
similarity_path = "./data/itemcf/similarity.csv"
|
|
similarity_path = "./data/itemcf/similarity.csv"
|
|
|
itemcf_model = ItemCFModel()
|
|
itemcf_model = ItemCFModel()
|
|
|
- itemcf_model.train(score_path, similarity_path, "00000000000000000000000011445301", n_jobs=4)
|
|
|
|
|
- # recommend_list = itemcf_model.inference(110111)
|
|
|
|
|
- # itemcf_model.to_redis_zset()
|
|
|
|
|
- # print(len(recommend_list))
|
|
|
|
|
- # print(recommend_list)
|
|
|
|
|
- # joblib.dump(itemcf_model, "itemCF.model")
|
|
|
|
|
-
|
|
|
|
|
- # model = joblib.load("./itemCF.model")
|
|
|
|
|
- # recommend_list = model.inference(110102)
|
|
|
|
|
- # print(len(recommend_list))
|
|
|
|
|
- # print(recommend_list)
|
|
|
|
|
|
|
+ itemcf_model.train("00000000000000000000000011445301", n_jobs=4)
|