User-Based / Item-Based Implementation
User-Based: compute the user-user similarity matrix using cosine similarity, then predict a user's rating for each item as a similarity-weighted sum of the ratings given by the most similar users.
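In formula form (a standard formulation that should match the code below, which effectively treats a missing rating as 0; sim(u, v) is the cosine similarity, r_{v,i} is user v's rating on item i, and N_k(u) is the set of the top-k users most similar to u):

sim(u, v) = \frac{\sum_i r_{u,i} \, r_{v,i}}{\sqrt{\sum_i r_{u,i}^2} \, \sqrt{\sum_i r_{v,i}^2}}, \qquad \hat{r}_{u,i} = \frac{\sum_{v \in N_k(u)} sim(u, v) \, r_{v,i}}{\sum_{v \in N_k(u)} sim(u, v)}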
import pandas as pd
import numpy as np

def userCF(users, items):
    # users: {user: {item: rating}}, items: {item: {user: rating}}
    num_user = len(users.keys())
    sim_matrix_user = pd.DataFrame(np.zeros((num_user, num_user)),
                                   index=users.keys(), columns=users.keys())
    for i in range(num_user):
        for j in range(i, num_user):
            dot_prod = 0
            num_ui, num_uj = 0, 0
            ui = sim_matrix_user.columns[i]
            uj = sim_matrix_user.columns[j]
            for item in items.keys():
                # cosine similarity: accumulate the squared norms and the dot product
                if item in users[ui]:
                    num_ui += users[ui][item] ** 2
                if item in users[uj]:
                    num_uj += users[uj][item] ** 2
                if item in users[ui] and item in users[uj]:
                    dot_prod += users[ui][item] * users[uj][item]
            similarity = dot_prod / (np.sqrt(num_uj) * np.sqrt(num_ui))
            # fill both halves of the symmetric matrix
            sim_matrix_user.loc[uj, ui] = similarity
            sim_matrix_user.loc[ui, uj] = similarity
    return sim_matrix_user
def user_Recommend(user, sim_matrix_user, users, items, k):
    # select the top-k most similar users (skip position 0, which is the user itself)
    similar_users = sim_matrix_user[user].sort_values(ascending=False)
    topk_users = similar_users[1:1 + k]
    # weighted sum of the top-k users' ratings over all items, normalized by the total similarity
    rating_df = pd.DataFrame()
    user_rating = pd.DataFrame(users)
    for u in topk_users.keys():
        # DataFrame.append was removed in pandas 2.0, so stack the weighted rows with concat
        rating_df = pd.concat([rating_df, (topk_users[u] * user_rating[u]).to_frame().T])
    rating_df = (rating_df.sum() / sum(topk_users)).sort_values(ascending=False)
    return topk_users, rating_df
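A minimal usage sketch with made-up toy data (the nested-dict format users[user][item] = rating and items[item][user] = rating is assumed from the functions above):

users = {'u1': {'i1': 5, 'i2': 3}, 'u2': {'i1': 4, 'i3': 2}, 'u3': {'i2': 1, 'i3': 4}}
items = {'i1': {'u1': 5, 'u2': 4}, 'i2': {'u1': 3, 'u3': 1}, 'i3': {'u2': 2, 'u3': 4}}
sim_matrix_user = userCF(users, items)
topk_users, pred_ratings = user_Recommend('u1', sim_matrix_user, users, items, k=2)
print(pred_ratings)  # items sorted by u1's predicted (weighted-average) rating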
Another implementation:
Compute the user-user similarity
Some users interact with a great many items, not because those items are similar to one another but simply because the users are unusually active (for example, a reseller). To reduce the influence of such users, an Inverse User Frequency (IUF) weight is applied when accumulating co-occurrences, so the similarity here is an IUF-weighted co-occurrence count normalized by the two users' activity levels.
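In formula form, the similarity the function below computes should be (N(u): the set of items user u has clicked, U(i): the set of users who clicked item i; the 1/log term is the IUF weight):

w_{uv} = \frac{\sum_{i \in N(u) \cap N(v)} \frac{1}{\log\left(1 + |U(i)|\right)}}{\sqrt{|N(u)| \, |N(v)|}}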
import math
import pickle
from collections import defaultdict
from tqdm import tqdm

def usercf_sim(all_click_df, user_activate_degree_dict):
    """
    Compute the user-user similarity matrix.
    :param all_click_df: click-log DataFrame
    :param user_activate_degree_dict: dict of user activity levels
    :return: user-user similarity matrix
    Idea: user-based collaborative filtering (see the previous recommender-systems
    study session for details) + association rules
    """
    # returns a dict whose keys are items and whose values are lists of (user, click_time)
    item_user_time_dict = get_item_user_time_dict(all_click_df)

    u2u_sim = {}
    user_cnt = defaultdict(int)
    for item, user_time_list in tqdm(item_user_time_dict.items()):
        for u, click_time in user_time_list:
            user_cnt[u] += 1
            u2u_sim.setdefault(u, {})
            for v, click_time in user_time_list:
                u2u_sim[u].setdefault(v, 0)
                if u == v:
                    continue
                # IUF weighting: items clicked by many users contribute less
                u2u_sim[u][v] += 1 / math.log(len(user_time_list) + 1)

    u2u_sim_ = u2u_sim.copy()
    for u, related_users in u2u_sim.items():
        for v, wij in related_users.items():
            # normalize by the two users' activity levels
            u2u_sim_[u][v] = wij / math.sqrt(user_cnt[u] * user_cnt[v])

    # save the similarity matrix to disk
    pickle.dump(u2u_sim_, open(save_path + 'usercf_u2u_sim.pkl', 'wb'))
    return u2u_sim_
Compute the top-K ranking
# user-based recall: u2u2i
def user_based_recommend(user_id, user_item_time_dict, u2u_sim, sim_user_topk, recall_item_num,
                         item_topk_click, item_created_time_dict, emb_i2i_sim):
    """
    Recall based on user-user collaborative filtering.
    :param user_id: user id
    :param user_item_time_dict: dict, each user's click sequence ordered by click time
                                {user1: [(item1, time1), (item2, time2)...]...}
    :param u2u_sim: dict, user-user similarity matrix
    :param sim_user_topk: int, number of most similar users to use
    :param recall_item_num: int, number of articles to recall
    :param item_topk_click: list of the most-clicked articles, used to pad the recall list
    :param item_created_time_dict: dict of article creation times
    :param emb_i2i_sim: dict, article similarity matrix computed from content embeddings
    :return: recalled articles as [(item1, score1), (item2, score2)...]
    """
    # the user's historical interactions
    user_item_time_list = user_item_time_dict[user_id]  # [(item1, time1), (item2, time2)...]
    user_hist_items = set([i for i, t in user_item_time_list])  # a user may click the same article several times, so deduplicate

    items_rank = {}
    for sim_u, wuv in sorted(u2u_sim[user_id].items(), key=lambda x: x[1], reverse=True)[:sim_user_topk]:
        for i, click_time in user_item_time_dict[sim_u]:
            if i in user_hist_items:
                continue
            items_rank.setdefault(i, 0)
            items_rank[i] += wuv

    # pad with popular items if the recall list is too short
    if len(items_rank) < recall_item_num:
        for i, item in enumerate(item_topk_click):
            if item in items_rank:  # padded items must not already be in the list
                continue
            items_rank[item] = - i - 100  # any negative score works
            if len(items_rank) == recall_item_num:
                break

    items_rank = sorted(items_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]
    return items_rank
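A minimal, self-contained sketch of calling user_based_recommend with hand-built toy dictionaries (all ids, click times, and similarity values are made up for illustration):

user_item_time_dict = {1: [(101, 0.1), (102, 0.2)],
                       2: [(101, 0.3), (103, 0.4)],
                       3: [(102, 0.5), (104, 0.6)]}
u2u_sim = {1: {2: 0.8, 3: 0.5}}
item_topk_click = [101, 102, 103, 104, 105]
recs = user_based_recommend(user_id=1, user_item_time_dict=user_item_time_dict,
                            u2u_sim=u2u_sim, sim_user_topk=2, recall_item_num=3,
                            item_topk_click=item_topk_click,
                            item_created_time_dict={}, emb_i2i_sim={})
print(recs)  # [(103, 0.8), (104, 0.5), (101, -100)] -- the last entry comes from hot-item padding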
Item-Based: compute the item-item similarity matrix using cosine similarity, then predict each user's rating for an item as a similarity-weighted sum of that user's ratings on the most similar items.
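In formula form (again a standard formulation that should match the code below; sim(i, j) is the item-item cosine similarity and N_k(i) the top-k items most similar to i):

\hat{r}_{u,i} = \frac{\sum_{j \in N_k(i)} sim(i, j) \, r_{u,j}}{\sum_{j \in N_k(i)} sim(i, j)}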
def itemCF(users, items):
    # users: {user: {item: rating}}, items: {item: {user: rating}}
    num_item = len(items.keys())
    sim_matrix_item = pd.DataFrame(np.zeros((num_item, num_item)),
                                   index=items.keys(), columns=items.keys())
    for i in range(num_item):
        for j in range(i, num_item):
            dot_prod = 0
            num_ii, num_ij = 0, 0
            ii = sim_matrix_item.columns[i]
            ij = sim_matrix_item.columns[j]
            for user in users.keys():
                # cosine similarity: accumulate the squared norms and the dot product
                if user in items[ii]:
                    num_ii += items[ii][user] ** 2
                if user in items[ij]:
                    num_ij += items[ij][user] ** 2
                if user in items[ii] and user in items[ij]:
                    dot_prod += items[ii][user] * items[ij][user]
            similarity = dot_prod / (np.sqrt(num_ij) * np.sqrt(num_ii))
            # fill both halves of the symmetric matrix
            sim_matrix_item.loc[ij, ii] = similarity
            sim_matrix_item.loc[ii, ij] = similarity
    return sim_matrix_item
def item_Recommend(item, sim_matrix_item, users, items, k):
    # select the top-k most similar items (skip position 0, which is the item itself)
    similar_items = sim_matrix_item[item].sort_values(ascending=False)
    topk_items = similar_items[1:1 + k]
    # weighted sum of each user's ratings on the top-k similar items, normalized by the total similarity
    rating_df = pd.DataFrame()
    item_rating = pd.DataFrame(items)
    for i in topk_items.keys():
        # DataFrame.append was removed in pandas 2.0, so stack the weighted rows with concat
        rating_df = pd.concat([rating_df, (topk_items[i] * item_rating[i]).to_frame().T])
    rating_df = (rating_df.sum() / sum(topk_items)).sort_values(ascending=False)
    return topk_items, rating_df
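The item-based pair can be exercised the same way, reusing the toy users/items dicts from the user-based sketch above:

sim_matrix_item = itemCF(users, items)
topk_items, pred_ratings = item_Recommend('i1', sim_matrix_item, users, items, k=2)
print(pred_ratings)  # each user's predicted rating for 'i1', from its top-2 similar items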
Another implementation:
Compute the item-item similarity
def itemcf_sim(df, item_created_time_dict):
    """
    Compute the article-article similarity matrix.
    :param df: click-log DataFrame
    :param item_created_time_dict: dict of article creation times
    :return: article-article similarity matrix
    Idea: item-based collaborative filtering (see the previous recommender-systems
    study session for details) + association rules
    """
    user_item_time_dict = get_user_item_time(df)

    # compute item-item similarity
    i2i_sim = {}
    item_cnt = defaultdict(int)
    for user, item_time_list in tqdm(user_item_time_dict.items()):
        # the time factor can also be taken into account when refining item-based CF
        for loc1, (i, i_click_time) in enumerate(item_time_list):
            item_cnt[i] += 1
            i2i_sim.setdefault(i, {})
            for loc2, (j, j_click_time) in enumerate(item_time_list):
                if i == j:
                    continue
                # penalize very active users: their co-clicks carry less information
                i2i_sim[i].setdefault(j, 0)
                i2i_sim[i][j] += 1 / math.log(len(item_time_list) + 1)

    i2i_sim_ = i2i_sim.copy()
    for i, related_items in i2i_sim.items():
        for j, wij in related_items.items():
            # normalize by the two items' click counts
            i2i_sim_[i][j] = wij / math.sqrt(item_cnt[i] * item_cnt[j])

    # save the similarity matrix to disk
    pickle.dump(i2i_sim_, open(save_path + 'itemcf_i2i_sim.pkl', 'wb'))
    return i2i_sim_
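In formula form, the similarity computed above should be (U(i): users who clicked item i, N(u): items clicked by user u; the 1/log term down-weights co-clicks coming from very active users):

w_{ij} = \frac{\sum_{u \in U(i) \cap U(j)} \frac{1}{\log\left(1 + |N(u)|\right)}}{\sqrt{|U(i)| \, |U(j)|}}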
Compute item scores and select the top-K items for recall
def item_based_recommend(user_id, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num,
                         item_topk_click, item_created_time_dict, emb_i2i_sim):
    """
    Recall based on item-item collaborative filtering.
    :param user_id: user id
    :param user_item_time_dict: dict, each user's click sequence ordered by click time
                                {user1: [(item1, time1), (item2, time2)...]...}
    :param i2i_sim: dict, article-article similarity matrix
    :param sim_item_topk: int, number of most similar articles to use per clicked article
    :param recall_item_num: int, number of articles to recall
    :param item_topk_click: list of the most-clicked articles, used to pad the recall list
    :param item_created_time_dict: dict of article creation times
    :param emb_i2i_sim: dict, article similarity matrix computed from content embeddings
    :return: recalled articles as [(item1, score1), (item2, score2)...]
    """
    # the user's historical interactions
    user_hist_items = user_item_time_dict[user_id]
    user_hist_items_ = {item_id for item_id, _ in user_hist_items}

    item_rank = {}
    for loc, (i, click_time) in enumerate(user_hist_items):
        for j, wij in sorted(i2i_sim[i].items(), key=lambda x: x[1], reverse=True)[:sim_item_topk]:
            if j in user_hist_items_:
                continue
            item_rank.setdefault(j, 0)
            item_rank[j] += wij

    # pad with popular items if fewer than recall_item_num were found
    if len(item_rank) < recall_item_num:
        for i, item in enumerate(item_topk_click):
            if item in item_rank:  # padded items must not already be in the list
                continue
            item_rank[item] = - i - 100  # any negative score works
            if len(item_rank) == recall_item_num:
                break

    item_rank = sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]
    return item_rank
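And a minimal, self-contained sketch for item_based_recommend, again with made-up toy dictionaries:

user_item_time_dict = {1: [(101, 0.1), (102, 0.2)]}
i2i_sim = {101: {103: 0.9, 104: 0.4}, 102: {103: 0.2, 105: 0.7}}
item_topk_click = [101, 102, 103, 104, 105, 106]
recs = item_based_recommend(user_id=1, user_item_time_dict=user_item_time_dict,
                            i2i_sim=i2i_sim, sim_item_topk=2, recall_item_num=3,
                            item_topk_click=item_topk_click,
                            item_created_time_dict={}, emb_i2i_sim={})
print(recs)  # roughly [(103, 1.1), (105, 0.7), (104, 0.4)]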