-
Notifications
You must be signed in to change notification settings - Fork 627
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #10 from RUCAIBox/master
Update
- Loading branch information
Showing
5 changed files
with
204 additions
and
40 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
import numpy as np | ||
from model.abstract_recommender import AbstractRecommender | ||
import scipy.sparse as sp | ||
import torch | ||
|
||
|
||
class ComputeSimilarity:
    """Block-wise cosine similarity between the columns of a sparse matrix.

    If it is computed on URM=|users|x|items|, pass the URM as is.
    If it is computed on ICM=|items|x|features|, pass the ICM transposed.

    The shrinkage/asymmetric-cosine formulation follows:
        Aiolli, F. (2013, October). Efficient top-n recommendation for very
        large scale binary rated datasets. In Proceedings of the 7th ACM
        Conference on Recommender Systems (pp. 273-280). ACM.
    """

    def __init__(self, dataMatrix, topK=100, shrink=0, normalize=True):
        """
        :param dataMatrix: sparse matrix whose columns are compared
        :param topK: keep only the topK most similar columns per column
        :param shrink: shrinkage term added to the cosine denominator
        :param normalize: if True divide the dot product by the product of the norms
        """
        super(ComputeSimilarity, self).__init__()

        self.shrink = shrink
        self.normalize = normalize

        self.n_rows, self.n_columns = dataMatrix.shape
        # Never request more neighbors than there are columns.
        self.TopK = min(topK, self.n_columns)

        self.dataMatrix = dataMatrix.copy()

    def compute_similarity(self, block_size=100):
        """
        Compute the similarity for the given dataset.

        :param block_size: number of columns processed per vectorized step
        :return: sparse CSC matrix W of shape (self.n_columns, self.n_columns)
        """
        values = []
        rows = []
        cols = []

        self.dataMatrix = self.dataMatrix.astype(np.float32)

        # Per-column L2 norms, used in the cosine denominator.
        sumOfSquared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel()
        sumOfSquared = np.sqrt(sumOfSquared)

        end_col_local = self.n_columns
        start_col_block = 0

        # Compute all similarities for each item using vectorization
        while start_col_block < end_col_local:

            end_col_block = min(start_col_block + block_size, end_col_local)
            this_block_size = end_col_block - start_col_block

            # Dense (n_rows, this_block_size) slice for this block of columns.
            # BUGFIX: the original code squeezed this array and re-expanded it
            # with np.atleast_2d, which for a single-column block yielded shape
            # (1, n_rows) and made the dot product below fail with a dimension
            # mismatch; keeping the 2-D (n_rows, block) layout is always valid.
            item_data = self.dataMatrix[:, start_col_block:end_col_block].toarray()

            # (n_columns, this_block_size) raw dot-product scores.
            this_block_weights = self.dataMatrix.T.dot(item_data)

            for col_index_in_block in range(this_block_size):

                this_column_weights = this_block_weights[:, col_index_in_block]

                columnIndex = col_index_in_block + start_col_block
                # A column is trivially most similar to itself; exclude it.
                this_column_weights[columnIndex] = 0.0

                # Apply normalization and shrinkage; 1e-6 ensures denominator != 0.
                if self.normalize:
                    denominator = sumOfSquared[columnIndex] * sumOfSquared + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights, 1 / denominator)

                elif self.shrink != 0:
                    this_column_weights = this_column_weights / self.shrink

                # Sort indices and select TopK.
                # Sorting is done in three steps. Faster than plain np.argsort
                # for a high number of columns:
                # - Partition the data to extract the set of relevant items
                # - Sort only the relevant items
                # - Get the original item index
                relevant_items_partition = (-this_column_weights).argpartition(self.TopK - 1)[0:self.TopK]
                relevant_items_partition_sorting = np.argsort(-this_column_weights[relevant_items_partition])
                top_k_idx = relevant_items_partition[relevant_items_partition_sorting]

                # Incrementally build sparse matrix, do not add zeros.
                notZerosMask = this_column_weights[top_k_idx] != 0.0
                numNotZeros = np.sum(notZerosMask)

                values.extend(this_column_weights[top_k_idx][notZerosMask])
                rows.extend(top_k_idx[notZerosMask])
                cols.extend(np.ones(numNotZeros) * columnIndex)

            start_col_block += block_size

        # End while on columns

        W_sparse = sp.csr_matrix((values, (rows, cols)),
                                 shape=(self.n_columns, self.n_columns),
                                 dtype=np.float32)

        return W_sparse.tocsc()
|
||
|
||
class ItemKNN(AbstractRecommender):
    """Memory-based item-to-item KNN recommender.

    Precomputes an item-item similarity matrix from the training
    interactions and scores a (user, item) pair as the dot product of the
    user's interaction row with the item's similarity column.
    """

    def __init__(self, config, dataset):
        self.device = config['device']
        self.USER_ID = config['USER_ID_FIELD']
        self.ITEM_ID = config['ITEM_ID_FIELD']
        self.n_users = len(dataset.field2id_token[self.USER_ID])
        self.n_items = len(dataset.field2id_token[self.ITEM_ID])

        # Sparse |users| x |items| training interaction matrix.
        self.interaction_matrix = dataset.train_matrix.tocsr().astype(np.float32)
        shape = self.interaction_matrix.shape
        assert self.n_users == shape[0] and self.n_items == shape[1]
        self.k = config['k']
        self.shrink = config['shrink'] if 'shrink' in config else 0.0
        # Item-item similarity matrix (|items| x |items|).
        self.w = ComputeSimilarity(self.interaction_matrix, topK=self.k, shrink=self.shrink).compute_similarity()
        # LIL format gives fast random access by (user, item) in predict().
        self.pred_mat = self.interaction_matrix.dot(self.w).tolil()

    def forward(self, user, item):
        pass

    def calculate_loss(self, interaction):
        # Non-parametric model: there is nothing to train.
        pass

    def predict(self, interaction):
        """Return the precomputed score for each (user, item) pair in *interaction*."""
        user = interaction[self.USER_ID]
        item = interaction[self.ITEM_ID]
        user = user.cpu().numpy().astype(int)
        item = item.cpu().numpy().astype(int)
        result = []

        for index in range(len(user)):
            uid = user[index]
            # BUGFIX: the original read `item[item]`, indexing the item array
            # with itself instead of selecting the current element.
            iid = item[index]
            score = self.pred_mat[uid, iid]
            result.append(score)
        result = torch.from_numpy(np.array(result)).to(self.device)
        return result
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters