-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrecommender_engine.py
72 lines (41 loc) · 2.07 KB
/
recommender_engine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from fuzzywuzzy import fuzz
import pandas as pd
import numpy as np
import scipy.sparse
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
# from surprise import Reader, Dataset, SVD, evaluate
import warnings; warnings.simplefilter('ignore')
def get_recommendations(title, number_of_recommendations):
    """Return the titles most similar to the best fuzzy match for *title*.

    The catalogue is read from ``data/smd.txt`` (a CSV with a ``title``
    column) and the sparse feature matrix from ``data/tfidf_matrix.npz`` on
    every call; both paths are relative to the current working directory.

    Args:
        title: Free-text movie title supplied by the user. It is compared
            case-insensitively with every catalogue title using the sum of
            ``fuzz.ratio`` and ``fuzz.partial_ratio``; the highest-scoring
            title wins, and ties go to the title that appears first.
        number_of_recommendations: Maximum number of titles to return.
            Zero or a negative value yields an empty result.

    Returns:
        A pandas Series of titles ordered from most to least similar to the
        matched movie. The matched row itself is never included. The Series
        is empty when the catalogue contains no string titles.
    """
    catalogue = pd.read_csv('data/smd.txt')
    # Positional 0..N-1 index so that labels and matrix row numbers coincide.
    titles = catalogue['title'].reset_index(drop=True)

    # Missing cells come back from read_csv as float NaN, which has no
    # .lower(); leave them out of the fuzzy match instead of crashing.
    candidates = [name for name in titles if isinstance(name, str)]
    if not candidates:
        return titles.iloc[[]]

    query = title.lower()

    def match_score(name):
        lowered = name.lower()
        return fuzz.ratio(query, lowered) + fuzz.partial_ratio(query, lowered)

    # max() returns the first maximal element, the same winner a stable
    # descending sort would put at position 0.
    best_match_title = max(candidates, key=match_score)

    # When a title occurs more than once, the last occurrence is used
    # (later entries overwrite earlier ones while the dict is built).
    title_to_row = dict(zip(titles, titles.index))
    row = title_to_row[best_match_title]

    # NOTE(review): assumes row i of the matrix describes row i of the CSV --
    # confirm against the script that generates the data files.
    tfidf_matrix = scipy.sparse.load_npz('data/tfidf_matrix.npz').tocsr()
    # Only one row of the similarity matrix is needed, so compute just that
    # row instead of the full N x N product.
    scores = linear_kernel(tfidf_matrix[row:row + 1], tfidf_matrix).ravel()

    # Stable descending order: equal scores keep catalogue order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the matched movie by row number rather than by rank; with tied
    # scores it is not guaranteed to be ranked first.
    ranked = ranked[ranked != row]

    limit = max(number_of_recommendations, 0)
    return titles.iloc[ranked[:limit]]
# if __name__ == "__main__":
# print(get_recommendations('The Godfather', 5))
#
#
# get_recommendations('The Dark Knight', 5)