-
Notifications
You must be signed in to change notification settings - Fork 0
/
FlixsterDataSub.py
142 lines (114 loc) · 5.87 KB
/
FlixsterDataSub.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import numpy as np
import pandas as pd
def load_user_item_matrix_FX_All(max_user=2370, max_item=2835):
    """
    Load the user x item rating matrix from ``Flixster/subset_FX_O.dat``.

    Every non-blank line of the file must have the form
    ``user_id::movie_id::rating::timestamp``. Ids are 1-based: the rating of
    user ``u`` for movie ``m`` is stored at ``[u - 1, m - 1]``. The timestamp
    field is ignored. If a (user, movie) pair occurs more than once, the last
    occurrence wins.

    :param max_user: highest user id to keep; also the number of rows
    :param max_item: highest movie id to keep; also the number of columns
    :return: ``max_user x max_item`` array of ratings, 0 where no rating was read
    :raises FileNotFoundError: if the data file does not exist
    :raises ValueError: if a non-blank line does not have four ``::``-separated
        fields, or its ids / rating are not numeric
    """
    # Alternative inputs noted by the original author (not used here):
    # Flixster/subset_FX_O.csv, FX/FX_subsubset_Users.dat,
    # All_2370_allUsers_KNN_fancy_imputation_FX_k_30
    user_item = np.zeros(shape=(max_user, max_item))
    with open("Flixster/subset_FX_O.dat", 'r') as f:
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            user_id, movie_id, rating, _ = line.split("::")
            user_id, movie_id, rating = int(user_id), int(movie_id), float(rating)
            # The lower bound matters: an id of 0 or below would otherwise be
            # turned into a negative index and silently overwrite another cell.
            if 1 <= user_id <= max_user and 1 <= movie_id <= max_item:
                user_item[user_id - 1, movie_id - 1] = rating
    return user_item
def load_user_item_matrix_FX_TrainingSet(max_user=2370, max_item=2835):
    """
    Load the training user x item rating matrix from
    ``Flixster/trainingSet_FX_1.dat``.

    Every non-blank line of the file must have the form
    ``user_id::movie_id::rating::timestamp``. Ids are 1-based: the rating of
    user ``u`` for movie ``m`` is stored at ``[u - 1, m - 1]``. The timestamp
    field is ignored. If a (user, movie) pair occurs more than once, the last
    occurrence wins.

    :param max_user: highest user id to keep; also the number of rows
    :param max_item: highest movie id to keep; also the number of columns
    :return: ``max_user x max_item`` array of ratings, 0 where no rating was read
    :raises FileNotFoundError: if the data file does not exist
    :raises ValueError: if a non-blank line does not have four ``::``-separated
        fields, or its ids / rating are not numeric
    """
    # Alternative input noted by the original author (not used here):
    # New_Flixster/FX_train.csv
    user_item = np.zeros(shape=(max_user, max_item))
    with open("Flixster/trainingSet_FX_1.dat", 'r') as f:
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            user_id, movie_id, rating, _ = line.split("::")
            user_id, movie_id, rating = int(user_id), int(movie_id), float(rating)
            # The lower bound matters: an id of 0 or below would otherwise be
            # turned into a negative index and silently overwrite another cell.
            if 1 <= user_id <= max_user and 1 <= movie_id <= max_item:
                user_item[user_id - 1, movie_id - 1] = rating
    return user_item
def load_user_item_matrix_FX_Test(max_user=2370, max_item=2835):
    """
    Load the test user x item rating matrix from ``Flixster/testSet_FX_1.dat``.

    Every non-blank line of the file must have the form
    ``user_id::movie_id::rating::timestamp``. Ids are 1-based: the rating of
    user ``u`` for movie ``m`` is stored at ``[u - 1, m - 1]``. The timestamp
    field is ignored. If a (user, movie) pair occurs more than once, the last
    occurrence wins.

    :param max_user: highest user id to keep; also the number of rows
    :param max_item: highest movie id to keep; also the number of columns
    :return: ``max_user x max_item`` array of ratings, 0 where no rating was read
    :raises FileNotFoundError: if the data file does not exist
    :raises ValueError: if a non-blank line does not have four ``::``-separated
        fields, or its ids / rating are not numeric
    """
    # Alternative input noted by the original author (not used here):
    # FX/FX_original_test.dat
    user_item = np.zeros(shape=(max_user, max_item))
    with open("Flixster/testSet_FX_1.dat", 'r') as f:
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            user_id, movie_id, rating, _ = line.split("::")
            user_id, movie_id, rating = int(user_id), int(movie_id), float(rating)
            # The lower bound matters: an id of 0 or below would otherwise be
            # turned into a negative index and silently overwrite another cell.
            if 1 <= user_id <= max_user and 1 <= movie_id <= max_item:
                user_item[user_id - 1, movie_id - 1] = rating
    return user_item
def load_user_item_FX_Complet(max_user=2370, max_item=2835):
    """
    Load the user x item rating matrix from
    ``Flixster/With_Fancy_KNN/TrainingSet_2370_allUsers_KNN_fancy_imputation_FX_k_30.dat``.

    NOTE(review): the file name suggests this is the training set after KNN
    imputation with k=30 -- confirm against the script that produced the file.

    Every non-blank line of the file must have the form
    ``user_id::movie_id::rating::timestamp``. Ids are 1-based: the rating of
    user ``u`` for movie ``m`` is stored at ``[u - 1, m - 1]``. The timestamp
    field is ignored. If a (user, movie) pair occurs more than once, the last
    occurrence wins.

    :param max_user: highest user id to keep; also the number of rows
    :param max_item: highest movie id to keep; also the number of columns
    :return: ``max_user x max_item`` array of ratings, 0 where no rating was read
    :raises FileNotFoundError: if the data file does not exist
    :raises ValueError: if a non-blank line does not have four ``::``-separated
        fields, or its ids / rating are not numeric
    """
    user_item = np.zeros(shape=(max_user, max_item))
    with open(
            "Flixster/With_Fancy_KNN/TrainingSet_2370_allUsers_KNN_fancy_imputation_FX_k_30.dat",
            'r') as f:
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            user_id, movie_id, rating, _ = line.split("::")
            user_id, movie_id, rating = int(user_id), int(movie_id), float(rating)
            # The lower bound matters: an id of 0 or below would otherwise be
            # turned into a negative index and silently overwrite another cell.
            if 1 <= user_id <= max_user and 1 <= movie_id <= max_item:
                user_item[user_id - 1, movie_id - 1] = rating
    return user_item
def load_user_item_matrix_FX_limited_ratings(limit=20):
    """
    Return the full rating matrix with at most ``limit`` ratings kept per user.

    The matrix comes from ``load_user_item_matrix_FX_All()`` called with its
    default arguments. For each row, the columns whose rating is greater than
    zero are shuffled with ``np.random.shuffle`` (global NumPy random state)
    and only the first ``limit`` of them are copied to the result; every
    other cell of the result is 0.

    :param limit: maximum number of ratings kept per user
    :return: array with the same shape as the full matrix
    """
    full_matrix = load_user_item_matrix_FX_All()
    limited = np.zeros(shape=full_matrix.shape)
    for row, ratings in enumerate(full_matrix):
        # column indices of this user's ratings, in random order
        rated_columns = np.flatnonzero(ratings > 0)
        np.random.shuffle(rated_columns)
        # copy the retained ratings in one indexed assignment
        kept = rated_columns[:limit]
        limited[row, kept] = ratings[kept]
    return limited
def load_user_item_matrix_FX_trainMasked(max_user=2370, max_item=2835, file_index=-1):
    """
    Load a masked training user x item rating matrix from one of the files
    listed in ``masked_files`` below.

    The list is currently empty, so the function raises ``IndexError`` until
    at least one path has been added to it.

    Every non-blank line of the selected file must have the form
    ``user_id::movie_id::rating::timestamp``. Ids are 1-based: the rating of
    user ``u`` for movie ``m`` is stored at ``[u - 1, m - 1]``. The timestamp
    field is ignored. If a (user, movie) pair occurs more than once, the last
    occurrence wins.

    :param max_user: highest user id to keep; also the number of rows
    :param max_item: highest movie id to keep; also the number of columns
    :param file_index: position in ``masked_files`` of the file to load; the
        default -1 selects the last entry
    :return: ``max_user x max_item`` array of ratings, 0 where no rating was read
    :raises IndexError: if ``masked_files`` is empty or ``file_index`` is out
        of range
    :raises FileNotFoundError: if the selected file does not exist
    :raises ValueError: if a non-blank line does not have four ``::``-separated
        fields, or its ids / rating are not numeric
    """
    masked_files = [
        # Add the paths of the masked training files here; the first entry is #0.
    ]
    if not masked_files:
        # Fail with an actionable message instead of "list index out of range".
        raise IndexError(
            "masked_files is empty: add at least one path inside "
            "load_user_item_matrix_FX_trainMasked before calling it"
        )
    path = masked_files[file_index]
    user_item = np.zeros(shape=(max_user, max_item))
    with open(path, 'r') as f:
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            user_id, movie_id, rating, _ = line.split("::")
            user_id, movie_id, rating = int(user_id), int(movie_id), float(rating)
            # The lower bound matters: an id of 0 or below would otherwise be
            # turned into a negative index and silently overwrite another cell.
            if 1 <= user_id <= max_user and 1 <= movie_id <= max_item:
                user_item[user_id - 1, movie_id - 1] = rating
    return user_item
def load_gender_vector_FX(max_user=2370):
    """
    Load the gender of the users listed in ``Flixster/subset_FX_User_O.csv``.

    Only the first ``max_user`` lines of the file are read; the user id column
    is not consulted, so entry ``i`` of the result belongs to whichever user is
    on the i-th non-blank line. The gender is taken from the second
    comma-separated field, and any further fields are ignored.

    :param max_user: number of leading lines of the file to read
    :return: 1-D array with 0 where the gender field is ``"M"`` and 1 for any
        other value
    :raises FileNotFoundError: if the data file does not exist
    :raises ValueError: if a non-blank line has fewer than two
        comma-separated fields
    """
    gender_vec = []
    with open("Flixster/subset_FX_User_O.csv", 'r') as f:
        for line in f.readlines()[:max_user]:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            # Star-unpacking accepts both the short layout and wider layouts
            # with extra trailing columns (location, ...).
            _user_id, gender, *_rest = line.split(",")
            # Strip so that padding around the value does not flip the label.
            gender_vec.append(0 if gender.strip() == "M" else 1)
    return np.asarray(gender_vec)
def load_user_item_matrix_FX_masked(max_user=2370, max_item=2835, file_index=-1):
    """
    Load a masked (obfuscated) user x item rating matrix from one of the files
    listed in ``files`` below.

    Every non-blank line of the selected file must have the form
    ``user_id::movie_id::rating::timestamp``. Ids are 1-based: the rating of
    user ``u`` for movie ``m`` is stored at ``[u - 1, m - 1]``. The timestamp
    field is ignored. If a (user, movie) pair occurs more than once, the last
    occurrence wins.

    :param max_user: highest user id to keep; also the number of rows
    :param max_item: highest movie id to keep; also the number of columns
    :param file_index: position in ``files`` of the file to load; the default
        -1 selects the last entry
    :return: ``max_user x max_item`` array of ratings, 0 where no rating was read
    :raises IndexError: if ``file_index`` is out of range for ``files``
    :raises FileNotFoundError: if the selected file does not exist
    :raises ValueError: if a non-blank line does not have four ``::``-separated
        fields, or its ids / rating are not numeric
    """
    files = [
        # Here add path to your files. Please note that we start from #0 like in the example
        "Flixster/BlurMe/All_FX_blurme_obfuscated_0.01_greedy_avg_top-1.dat",  # 0
    ]
    path = files[file_index]
    user_item = np.zeros(shape=(max_user, max_item))
    with open(path, 'r') as f:
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            user_id, movie_id, rating, _ = line.split("::")
            user_id, movie_id, rating = int(user_id), int(movie_id), float(rating)
            # The lower bound matters: an id of 0 or below would otherwise be
            # turned into a negative index and silently overwrite another cell.
            if 1 <= user_id <= max_user and 1 <= movie_id <= max_item:
                user_item[user_id - 1, movie_id - 1] = rating
    return user_item