-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbaseline.py
129 lines (91 loc) · 4.1 KB
/
baseline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import os, time, json
from concurrent.futures import ProcessPoolExecutor
from typing import Dict, List, Callable
from utils.tools import submit, coalesce, read_json
from utils.constants import MAX_THREADS, TRAIN_FOLDER, N_RECS, INPUT_FILE
class BaselineModel:
NAME = 'BaselineModel'
def __init__(self, binfreq: bool = True):
super().__init__()
self.popular = None
self.binfreq = binfreq
def preprocess(self, num_threads: int, verbose: bool) -> List[str]:
paths = os.listdir(TRAIN_FOLDER)
if verbose:
print('Initializing track counting')
# parallelize track counting
with ProcessPoolExecutor(max_workers=num_threads) as pool:
indexes = coalesce(len(paths), num_threads)
futures = list()
for i in range(num_threads):
start, end = indexes[i], indexes[i + 1]
futures.append(
pool.submit(count, paths[start:end], set if self.binfreq else tuple, verbose)
)
counter = futures.pop(0).result()
for f in futures:
new_counter = f.result()
# append those tracks that are in counter but are not in counter
counter |= {key: new_counter[key] for key in new_counter.keys() - counter.keys()}
# add those tracks that are both in counter and counter
counter |= {key: new_counter[key] + counter[key] for key in new_counter.keys() & counter.keys()}
# sort dictionary by value (since value stores occurrences)
self.popular = list(sorted(counter.keys(), key=counter.get, reverse=True))
return self.popular
def recommend(self, submit_path: str, num_threads: int, verbose: bool) -> Dict[int, List[str]]:
# extract playlists for evaluation
pids, tracks = zip(*read_json(INPUT_FILE).items())
# parallelize prediction
if verbose:
print(f'Initializing recommendation')
with ProcessPoolExecutor(max_workers=num_threads) as pool:
indexes = coalesce(len(pids), num_threads)
futures = list()
for i in range(num_threads):
start, end = indexes[i], indexes[i+1]
futures.append(
pool.submit(get_popular, dict(zip(pids[start:end], tracks[start:end])), self.popular, verbose)
)
playlists = futures.pop(0).result()
for _ in range(len(futures)):
playlists |= futures.pop(0).result()
submit(submit_path, playlists)
return playlists
def count(paths: List[str], func: Callable, verbose: bool) -> Dict[int, List[str]]:
counter = dict()
info = lambda i: print(f'Counting tracks from file {i}/{len(paths)}') if verbose and (i%10==0) else None
for i, path in enumerate(paths):
info(i)
playlists = read_json(f'{TRAIN_FOLDER}/{path}', func)
for pid, tracks in playlists.items():
for track in tracks:
try:
counter[track] += 1
except KeyError:
counter[track] = 1
return counter
def get_popular(playlists: Dict[int, List[str]], popular: List[int], verbose: bool) -> Dict[int, List[str]]:
info = lambda i: print(f'Getting popularity for playlist {i}/{len(playlists)}') if verbose and (i%100 == 0) else None
for i, (pid, tracks) in enumerate(playlists.items()):
info(i)
tracks = tracks.copy()
playlists[pid] = []
j = 0
while len(playlists[pid]) < N_RECS:
if not popular[j] in tracks:
playlists[pid].append(popular[j])
j += 1
return playlists
if __name__ == '__main__':
model = BaselineModel()
# training (counting track occurrences)
start = time.time()
tracks = model.preprocess(MAX_THREADS, True)
end = time.time()
print('Preprocessing time:', (end-start))
print('-'*80)
# prediction
start = time.time()
model.recommend('submissions/baseline.csv.gz', MAX_THREADS, True)
end = time.time()
print('Prediction time:', (end-start))