Skip to content

Commit

Permalink
Store also true labels as sparse vectors in PAV backend, saving RAM (…
Browse files Browse the repository at this point in the history
…part of #377)
  • Loading branch information
osma committed Jan 28, 2020
1 parent 7008316 commit e2dc157
Showing 1 changed file with 17 additions and 5 deletions.
22 changes: 17 additions & 5 deletions annif/backend/pav.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,20 +66,32 @@ def _normalize_hits(self, hits, source_project):

@staticmethod
def _suggest_train_corpus(source_project, corpus):
true = []
data = []
row = []
col = []
trow = []
tcol = []
ndocs = 0
for docid, doc in enumerate(corpus.documents):
hits = source_project.suggest(doc.text)
for cid in np.flatnonzero(hits.vector):
data.append(hits.vector[cid])
row.append(docid)
col.append(cid)
subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
true.append(subjects.as_vector(source_project.subjects))
scores = csc_matrix(coo_matrix((data, (row, col)), dtype=np.float32))
return scores, np.array(true)
for cid in np.flatnonzero(subjects.as_vector(source_project.subjects)):
trow.append(docid)
tcol.append(cid)
ndocs += 1
scores = csc_matrix(
coo_matrix((data, (row, col)),
shape=(ndocs, len(source_project.subjects)),
dtype=np.float32))
true = csc_matrix(
coo_matrix((np.ones(len(trow), dtype=np.bool), (trow, tcol)),
shape=(ndocs, len(source_project.subjects)),
dtype=np.bool))
return scores, true

def _create_pav_model(self, source_project_id, min_docs, corpus):
self.info("creating PAV model for source {}, min_docs={}".format(
Expand All @@ -94,7 +106,7 @@ def _create_pav_model(self, source_project_id, min_docs, corpus):
continue # don't create model b/c of too few examples
reg = IsotonicRegression(out_of_bounds='clip')
cid_scores = scores[:, cid].toarray().flatten().astype(np.float64)
reg.fit(cid_scores, true[:, cid])
reg.fit(cid_scores, true[:, cid].toarray().flatten())
pav_regressions[source_project.subjects[cid][0]] = reg
self.info("created PAV model for {} concepts".format(
len(pav_regressions)))
Expand Down

0 comments on commit e2dc157

Please sign in to comment.