
Commit

Store returned scores as sparse vectors in PAV backend, saving RAM (part of #377)
osma committed Jan 28, 2020
1 parent 8c1a09a commit 7008316
Showing 1 changed file with 13 additions and 5 deletions.
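The idea behind the change, as a minimal standalone sketch (not Annif code; the helper name build_score_matrix, the shape argument and the toy vectors are illustrative): each source project returns one dense score vector per document, and most of its entries are zero, so instead of stacking the dense vectors into one large numpy array, only the nonzero entries are collected as COO triplets and converted to a CSC matrix.

import numpy as np
from scipy.sparse import coo_matrix, csc_matrix

def build_score_matrix(score_vectors, n_subjects):
    """Collect dense per-document score vectors into a CSC sparse matrix."""
    data, row, col = [], [], []
    n_docs = 0
    for docid, vector in enumerate(score_vectors):
        n_docs = docid + 1
        for cid in np.flatnonzero(vector):   # keep only the nonzero scores
            data.append(vector[cid])
            row.append(docid)
            col.append(cid)
    coo = coo_matrix((data, (row, col)), shape=(n_docs, n_subjects),
                     dtype=np.float32)
    return csc_matrix(coo)                   # CSC allows cheap column slicing

# toy example: 3 documents, 5 subjects, mostly zero scores
vectors = [np.array([0.0, 0.9, 0.0, 0.0, 0.2]),
           np.array([0.0, 0.0, 0.0, 0.7, 0.0]),
           np.array([0.1, 0.0, 0.0, 0.0, 0.0])]
scores = build_score_matrix(vectors, n_subjects=5)
print(scores.shape, scores.nnz)              # (3, 5) 4

A dense float32 matrix grows as n_docs * n_subjects regardless of content, while the sparse form stores only the nonzero scores plus their indices, which is where the RAM saving comes from.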
18 changes: 13 additions & 5 deletions annif/backend/pav.py
@@ -5,6 +5,7 @@
 
 import os.path
 import joblib
+from scipy.sparse import coo_matrix, csc_matrix
 from sklearn.isotonic import IsotonicRegression
 import numpy as np
 import annif.corpus
@@ -65,14 +66,20 @@ def _normalize_hits(self, hits, source_project):
 
     @staticmethod
     def _suggest_train_corpus(source_project, corpus):
-        scores = []
         true = []
-        for doc in corpus.documents:
+        data = []
+        row = []
+        col = []
+        for docid, doc in enumerate(corpus.documents):
             hits = source_project.suggest(doc.text)
-            scores.append(hits.vector)
+            for cid in np.flatnonzero(hits.vector):
+                data.append(hits.vector[cid])
+                row.append(docid)
+                col.append(cid)
             subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
             true.append(subjects.as_vector(source_project.subjects))
-        return np.array(scores), np.array(true)
+        scores = csc_matrix(coo_matrix((data, (row, col)), dtype=np.float32))
+        return scores, np.array(true)
 
     def _create_pav_model(self, source_project_id, min_docs, corpus):
         self.info("creating PAV model for source {}, min_docs={}".format(
@@ -86,7 +93,8 @@ def _create_pav_model(self, source_project_id, min_docs, corpus):
             if true[:, cid].sum() < min_docs:
                 continue  # don't create model b/c of too few examples
             reg = IsotonicRegression(out_of_bounds='clip')
-            reg.fit(scores[:, cid].astype(np.float64), true[:, cid])
+            cid_scores = scores[:, cid].toarray().flatten().astype(np.float64)
+            reg.fit(cid_scores, true[:, cid])
             pav_regressions[source_project.subjects[cid][0]] = reg
         self.info("created PAV model for {} concepts".format(
             len(pav_regressions)))
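For the second hunk, a hedged illustration with synthetic data (the toy arrays and the models dict are not from the commit): IsotonicRegression works on dense 1-D arrays, so each subject column is densified on its own with toarray().flatten() rather than densifying the whole matrix at once, keeping peak memory proportional to the number of documents rather than documents times subjects.

import numpy as np
from scipy.sparse import csc_matrix
from sklearn.isotonic import IsotonicRegression

rng = np.random.default_rng(0)
scores = csc_matrix(rng.random((100, 3)).astype(np.float32))  # toy score matrix
true = (rng.random((100, 3)) > 0.5).astype(int)               # toy binary gold standard

models = {}
for cid in range(scores.shape[1]):
    # densify a single column only, never the whole matrix
    cid_scores = scores[:, cid].toarray().flatten().astype(np.float64)
    reg = IsotonicRegression(out_of_bounds='clip')
    reg.fit(cid_scores, true[:, cid])
    models[cid] = reg

print(models[0].predict([0.1, 0.5, 0.9]))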
