From e2dc1576c45a0198c7707118a0c6321fea66fbfc Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 28 Jan 2020 10:45:46 +0200 Subject: [PATCH] Store also true labels as sparse vectors in PAV backend, saving RAM (part of #377) --- annif/backend/pav.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/annif/backend/pav.py b/annif/backend/pav.py index e4e34146a..baa58e5bd 100644 --- a/annif/backend/pav.py +++ b/annif/backend/pav.py @@ -66,10 +66,12 @@ def _normalize_hits(self, hits, source_project): @staticmethod def _suggest_train_corpus(source_project, corpus): - true = [] data = [] row = [] col = [] + trow = [] + tcol = [] + ndocs = 0 for docid, doc in enumerate(corpus.documents): hits = source_project.suggest(doc.text) for cid in np.flatnonzero(hits.vector): @@ -77,9 +79,19 @@ def _suggest_train_corpus(source_project, corpus): row.append(docid) col.append(cid) subjects = annif.corpus.SubjectSet((doc.uris, doc.labels)) - true.append(subjects.as_vector(source_project.subjects)) - scores = csc_matrix(coo_matrix((data, (row, col)), dtype=np.float32)) - return scores, np.array(true) + for cid in np.flatnonzero(subjects.as_vector(source_project.subjects)): + trow.append(docid) + tcol.append(cid) + ndocs += 1 + scores = csc_matrix( + coo_matrix((data, (row, col)), + shape=(ndocs, len(source_project.subjects)), + dtype=np.float32)) + true = csc_matrix( + coo_matrix((np.ones(len(trow), dtype=np.bool), (trow, tcol)), + shape=(ndocs, len(source_project.subjects)), + dtype=np.bool)) + return scores, true def _create_pav_model(self, source_project_id, min_docs, corpus): self.info("creating PAV model for source {}, min_docs={}".format( @@ -94,7 +106,7 @@ def _create_pav_model(self, source_project_id, min_docs, corpus): continue # don't create model b/c of too few examples reg = IsotonicRegression(out_of_bounds='clip') cid_scores = scores[:, cid].toarray().flatten().astype(np.float64) - reg.fit(cid_scores, true[:, cid]) + reg.fit(cid_scores, true[:, cid].toarray().flatten()) pav_regressions[source_project.subjects[cid][0]] = reg self.info("created PAV model for {} concepts".format( len(pav_regressions)))