From 0c6ee3be4dda6560d2e0388ba4821f15aa4feb22 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Fri, 4 Oct 2019 14:05:31 +0300
Subject: [PATCH] Split up TFIDFBackend.train

---
 annif/backend/tfidf.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py
index b753174a9..ec5d6e6d0 100644
--- a/annif/backend/tfidf.py
+++ b/annif/backend/tfidf.py
@@ -51,6 +51,16 @@ def initialize(self):
         self._initialize_vectorizer()
         self._initialize_index()
 
+    def _create_index(self, veccorpus):
+        self.info('creating similarity index')
+        gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
+        self._index = gensim.similarities.SparseMatrixSimilarity(
+            gscorpus, num_features=len(self._vectorizer.vocabulary_))
+        annif.util.atomic_save(
+            self._index,
+            self.datadir,
+            self.INDEX_FILE)
+
     def train(self, corpus, project):
         if corpus.is_empty():
             raise NotSupportedException(
@@ -67,14 +77,7 @@ def train(self, corpus, project):
             self.datadir,
             self.VECTORIZER_FILE,
             method=joblib.dump)
-        self.info('creating similarity index')
-        gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
-        self._index = gensim.similarities.SparseMatrixSimilarity(
-            gscorpus, num_features=len(self._vectorizer.vocabulary_))
-        annif.util.atomic_save(
-            self._index,
-            self.datadir,
-            self.INDEX_FILE)
+        self._create_index(veccorpus)
 
     def _suggest(self, text, project, params):
         self.debug('Suggesting subjects for text "{}..." (len={})'.format(