From a4687d40b6882752afe828685a37538f93de7e40 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Thu, 26 Sep 2019 15:14:26 +0300
Subject: [PATCH 1/9] Speed up conversion of document to subject corpus using
 buffered writing

---
 annif/corpus/convert.py | 41 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/annif/corpus/convert.py b/annif/corpus/convert.py
index f8e17a611..0ceb0ff50 100644
--- a/annif/corpus/convert.py
+++ b/annif/corpus/convert.py
@@ -6,11 +6,41 @@
 from .types import Document, DocumentCorpus, SubjectCorpus
 
 
+class SubjectWriter:
+    """Writes a single subject file into a SubjectDirectory, performing
+    buffering to limit the number of I/O operations."""
+
+    _buffer = None
+
+    BUFFER_SIZE = 100
+
+    def __init__(self, path, uri, label):
+        self._path = path
+        self._buffer = []
+        with open(path, 'w', encoding='utf-8') as subjfile:
+            print("{} {}".format(uri, label), file=subjfile)
+
+    def _flush(self):
+        with open(self._path, 'a', encoding='utf-8') as subjfile:
+            for text in self._buffer:
+                print(text, file=subjfile)
+        self._buffer = []
+
+    def write(self, text):
+        self._buffer.append(text)
+        if len(self._buffer) >= self.BUFFER_SIZE:
+            self._flush()
+
+    def close(self):
+        self._flush()
+
+
 class DocumentToSubjectCorpusMixin(SubjectCorpus):
     """Mixin class for enabling a DocumentCorpus to act as a SubjectCorpus"""
 
     _subject_corpus = None
     _temp_directory = None
+    _subject_writer = None
 
     @property
     def subjects(self):
@@ -24,16 +54,14 @@ def _subject_filename(self, subject_id):
 
     def _create_subject(self, subject_id, uri, label):
         filename = self._subject_filename(subject_id)
-        with open(filename, 'w', encoding='utf-8') as subjfile:
-            print("{} {}".format(uri, label), file=subjfile)
+        self._subject_writer[subject_id] = SubjectWriter(filename, uri, label)
 
     def _add_text_to_subject(self, subject_id, text):
-        filename = self._subject_filename(subject_id)
-        with open(filename, 'a', encoding='utf-8') as subjfile:
-            print(text, file=subjfile)
+        self._subject_writer[subject_id].write(text)
 
     def _generate_corpus_from_documents(self):
         self._temp_directory = tempfile.TemporaryDirectory()
+        self._subject_writer = {}
 
         for subject_id, subject_info in enumerate(self._subject_index):
             uri, label = subject_info
@@ -46,6 +74,9 @@ def _generate_corpus_from_documents(self):
                 continue
             self._add_text_to_subject(subject_id, doc.text)
 
+        for subject_id, _ in enumerate(self._subject_index):
+            self._subject_writer[subject_id].close()
+
         from .subject import SubjectDirectory
         self._subject_corpus = SubjectDirectory(self._temp_directory.name)
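
The heart of patch 1 is a classic write-buffering pattern: instead of one open/append/close cycle per document, SubjectWriter collects up to BUFFER_SIZE lines in memory and writes them out in a single append. A minimal standalone sketch of the same idea follows; the class name and file path are illustrative, not part of the patch:

    class BufferedLineWriter:
        """Append lines to a file in batches, so that N writes cost
        roughly N / BUFFER_SIZE file opens instead of N."""

        BUFFER_SIZE = 100

        def __init__(self, path):
            self._path = path
            self._buffer = []

        def _flush(self):
            # one open/close cycle per BUFFER_SIZE buffered lines
            with open(self._path, 'a', encoding='utf-8') as outfile:
                for line in self._buffer:
                    print(line, file=outfile)
            self._buffer = []

        def write(self, line):
            self._buffer.append(line)
            if len(self._buffer) >= self.BUFFER_SIZE:
                self._flush()

        def close(self):
            # flush whatever remains, like SubjectWriter.close() above
            self._flush()

    writer = BufferedLineWriter('example-subject.txt')  # illustrative path
    for idx in range(250):
        writer.write('document text {}'.format(idx))
    writer.close()  # 250 writes end up as 3 file opens instead of 250

A writer like this must always be closed, which is why the patch adds the final close() loop to _generate_corpus_from_documents().
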
From c01c88d3d51cccb65a73c91eee13cc17d8670153 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Thu, 3 Oct 2019 14:23:21 +0300
Subject: [PATCH 2/9] Also delay file creation when buffering corpus
 conversion

---
 annif/corpus/convert.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/annif/corpus/convert.py b/annif/corpus/convert.py
index 0ceb0ff50..e3a1fe77a 100644
--- a/annif/corpus/convert.py
+++ b/annif/corpus/convert.py
@@ -16,15 +16,20 @@ class SubjectWriter:
 
     def __init__(self, path, uri, label):
         self._path = path
-        self._buffer = []
-        with open(path, 'w', encoding='utf-8') as subjfile:
-            print("{} {}".format(uri, label), file=subjfile)
+        self._buffer = ["{} {}".format(uri, label)]
+        self._created = False
 
     def _flush(self):
-        with open(self._path, 'a', encoding='utf-8') as subjfile:
+        if self._created:
+            mode = 'a'
+        else:
+            mode = 'w'
+
+        with open(self._path, mode, encoding='utf-8') as subjfile:
             for text in self._buffer:
                 print(text, file=subjfile)
         self._buffer = []
+        self._created = True
 
     def write(self, text):
         self._buffer.append(text)

From 466dd5f7a82fc798df488eec3e607e423342b4f9 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Thu, 3 Oct 2019 17:50:44 +0300
Subject: [PATCH 3/9] pre-transform document corpus to subject corpus before
 vectorizing

- for reasons I don't quite understand this brings a small performance boost
---
 annif/project.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/annif/project.py b/annif/project.py
index 34af903dd..b4d101810 100644
--- a/annif/project.py
+++ b/annif/project.py
@@ -196,10 +196,12 @@ def _create_vectorizer(self, subjectcorpus):
         if subjectcorpus.is_empty():
             raise NotSupportedException(
                 'using TfidfVectorizer with no documents')
+        logger.info('transforming subject corpus')
+        subjects = subjectcorpus.subjects
         logger.info('creating vectorizer')
         self._vectorizer = TfidfVectorizer(
             tokenizer=self.analyzer.tokenize_words)
-        self._vectorizer.fit((subj.text for subj in subjectcorpus.subjects))
+        self._vectorizer.fit((subj.text for subj in subjects))
         annif.util.atomic_save(
             self._vectorizer,
             self.datadir,

From ea6dbebec1f9202ccc608fa54e566094f183ecb9 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Thu, 3 Oct 2019 22:17:28 +0300
Subject: [PATCH 4/9] add a small cache to Analyzer.is_valid_token() to
 improve performance

---
 annif/analyzer/analyzer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/annif/analyzer/analyzer.py b/annif/analyzer/analyzer.py
index 4f1f58b42..1fb5df41a 100644
--- a/annif/analyzer/analyzer.py
+++ b/annif/analyzer/analyzer.py
@@ -1,6 +1,7 @@
 """Common functionality for analyzers."""
 
 import abc
+import functools
 import unicodedata
 import nltk.tokenize
 
@@ -17,6 +18,7 @@ def tokenize_sentences(self, text):
         """Tokenize a piece of text (e.g. a document) into sentences."""
         return nltk.tokenize.sent_tokenize(text)
 
+    @functools.lru_cache(maxsize=50000)
     def is_valid_token(self, word):
         """Return True if the word is an acceptable token."""
         if len(word) < self.TOKEN_MIN_LENGTH:
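
Patch 4 relies on functools.lru_cache from the standard library: the decorated method computes its result once per distinct argument and answers repeats from a hash-keyed cache, which pays off because token validity is checked for every word occurrence in the corpus. A minimal sketch with a simplified validity check and an illustrative word list (the real method also inspects Unicode character categories):

    import functools

    class DemoAnalyzer:
        # simplified stand-in for annif.analyzer.analyzer.Analyzer
        TOKEN_MIN_LENGTH = 3

        @functools.lru_cache(maxsize=50000)
        def is_valid_token(self, word):
            # executed once per distinct word; repeats are cache hits
            return len(word) >= self.TOKEN_MIN_LENGTH and word.isalpha()

    analyzer = DemoAnalyzer()
    words = ['semantic', 'indexing', 'of'] * 1000
    valid = [word for word in words if analyzer.is_valid_token(word)]
    print(analyzer.is_valid_token.cache_info())
    # CacheInfo(hits=2997, misses=3, maxsize=50000, currsize=3)

One caveat worth knowing: on a method, the cache key includes self, so the cache holds a reference to each analyzer instance. For the small number of long-lived analyzers in Annif this is harmless.
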
From 65033fa6d393dcee56b469bee02e293d33886091 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Fri, 4 Oct 2019 13:10:59 +0300
Subject: [PATCH 5/9] Move subject vectorizer handling inside tfidf backend,
 as no other backend needs it and it is unlikely other backends will need it
 in the future

---
 annif/backend/backend.py |  1 -
 annif/backend/tfidf.py   | 39 +++++++++++++++++++++++++++++-----
 annif/project.py         | 46 ----------------------------------------
 tests/test_project.py    |  2 +-
 4 files changed, 35 insertions(+), 53 deletions(-)

diff --git a/annif/backend/backend.py b/annif/backend/backend.py
index 93dbc0ead..ea5946e2f 100644
--- a/annif/backend/backend.py
+++ b/annif/backend/backend.py
@@ -10,7 +10,6 @@ class AnnifBackend(metaclass=abc.ABCMeta):
 
     name = None
     needs_subject_index = False
-    needs_subject_vectorizer = False
 
     DEFAULT_PARAMS = {'limit': 100}
 
diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py
index 8e7f1dc5e..8054ee30a 100644
--- a/annif/backend/tfidf.py
+++ b/annif/backend/tfidf.py
@@ -2,11 +2,13 @@
    TF-IDF normalized bag-of-words vector space"""
 
 import os.path
+import joblib
 import gensim.similarities
 from gensim.matutils import Sparse2Corpus
+from sklearn.feature_extraction.text import TfidfVectorizer
 import annif.util
 from annif.suggestion import VectorSuggestionResult
-from annif.exception import NotInitializedException
+from annif.exception import NotInitializedException, NotSupportedException
 from . import backend
 
 
@@ -14,14 +16,40 @@
 class TFIDFBackend(backend.AnnifBackend):
     """TF-IDF vector space similarity based backend for Annif"""
     name = "tfidf"
     needs_subject_index = True
-    needs_subject_vectorizer = True
 
     # defaults for uninitialized instances
+    _vectorizer = None
     _index = None
 
+    VECTORIZER_FILE = 'vectorizer'
     INDEX_FILE = 'tfidf-index'
 
+    def _create_vectorizer(self, corpus, project):
+        if corpus.is_empty():
+            raise NotSupportedException(
+                'Cannot train tfidf project with no documents')
+        self.info('transforming subject corpus')
+        subjects = corpus.subjects
+        self.info('creating vectorizer')
+        self._vectorizer = TfidfVectorizer(
+            tokenizer=project.analyzer.tokenize_words)
+        self._vectorizer.fit((subj.text for subj in subjects))
+        annif.util.atomic_save(
+            self._vectorizer,
+            self.datadir,
+            'vectorizer',
+            method=joblib.dump)
+
     def initialize(self):
+        if self._vectorizer is None:
+            path = os.path.join(self.datadir, self.VECTORIZER_FILE)
+            if os.path.exists(path):
+                self.debug('loading vectorizer from {}'.format(path))
+                self._vectorizer = joblib.load(path)
+            else:
+                raise NotInitializedException(
+                    "vectorizer file '{}' not found".format(path),
+                    backend_id=self.backend_id)
         if self._index is None:
             path = os.path.join(self.datadir, self.INDEX_FILE)
             self.debug('loading similarity index from {}'.format(path))
@@ -34,12 +62,13 @@ def initialize(self):
                 backend_id=self.backend_id)
 
     def train(self, corpus, project):
+        self._create_vectorizer(corpus, project)
         self.info('creating similarity index')
-        veccorpus = project.vectorizer.transform(
+        veccorpus = self._vectorizer.transform(
             (subj.text for subj in corpus.subjects))
         gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
         self._index = gensim.similarities.SparseMatrixSimilarity(
-            gscorpus, num_features=len(project.vectorizer.vocabulary_))
+            gscorpus, num_features=len(self._vectorizer.vocabulary_))
         annif.util.atomic_save(
             self._index,
             self.datadir,
@@ -48,7 +77,7 @@ def train(self, corpus, project):
     def _suggest(self, text, project, params):
         self.debug('Suggesting subjects for text "{}..." (len={})'.format(
             text[:20], len(text)))
-        vectors = project.vectorizer.transform([text])
+        vectors = self._vectorizer.transform([text])
         docsim = self._index[vectors[0]]
         fullresult = VectorSuggestionResult(docsim, project.subjects)
         return fullresult.filter(limit=int(self.params['limit']))
diff --git a/annif/project.py b/annif/project.py
index 3837f5ade..741ea1e9f 100644
--- a/annif/project.py
+++ b/annif/project.py
@@ -4,8 +4,6 @@
 import configparser
 import enum
 import os.path
-import joblib
-from sklearn.feature_extraction.text import TfidfVectorizer
 from flask import current_app
 from shutil import rmtree
 import annif
@@ -36,7 +34,6 @@ class AnnifProject(DatadirMixin):
     _analyzer = None
     _backend = None
     _vocab = None
-    _vectorizer = None
     initialized = False
 
     # default values for configuration settings
@@ -80,15 +77,6 @@ def _initialize_subjects(self):
         except AnnifException as err:
             logger.warning(err.format_message())
 
-    def _initialize_vectorizer(self):
-        try:
-            vectorizer = self.vectorizer
-            logger.debug("Project '%s': initialized vectorizer: %s",
-                         self.project_id,
-                         str(vectorizer))
-        except AnnifException as err:
-            logger.warning(err.format_message())
-
     def _initialize_backend(self):
         logger.debug("Project '%s': initializing backend", self.project_id)
         try:
@@ -107,7 +95,6 @@ def initialize(self):
 
         self._initialize_analyzer()
         self._initialize_subjects()
-        self._initialize_vectorizer()
         self._initialize_backend()
 
         self.initialized = True
@@ -167,19 +154,6 @@ def vocab(self):
     def subjects(self):
         return self.vocab.subjects
 
-    @property
-    def vectorizer(self):
-        if self._vectorizer is None:
-            path = os.path.join(self.datadir, 'vectorizer')
-            if os.path.exists(path):
-                logger.debug('loading vectorizer from %s', path)
-                self._vectorizer = joblib.load(path)
-            else:
-                raise NotInitializedException(
-                    "vectorizer file '{}' not found".format(path),
-                    project_id=self.project_id)
-        return self._vectorizer
-
     def suggest(self, text, backend_params=None):
         """Suggest subjects the given text by passing it to the backend.
         Returns a list of SubjectSuggestion objects ordered by decreasing score."""
@@ -190,30 +164,10 @@ def suggest(self, text, backend_params=None):
         logger.debug('%d hits from backend', len(hits))
         return hits
 
-    def _create_vectorizer(self, subjectcorpus):
-        if not self.backend.needs_subject_vectorizer:
-            logger.debug('not creating vectorizer: not needed by backend')
-            return
-        if subjectcorpus.is_empty():
-            raise NotSupportedException(
-                'using TfidfVectorizer with no documents')
-        logger.info('transforming subject corpus')
-        subjects = subjectcorpus.subjects
-        logger.info('creating vectorizer')
-        self._vectorizer = TfidfVectorizer(
-            tokenizer=self.analyzer.tokenize_words)
-        self._vectorizer.fit((subj.text for subj in subjects))
-        annif.util.atomic_save(
-            self._vectorizer,
-            self.datadir,
-            'vectorizer',
-            method=joblib.dump)
-
     def train(self, corpus):
         """train the project using documents from a metadata source"""
         corpus.set_subject_index(self.subjects)
-        self._create_vectorizer(corpus)
         self.backend.train(corpus, project=self)
 
     def learn(self, corpus):
diff --git a/tests/test_project.py b/tests/test_project.py
index a6604c219..7c1a291bb 100644
--- a/tests/test_project.py
+++ b/tests/test_project.py
@@ -152,7 +152,7 @@ def test_project_train_tfidf_nodocuments(app, tmpdir):
     empty_document_corpus = annif.corpus.DocumentFile(str(empty_file))
     with pytest.raises(NotSupportedException) as excinfo:
         project.train(empty_document_corpus)
-    assert 'using TfidfVectorizer with no documents' in str(excinfo.value)
+    assert 'Cannot train tfidf project with no documents' in str(excinfo.value)
 
 
 def test_project_learn(app, tmpdir):
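
With patch 5 the tfidf backend owns its vectorizer end to end: train() fits and saves it, and initialize() reloads it in a fresh process. The persistence layer is plain joblib; here is a sketch of that round trip under an illustrative path and a two-document stand-in corpus (the patch itself goes through annif.util.atomic_save, which additionally writes to a temporary name and renames it into place):

    import os
    import joblib
    from sklearn.feature_extraction.text import TfidfVectorizer

    datadir = '/tmp/annif-tfidf-example'  # illustrative data directory
    os.makedirs(datadir, exist_ok=True)
    path = os.path.join(datadir, 'vectorizer')

    # train time: fit on the subject texts, then persist the fitted model
    vectorizer = TfidfVectorizer()
    vectorizer.fit(['cats and dogs', 'dogs and wolves'])  # stand-in corpus
    joblib.dump(vectorizer, path)

    # initialize time: a fresh process reloads the fitted model from disk
    loaded = joblib.load(path)
    print(loaded.transform(['cats and wolves']).shape)
    # (1, 4): one row, one column per vocabulary term learned at fit time
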
From b81f0c7531a25ba82826a25e4d5c5f9932ab23fe Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Fri, 4 Oct 2019 13:15:12 +0300
Subject: [PATCH 6/9] Use TfidfVectorizer.fit_transform as it is more
 efficient than separate fit and transform steps

---
 annif/backend/tfidf.py | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py
index 8054ee30a..f3212756c 100644
--- a/annif/backend/tfidf.py
+++ b/annif/backend/tfidf.py
@@ -24,22 +24,6 @@ class TFIDFBackend(backend.AnnifBackend):
     VECTORIZER_FILE = 'vectorizer'
     INDEX_FILE = 'tfidf-index'
 
-    def _create_vectorizer(self, corpus, project):
-        if corpus.is_empty():
-            raise NotSupportedException(
-                'Cannot train tfidf project with no documents')
-        self.info('transforming subject corpus')
-        subjects = corpus.subjects
-        self.info('creating vectorizer')
-        self._vectorizer = TfidfVectorizer(
-            tokenizer=project.analyzer.tokenize_words)
-        self._vectorizer.fit((subj.text for subj in subjects))
-        annif.util.atomic_save(
-            self._vectorizer,
-            self.datadir,
-            'vectorizer',
-            method=joblib.dump)
-
     def initialize(self):
         if self._vectorizer is None:
             path = os.path.join(self.datadir, self.VECTORIZER_FILE)
@@ -62,10 +46,22 @@ def initialize(self):
                 backend_id=self.backend_id)
 
     def train(self, corpus, project):
-        self._create_vectorizer(corpus, project)
+        if corpus.is_empty():
+            raise NotSupportedException(
+                'Cannot train tfidf project with no documents')
+        self.info('transforming subject corpus')
+        subjects = corpus.subjects
+        self.info('creating vectorizer')
+        self._vectorizer = TfidfVectorizer(
+            tokenizer=project.analyzer.tokenize_words)
+        veccorpus = self._vectorizer.fit_transform(
+            (subj.text for subj in subjects))
+        annif.util.atomic_save(
+            self._vectorizer,
+            self.datadir,
+            'vectorizer',
+            method=joblib.dump)
         self.info('creating similarity index')
-        veccorpus = self._vectorizer.transform(
-            (subj.text for subj in corpus.subjects))
         gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
         self._index = gensim.similarities.SparseMatrixSimilarity(
             gscorpus, num_features=len(self._vectorizer.vocabulary_))
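
The efficiency claim in patch 6 has two parts: fit_transform tokenizes each document only once instead of twice, and, because the input here is a generator expression over corpus.subjects, the old fit-then-transform sequence forced two full passes over the subject files. A sketch of the equivalence using a stand-in list corpus:

    from sklearn.feature_extraction.text import TfidfVectorizer

    texts = ['first subject text', 'second subject text']  # stand-in corpus

    # two passes: every document is tokenized and counted twice
    vec_a = TfidfVectorizer()
    vec_a.fit(texts)
    matrix_a = vec_a.transform(texts)

    # one pass: fit_transform builds the vocabulary and the matrix together
    vec_b = TfidfVectorizer()
    matrix_b = vec_b.fit_transform(texts)

    assert (matrix_a != matrix_b).nnz == 0  # identical result, one pass not two
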
From 6f49b21b8cb92523df49a1d9f3e5b2f768cdf51c Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Fri, 4 Oct 2019 14:02:15 +0300
Subject: [PATCH 7/9] Split up TFIDFBackend.initialize

---
 annif/backend/tfidf.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py
index f3212756c..6c95da1a7 100644
--- a/annif/backend/tfidf.py
+++ b/annif/backend/tfidf.py
@@ -24,7 +24,7 @@ class TFIDFBackend(backend.AnnifBackend):
     VECTORIZER_FILE = 'vectorizer'
     INDEX_FILE = 'tfidf-index'
 
-    def initialize(self):
+    def _initialize_vectorizer(self):
         if self._vectorizer is None:
             path = os.path.join(self.datadir, self.VECTORIZER_FILE)
             if os.path.exists(path):
@@ -34,6 +34,8 @@ def initialize(self):
                 raise NotInitializedException(
                     "vectorizer file '{}' not found".format(path),
                     backend_id=self.backend_id)
+
+    def _initialize_index(self):
         if self._index is None:
             path = os.path.join(self.datadir, self.INDEX_FILE)
             self.debug('loading similarity index from {}'.format(path))
@@ -45,6 +47,10 @@ def initialize(self):
                 'similarity index {} not found'.format(path),
                 backend_id=self.backend_id)
 
+    def initialize(self):
+        self._initialize_vectorizer()
+        self._initialize_index()
+
     def train(self, corpus, project):
         if corpus.is_empty():
             raise NotSupportedException(

From b6dbbcdcebbd8c0bb38e53f6709d05fb3aa15565 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Fri, 4 Oct 2019 14:02:55 +0300
Subject: [PATCH 8/9] use named constant instead of hardcoded value

---
 annif/backend/tfidf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py
index 6c95da1a7..b753174a9 100644
--- a/annif/backend/tfidf.py
+++ b/annif/backend/tfidf.py
@@ -65,7 +65,7 @@ def train(self, corpus, project):
         annif.util.atomic_save(
             self._vectorizer,
             self.datadir,
-            'vectorizer',
+            self.VECTORIZER_FILE,
             method=joblib.dump)
         self.info('creating similarity index')
         gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
From 0c6ee3be4dda6560d2e0388ba4821f15aa4feb22 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Fri, 4 Oct 2019 14:05:31 +0300
Subject: [PATCH 9/9] Split up TFIDFBackend.train

---
 annif/backend/tfidf.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py
index b753174a9..ec5d6e6d0 100644
--- a/annif/backend/tfidf.py
+++ b/annif/backend/tfidf.py
@@ -51,6 +51,16 @@ def initialize(self):
         self._initialize_vectorizer()
         self._initialize_index()
 
+    def _create_index(self, veccorpus):
+        self.info('creating similarity index')
+        gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
+        self._index = gensim.similarities.SparseMatrixSimilarity(
+            gscorpus, num_features=len(self._vectorizer.vocabulary_))
+        annif.util.atomic_save(
+            self._index,
+            self.datadir,
+            self.INDEX_FILE)
+
     def train(self, corpus, project):
         if corpus.is_empty():
             raise NotSupportedException(
@@ -67,14 +77,7 @@ def train(self, corpus, project):
             self.datadir,
             self.VECTORIZER_FILE,
             method=joblib.dump)
-        self.info('creating similarity index')
-        gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
-        self._index = gensim.similarities.SparseMatrixSimilarity(
-            gscorpus, num_features=len(self._vectorizer.vocabulary_))
-        annif.util.atomic_save(
-            self._index,
-            self.datadir,
-            self.INDEX_FILE)
+        self._create_index(veccorpus)
 
     def _suggest(self, text, project, params):
         self.debug('Suggesting subjects for text "{}..." (len={})'.format(
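
After the whole series, TFIDFBackend.train reduces to two clear steps: fit and save the vectorizer (patches 5, 6 and 8), then build and save the gensim similarity index (_create_index, this patch). A standalone sketch of that pipeline with a toy three-subject corpus and sklearn's default tokenizer (the real backend tokenizes with the project analyzer and persists both artifacts to its data directory):

    import gensim.similarities
    from gensim.matutils import Sparse2Corpus
    from sklearn.feature_extraction.text import TfidfVectorizer

    subject_texts = ['libraries and archives',     # stand-in corpus,
                     'machine learning methods',   # one text per subject
                     'library metadata standards']

    vectorizer = TfidfVectorizer()
    veccorpus = vectorizer.fit_transform(subject_texts)

    # tell gensim that documents are the rows, not the columns, of the
    # scipy sparse matrix produced by scikit-learn
    gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
    index = gensim.similarities.SparseMatrixSimilarity(
        gscorpus, num_features=len(vectorizer.vocabulary_))

    # suggest time: vectorize the query text and rank all subjects by
    # cosine similarity against the index
    query = vectorizer.transform(['open library metadata'])
    print(index[query[0]])  # one similarity score per subject
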