diff --git a/annif/analyzer/analyzer.py b/annif/analyzer/analyzer.py
index 4f1f58b42..1fb5df41a 100644
--- a/annif/analyzer/analyzer.py
+++ b/annif/analyzer/analyzer.py
@@ -1,6 +1,7 @@
 """Common functionality for analyzers."""
 
 import abc
+import functools
 import unicodedata
 import nltk.tokenize
 
@@ -17,6 +18,7 @@ def tokenize_sentences(self, text):
         """Tokenize a piece of text (e.g. a document) into sentences."""
         return nltk.tokenize.sent_tokenize(text)
 
+    @functools.lru_cache(maxsize=50000)
     def is_valid_token(self, word):
         """Return True if the word is an acceptable token."""
         if len(word) < self.TOKEN_MIN_LENGTH:
diff --git a/annif/backend/backend.py b/annif/backend/backend.py
index 93dbc0ead..ea5946e2f 100644
--- a/annif/backend/backend.py
+++ b/annif/backend/backend.py
@@ -10,7 +10,6 @@ class AnnifBackend(metaclass=abc.ABCMeta):
 
     name = None
     needs_subject_index = False
-    needs_subject_vectorizer = False
 
     DEFAULT_PARAMS = {'limit': 100}
 
diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py
index 8e7f1dc5e..ec5d6e6d0 100644
--- a/annif/backend/tfidf.py
+++ b/annif/backend/tfidf.py
@@ -2,11 +2,13 @@
 TF-IDF normalized bag-of-words vector space"""
 
 import os.path
+import joblib
 import gensim.similarities
 from gensim.matutils import Sparse2Corpus
+from sklearn.feature_extraction.text import TfidfVectorizer
 import annif.util
 from annif.suggestion import VectorSuggestionResult
-from annif.exception import NotInitializedException
+from annif.exception import NotInitializedException, NotSupportedException
 from . import backend
 
 
@@ -14,14 +16,26 @@ class TFIDFBackend(backend.AnnifBackend):
     """TF-IDF vector space similarity based backend for Annif"""
     name = "tfidf"
     needs_subject_index = True
-    needs_subject_vectorizer = True
 
     # defaults for uninitialized instances
+    _vectorizer = None
     _index = None
 
+    VECTORIZER_FILE = 'vectorizer'
     INDEX_FILE = 'tfidf-index'
 
-    def initialize(self):
+    def _initialize_vectorizer(self):
+        if self._vectorizer is None:
+            path = os.path.join(self.datadir, self.VECTORIZER_FILE)
+            if os.path.exists(path):
+                self.debug('loading vectorizer from {}'.format(path))
+                self._vectorizer = joblib.load(path)
+            else:
+                raise NotInitializedException(
+                    "vectorizer file '{}' not found".format(path),
+                    backend_id=self.backend_id)
+
+    def _initialize_index(self):
         if self._index is None:
             path = os.path.join(self.datadir, self.INDEX_FILE)
             self.debug('loading similarity index from {}'.format(path))
@@ -33,22 +47,42 @@ def initialize(self):
                     'similarity index {} not found'.format(path),
                     backend_id=self.backend_id)
 
-    def train(self, corpus, project):
+    def initialize(self):
+        self._initialize_vectorizer()
+        self._initialize_index()
+
+    def _create_index(self, veccorpus):
         self.info('creating similarity index')
-        veccorpus = project.vectorizer.transform(
-            (subj.text for subj in corpus.subjects))
         gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
         self._index = gensim.similarities.SparseMatrixSimilarity(
-            gscorpus, num_features=len(project.vectorizer.vocabulary_))
+            gscorpus, num_features=len(self._vectorizer.vocabulary_))
         annif.util.atomic_save(
             self._index,
             self.datadir,
             self.INDEX_FILE)
 
+    def train(self, corpus, project):
+        if corpus.is_empty():
+            raise NotSupportedException(
+                'Cannot train tfidf project with no documents')
+        self.info('transforming subject corpus')
+        subjects = corpus.subjects
+        self.info('creating vectorizer')
+        self._vectorizer = TfidfVectorizer(
+            tokenizer=project.analyzer.tokenize_words)
+        veccorpus = self._vectorizer.fit_transform(
+            (subj.text for subj in subjects))
+        annif.util.atomic_save(
+            self._vectorizer,
+            self.datadir,
+            self.VECTORIZER_FILE,
+            method=joblib.dump)
+        self._create_index(veccorpus)
+
     def _suggest(self, text, project, params):
         self.debug('Suggesting subjects for text "{}..." (len={})'.format(
             text[:20], len(text)))
-        vectors = project.vectorizer.transform([text])
+        vectors = self._vectorizer.transform([text])
         docsim = self._index[vectors[0]]
         fullresult = VectorSuggestionResult(docsim, project.subjects)
         return fullresult.filter(limit=int(self.params['limit']))
diff --git a/annif/corpus/convert.py b/annif/corpus/convert.py
index f8e17a611..e3a1fe77a 100644
--- a/annif/corpus/convert.py
+++ b/annif/corpus/convert.py
@@ -6,11 +6,46 @@
 from .types import Document, DocumentCorpus, SubjectCorpus
 
 
+class SubjectWriter:
+    """Writes a single subject file into a SubjectDirectory, performing
+    buffering to limit the number of I/O operations."""
+
+    _buffer = None
+
+    BUFFER_SIZE = 100
+
+    def __init__(self, path, uri, label):
+        self._path = path
+        self._buffer = ["{} {}".format(uri, label)]
+        self._created = False
+
+    def _flush(self):
+        if self._created:
+            mode = 'a'
+        else:
+            mode = 'w'
+
+        with open(self._path, mode, encoding='utf-8') as subjfile:
+            for text in self._buffer:
+                print(text, file=subjfile)
+        self._buffer = []
+        self._created = True
+
+    def write(self, text):
+        self._buffer.append(text)
+        if len(self._buffer) >= self.BUFFER_SIZE:
+            self._flush()
+
+    def close(self):
+        self._flush()
+
+
 class DocumentToSubjectCorpusMixin(SubjectCorpus):
     """Mixin class for enabling a DocumentCorpus to act as a SubjectCorpus"""
 
     _subject_corpus = None
     _temp_directory = None
+    _subject_writer = None
 
     @property
     def subjects(self):
@@ -24,16 +59,14 @@ def _subject_filename(self, subject_id):
 
     def _create_subject(self, subject_id, uri, label):
         filename = self._subject_filename(subject_id)
-        with open(filename, 'w', encoding='utf-8') as subjfile:
-            print("{} {}".format(uri, label), file=subjfile)
+        self._subject_writer[subject_id] = SubjectWriter(filename, uri, label)
 
     def _add_text_to_subject(self, subject_id, text):
-        filename = self._subject_filename(subject_id)
-        with open(filename, 'a', encoding='utf-8') as subjfile:
-            print(text, file=subjfile)
+        self._subject_writer[subject_id].write(text)
 
     def _generate_corpus_from_documents(self):
         self._temp_directory = tempfile.TemporaryDirectory()
+        self._subject_writer = {}
 
         for subject_id, subject_info in enumerate(self._subject_index):
             uri, label = subject_info
@@ -46,6 +79,9 @@ def _generate_corpus_from_documents(self):
                     continue
                 self._add_text_to_subject(subject_id, doc.text)
 
+        for subject_id, _ in enumerate(self._subject_index):
+            self._subject_writer[subject_id].close()
+
         from .subject import SubjectDirectory
         self._subject_corpus = SubjectDirectory(self._temp_directory.name)
 
diff --git a/annif/project.py b/annif/project.py
index 7b9b5a909..741ea1e9f 100644
--- a/annif/project.py
+++ b/annif/project.py
@@ -4,8 +4,6 @@
 import configparser
 import enum
 import os.path
-import joblib
-from sklearn.feature_extraction.text import TfidfVectorizer
 from flask import current_app
 from shutil import rmtree
 import annif
@@ -36,7 +34,6 @@ class AnnifProject(DatadirMixin):
     _analyzer = None
     _backend = None
     _vocab = None
-    _vectorizer = None
     initialized = False
 
     # default values for configuration settings
@@ -80,15 +77,6 @@ def _initialize_subjects(self):
         except AnnifException as err:
             logger.warning(err.format_message())
 
-    def _initialize_vectorizer(self):
-        try:
-            vectorizer = self.vectorizer
-            logger.debug("Project '%s': initialized vectorizer: %s",
-                         self.project_id,
-                         str(vectorizer))
-        except AnnifException as err:
-            logger.warning(err.format_message())
-
     def _initialize_backend(self):
         logger.debug("Project '%s': initializing backend", self.project_id)
         try:
@@ -107,7 +95,6 @@ def initialize(self):
 
         self._initialize_analyzer()
         self._initialize_subjects()
-        self._initialize_vectorizer()
         self._initialize_backend()
 
         self.initialized = True
@@ -167,19 +154,6 @@ def vocab(self):
     def subjects(self):
         return self.vocab.subjects
 
-    @property
-    def vectorizer(self):
-        if self._vectorizer is None:
-            path = os.path.join(self.datadir, 'vectorizer')
-            if os.path.exists(path):
-                logger.debug('loading vectorizer from %s', path)
-                self._vectorizer = joblib.load(path)
-            else:
-                raise NotInitializedException(
-                    "vectorizer file '{}' not found".format(path),
-                    project_id=self.project_id)
-        return self._vectorizer
-
     def suggest(self, text, backend_params=None):
         """Suggest subjects the given text by passing it to the backend.
         Returns a list of SubjectSuggestion objects ordered by decreasing score."""
@@ -190,28 +164,10 @@ def suggest(self, text, backend_params=None):
         logger.debug('%d hits from backend', len(hits))
         return hits
 
-    def _create_vectorizer(self, subjectcorpus):
-        if not self.backend.needs_subject_vectorizer:
-            logger.debug('not creating vectorizer: not needed by backend')
-            return
-        if subjectcorpus.is_empty():
-            raise NotSupportedException(
-                'using TfidfVectorizer with no documents')
-        logger.info('creating vectorizer')
-        self._vectorizer = TfidfVectorizer(
-            tokenizer=self.analyzer.tokenize_words)
-        self._vectorizer.fit((subj.text for subj in subjectcorpus.subjects))
-        annif.util.atomic_save(
-            self._vectorizer,
-            self.datadir,
-            'vectorizer',
-            method=joblib.dump)
-
     def train(self, corpus):
         """train the project using documents from a metadata source"""
 
         corpus.set_subject_index(self.subjects)
-        self._create_vectorizer(corpus)
         self.backend.train(corpus, project=self)
 
     def learn(self, corpus):
diff --git a/tests/test_project.py b/tests/test_project.py
index a6604c219..7c1a291bb 100644
--- a/tests/test_project.py
+++ b/tests/test_project.py
@@ -152,7 +152,7 @@ def test_project_train_tfidf_nodocuments(app, tmpdir):
     empty_document_corpus = annif.corpus.DocumentFile(str(empty_file))
    with pytest.raises(NotSupportedException) as excinfo:
         project.train(empty_document_corpus)
-    assert 'using TfidfVectorizer with no documents' in str(excinfo.value)
+    assert 'Cannot train tfidf project with no documents' in str(excinfo.value)
 
 
 def test_project_learn(app, tmpdir):