
Optimizations to tfidf backend training #335

Merged · 10 commits · Oct 4, 2019
2 changes: 2 additions & 0 deletions annif/analyzer/analyzer.py
@@ -1,6 +1,7 @@
"""Common functionality for analyzers."""

import abc
import functools
import unicodedata
import nltk.tokenize

@@ -17,6 +18,7 @@ def tokenize_sentences(self, text):
"""Tokenize a piece of text (e.g. a document) into sentences."""
return nltk.tokenize.sent_tokenize(text)

@functools.lru_cache(maxsize=50000)
def is_valid_token(self, word):
"""Return True if the word is an acceptable token."""
if len(word) < self.TOKEN_MIN_LENGTH:
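Note on the analyzer change: `functools.lru_cache` memoizes `is_valid_token()`, so a word that recurs across the corpus is validated only once. A minimal sketch of the pattern; the `SimpleAnalyzer` class and its checks below are illustrative, not Annif's actual implementation:

```python
import functools


class SimpleAnalyzer:
    TOKEN_MIN_LENGTH = 3

    @functools.lru_cache(maxsize=50000)
    def is_valid_token(self, word):
        # cached per (analyzer instance, word) pair, so a word repeated
        # across the corpus is only validated once
        return len(word) >= self.TOKEN_MIN_LENGTH and word.isalpha()


analyzer = SimpleAnalyzer()
print(analyzer.is_valid_token("cats"))  # computed
print(analyzer.is_valid_token("cats"))  # served from the cache
```

One trade-off worth noting: because the decorated method takes `self`, the cache key includes the analyzer instance, so the cache keeps instances alive for the lifetime of the process; with a handful of long-lived analyzers this is harmless.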
1 change: 0 additions & 1 deletion annif/backend/backend.py
@@ -10,7 +10,6 @@ class AnnifBackend(metaclass=abc.ABCMeta):

name = None
needs_subject_index = False
needs_subject_vectorizer = False

DEFAULT_PARAMS = {'limit': 100}

50 changes: 42 additions & 8 deletions annif/backend/tfidf.py
@@ -2,26 +2,40 @@
TF-IDF normalized bag-of-words vector space"""

import os.path
import joblib
import gensim.similarities
from gensim.matutils import Sparse2Corpus
from sklearn.feature_extraction.text import TfidfVectorizer
import annif.util
from annif.suggestion import VectorSuggestionResult
from annif.exception import NotInitializedException
from annif.exception import NotInitializedException, NotSupportedException
from . import backend


class TFIDFBackend(backend.AnnifBackend):
"""TF-IDF vector space similarity based backend for Annif"""
name = "tfidf"
needs_subject_index = True
needs_subject_vectorizer = True

# defaults for uninitialized instances
_vectorizer = None
_index = None

VECTORIZER_FILE = 'vectorizer'
INDEX_FILE = 'tfidf-index'

def initialize(self):
def _initialize_vectorizer(self):
if self._vectorizer is None:
path = os.path.join(self.datadir, self.VECTORIZER_FILE)
if os.path.exists(path):
self.debug('loading vectorizer from {}'.format(path))
self._vectorizer = joblib.load(path)
else:
raise NotInitializedException(
"vectorizer file '{}' not found".format(path),
backend_id=self.backend_id)

def _initialize_index(self):
if self._index is None:
path = os.path.join(self.datadir, self.INDEX_FILE)
self.debug('loading similarity index from {}'.format(path))
@@ -33,22 +47,42 @@ def initialize(self):
'similarity index {} not found'.format(path),
backend_id=self.backend_id)

def train(self, corpus, project):
def initialize(self):
self._initialize_vectorizer()
self._initialize_index()

def _create_index(self, veccorpus):
self.info('creating similarity index')
veccorpus = project.vectorizer.transform(
(subj.text for subj in corpus.subjects))
gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
self._index = gensim.similarities.SparseMatrixSimilarity(
gscorpus, num_features=len(project.vectorizer.vocabulary_))
gscorpus, num_features=len(self._vectorizer.vocabulary_))
annif.util.atomic_save(
self._index,
self.datadir,
self.INDEX_FILE)

def train(self, corpus, project):
if corpus.is_empty():
raise NotSupportedException(
'Cannot train tfidf project with no documents')
self.info('transforming subject corpus')
subjects = corpus.subjects
self.info('creating vectorizer')
self._vectorizer = TfidfVectorizer(
tokenizer=project.analyzer.tokenize_words)
veccorpus = self._vectorizer.fit_transform(
(subj.text for subj in subjects))
annif.util.atomic_save(
self._vectorizer,
self.datadir,
self.VECTORIZER_FILE,
method=joblib.dump)
self._create_index(veccorpus)

def _suggest(self, text, project, params):
self.debug('Suggesting subjects for text "{}..." (len={})'.format(
text[:20], len(text)))
vectors = project.vectorizer.transform([text])
vectors = self._vectorizer.transform([text])
docsim = self._index[vectors[0]]
fullresult = VectorSuggestionResult(docsim, project.subjects)
return fullresult.filter(limit=int(self.params['limit']))
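For reviewers following the new `train()` path: the backend now owns the whole pipeline, from fitting the TF-IDF vectorizer on subject texts to building the gensim similarity index. A self-contained sketch of that pipeline, with toy subject texts standing in for `corpus.subjects`:

```python
import gensim.similarities
from gensim.matutils import Sparse2Corpus
from sklearn.feature_extraction.text import TfidfVectorizer

# toy subject texts standing in for corpus.subjects
subject_texts = ["cats and dogs", "dogs and wolves", "stars and planets"]

vectorizer = TfidfVectorizer()
veccorpus = vectorizer.fit_transform(subject_texts)  # one sparse TF-IDF row per subject

# gensim expects documents as rows here, hence documents_columns=False
gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
index = gensim.similarities.SparseMatrixSimilarity(
    gscorpus, num_features=len(vectorizer.vocabulary_))

# at suggest time, transform the query text with the *same* vectorizer
query = vectorizer.transform(["wolves and dogs"])
print(index[query[0]])  # cosine similarity of the query against each subject
```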
46 changes: 41 additions & 5 deletions annif/corpus/convert.py
@@ -6,11 +6,46 @@
from .types import Document, DocumentCorpus, SubjectCorpus


class SubjectWriter:
"""Writes a single subject file into a SubjectDirectory, performing
buffering to limit the number of I/O operations."""

_buffer = None

BUFFER_SIZE = 100

def __init__(self, path, uri, label):
self._path = path
self._buffer = ["{} {}".format(uri, label)]
self._created = False

def _flush(self):
if self._created:
mode = 'a'
else:
mode = 'w'

with open(self._path, mode, encoding='utf-8') as subjfile:
for text in self._buffer:
print(text, file=subjfile)
self._buffer = []
self._created = True

def write(self, text):
self._buffer.append(text)
if len(self._buffer) >= self.BUFFER_SIZE:
self._flush()

def close(self):
self._flush()


class DocumentToSubjectCorpusMixin(SubjectCorpus):
"""Mixin class for enabling a DocumentCorpus to act as a SubjectCorpus"""

_subject_corpus = None
_temp_directory = None
_subject_writer = None

@property
def subjects(self):
@@ -24,16 +59,14 @@ def _subject_filename(self, subject_id):

def _create_subject(self, subject_id, uri, label):
filename = self._subject_filename(subject_id)
with open(filename, 'w', encoding='utf-8') as subjfile:
print("{} {}".format(uri, label), file=subjfile)
self._subject_writer[subject_id] = SubjectWriter(filename, uri, label)

def _add_text_to_subject(self, subject_id, text):
filename = self._subject_filename(subject_id)
with open(filename, 'a', encoding='utf-8') as subjfile:
print(text, file=subjfile)
self._subject_writer[subject_id].write(text)

def _generate_corpus_from_documents(self):
self._temp_directory = tempfile.TemporaryDirectory()
self._subject_writer = {}

for subject_id, subject_info in enumerate(self._subject_index):
uri, label = subject_info
@@ -46,6 +79,9 @@ def _generate_corpus_from_documents(self):
continue
self._add_text_to_subject(subject_id, doc.text)

for subject_id, _ in enumerate(self._subject_index):
self._subject_writer[subject_id].close()

from .subject import SubjectDirectory
self._subject_corpus = SubjectDirectory(self._temp_directory.name)

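The `SubjectWriter` introduced above trades a little memory for far fewer I/O operations: lines are buffered and written in batches of `BUFFER_SIZE` rather than reopening the subject file once per document. A hypothetical standalone usage sketch (assuming the class is importable from `annif.corpus.convert` as added in this diff):

```python
import os
import tempfile

from annif.corpus.convert import SubjectWriter

path = os.path.join(tempfile.mkdtemp(), "subject.txt")
writer = SubjectWriter(path, "http://example.org/subj/1", "Example subject")

for i in range(250):
    writer.write("document text {}".format(i))  # buffered; file touched only on flush
writer.close()  # flush whatever is still in the buffer

with open(path, encoding="utf-8") as subjfile:
    print(sum(1 for _ in subjfile))  # 251: the "URI label" header line plus 250 texts
```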
44 changes: 0 additions & 44 deletions annif/project.py
@@ -4,8 +4,6 @@
import configparser
import enum
import os.path
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from flask import current_app
from shutil import rmtree
import annif
@@ -36,7 +34,6 @@ class AnnifProject(DatadirMixin):
_analyzer = None
_backend = None
_vocab = None
_vectorizer = None
initialized = False

# default values for configuration settings
@@ -80,15 +77,6 @@ def _initialize_subjects(self):
except AnnifException as err:
logger.warning(err.format_message())

def _initialize_vectorizer(self):
try:
vectorizer = self.vectorizer
logger.debug("Project '%s': initialized vectorizer: %s",
self.project_id,
str(vectorizer))
except AnnifException as err:
logger.warning(err.format_message())

def _initialize_backend(self):
logger.debug("Project '%s': initializing backend", self.project_id)
try:
@@ -107,7 +95,6 @@ def initialize(self):

self._initialize_analyzer()
self._initialize_subjects()
self._initialize_vectorizer()
self._initialize_backend()

self.initialized = True
@@ -167,19 +154,6 @@ def vocab(self):
def subjects(self):
return self.vocab.subjects

@property
def vectorizer(self):
if self._vectorizer is None:
path = os.path.join(self.datadir, 'vectorizer')
if os.path.exists(path):
logger.debug('loading vectorizer from %s', path)
self._vectorizer = joblib.load(path)
else:
raise NotInitializedException(
"vectorizer file '{}' not found".format(path),
project_id=self.project_id)
return self._vectorizer

def suggest(self, text, backend_params=None):
"""Suggest subjects the given text by passing it to the backend. Returns a
list of SubjectSuggestion objects ordered by decreasing score."""
@@ -190,28 +164,10 @@ def suggest(self, text, backend_params=None):
logger.debug('%d hits from backend', len(hits))
return hits

def _create_vectorizer(self, subjectcorpus):
if not self.backend.needs_subject_vectorizer:
logger.debug('not creating vectorizer: not needed by backend')
return
if subjectcorpus.is_empty():
raise NotSupportedException(
'using TfidfVectorizer with no documents')
logger.info('creating vectorizer')
self._vectorizer = TfidfVectorizer(
tokenizer=self.analyzer.tokenize_words)
self._vectorizer.fit((subj.text for subj in subjectcorpus.subjects))
annif.util.atomic_save(
self._vectorizer,
self.datadir,
'vectorizer',
method=joblib.dump)

def train(self, corpus):
"""train the project using documents from a metadata source"""

corpus.set_subject_index(self.subjects)
self._create_vectorizer(corpus)
self.backend.train(corpus, project=self)

def learn(self, corpus):
2 changes: 1 addition & 1 deletion tests/test_project.py
@@ -152,7 +152,7 @@ def test_project_train_tfidf_nodocuments(app, tmpdir):
empty_document_corpus = annif.corpus.DocumentFile(str(empty_file))
with pytest.raises(NotSupportedException) as excinfo:
project.train(empty_document_corpus)
assert 'using TfidfVectorizer with no documents' in str(excinfo.value)
assert 'Cannot train tfidf project with no documents' in str(excinfo.value)


def test_project_learn(app, tmpdir):