Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Further optimizations to tfidf backend + rearchitecting #336

Merged
merged 6 commits into from
Oct 7, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 62 additions & 6 deletions annif/backend/tfidf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
TF-IDF normalized bag-of-words vector space"""

import os.path
import tempfile
import joblib
import gensim.similarities
from gensim.matutils import Sparse2Corpus
Expand All @@ -12,6 +13,44 @@
from . import backend


class SubjectBuffer:
    """A file-backed buffer to store and retrieve subject text."""

    # Number of texts held in memory before spilling to disk.
    BUFFER_SIZE = 100

    def __init__(self, tempdir, subject_id):
        # One file per subject, named by zero-padded subject id.
        self._path = os.path.join(tempdir, '{:08d}.txt'.format(subject_id))
        self._buffer = []
        self._created = False

    def flush(self):
        """Append buffered texts to the backing file and clear the buffer."""
        mode = 'a' if self._created else 'w'
        with open(self._path, mode, encoding='utf-8') as outfile:
            for line in self._buffer:
                print(line, file=outfile)
        self._buffer = []
        self._created = True

    def write(self, text):
        """Store a text snippet, spilling to disk once BUFFER_SIZE is reached."""
        self._buffer.append(text)
        if len(self._buffer) >= self.BUFFER_SIZE:
            self.flush()

    def read(self):
        """Return all stored text: on-disk content plus anything buffered."""
        if not self._created:
            # never flushed - everything is still held in memory
            return "\n".join(self._buffer)
        with open(self._path, 'r', encoding='utf-8') as infile:
            return infile.read() + "\n" + "\n".join(self._buffer)


class TFIDFBackend(backend.AnnifBackend):
"""TF-IDF vector space similarity based backend for Annif"""
name = "tfidf"
Expand All @@ -24,6 +63,24 @@ class TFIDFBackend(backend.AnnifBackend):
VECTORIZER_FILE = 'vectorizer'
INDEX_FILE = 'tfidf-index'

def _generate_subjects_from_documents(self, corpus, project):
with tempfile.TemporaryDirectory() as tempdir:
subject_buffer = {}
for subject_id in range(len(project.subjects)):
subject_buffer[subject_id] = SubjectBuffer(tempdir,
subject_id)

for doc in corpus.documents:
tokens = project.analyzer.tokenize_words(doc.text)
for uri in doc.uris:
subject_id = project.subjects.by_uri(uri)
if subject_id is None:
continue
subject_buffer[subject_id].write(" ".join(tokens))

for sid in range(len(project.subjects)):
yield subject_buffer[sid].read()

def _initialize_vectorizer(self):
if self._vectorizer is None:
path = os.path.join(self.datadir, self.VECTORIZER_FILE)
Expand Down Expand Up @@ -66,12 +123,10 @@ def train(self, corpus, project):
raise NotSupportedException(
'Cannot train tfidf project with no documents')
self.info('transforming subject corpus')
subjects = corpus.subjects
subjects = self._generate_subjects_from_documents(corpus, project)
self.info('creating vectorizer')
self._vectorizer = TfidfVectorizer(
tokenizer=project.analyzer.tokenize_words)
veccorpus = self._vectorizer.fit_transform(
(subj.text for subj in subjects))
self._vectorizer = TfidfVectorizer()
veccorpus = self._vectorizer.fit_transform(subjects)
annif.util.atomic_save(
self._vectorizer,
self.datadir,
Expand All @@ -82,7 +137,8 @@ def train(self, corpus, project):
def _suggest(self, text, project, params):
self.debug('Suggesting subjects for text "{}..." (len={})'.format(
text[:20], len(text)))
vectors = self._vectorizer.transform([text])
tokens = project.analyzer.tokenize_words(text)
vectors = self._vectorizer.transform([" ".join(tokens)])
docsim = self._index[vectors[0]]
fullresult = VectorSuggestionResult(docsim, project.subjects)
return fullresult.filter(limit=int(self.params['limit']))
6 changes: 3 additions & 3 deletions annif/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@


from .document import DocumentDirectory, DocumentFile, DocumentList
from .subject import Subject, SubjectDirectory, SubjectFileTSV
from .subject import Subject, SubjectFileTSV
from .subject import SubjectIndex, SubjectSet
from .skos import SubjectFileSKOS
from .types import Document
from .combine import CombinedCorpus

__all__ = [DocumentDirectory, DocumentFile, DocumentList, Subject,
SubjectDirectory, SubjectFileTSV, SubjectIndex, SubjectSet,
SubjectFileSKOS, Document, CombinedCorpus]
SubjectFileTSV, SubjectIndex, SubjectSet, SubjectFileSKOS,
Document, CombinedCorpus]
17 changes: 2 additions & 15 deletions annif/corpus/combine.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,16 @@
"""Class for combining multiple corpora so they behave like a single corpus"""

import itertools
from .types import DocumentCorpus, SubjectCorpus, Subject
from .types import DocumentCorpus


class CombinedCorpus(SubjectCorpus, DocumentCorpus):
class CombinedCorpus(DocumentCorpus):
"""Class for combining multiple corpora so they behave like a single
corpus"""

def __init__(self, corpora):
self._corpora = corpora

@property
def subjects(self):
for source_subjects in zip(
*[corpus.subjects for corpus in self._corpora]):
uri = None
label = None
texts = []
for subject in source_subjects:
uri = subject.uri
label = subject.label
texts.append(subject.text)
yield Subject(uri=uri, label=label, text=" ".join(texts))

@property
def documents(self):
return itertools.chain.from_iterable(
Expand Down
109 changes: 0 additions & 109 deletions annif/corpus/convert.py

This file was deleted.

7 changes: 3 additions & 4 deletions annif/corpus/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,12 @@
import gzip
import annif.util
from .types import DocumentCorpus
from .convert import DocumentToSubjectCorpusMixin
from .subject import SubjectSet

logger = annif.logger


class DocumentDirectory(DocumentCorpus, DocumentToSubjectCorpusMixin):
class DocumentDirectory(DocumentCorpus):
"""A directory of files as a full text document corpus"""

def __init__(self, path, require_subjects=False):
Expand Down Expand Up @@ -49,7 +48,7 @@ def documents(self):
labels=subjects.subject_labels)


class DocumentFile(DocumentCorpus, DocumentToSubjectCorpusMixin):
class DocumentFile(DocumentCorpus):
"""A TSV file as a corpus of documents with subjects"""

def __init__(self, path):
Expand Down Expand Up @@ -78,7 +77,7 @@ def _parse_tsv_line(self, line):
line.rstrip())


class DocumentList(DocumentCorpus, DocumentToSubjectCorpusMixin):
class DocumentList(DocumentCorpus):
"""A document corpus based on a list of other iterable of Document
objects"""

Expand Down
2 changes: 1 addition & 1 deletion annif/corpus/skos.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import rdflib
import rdflib.util
from rdflib.namespace import SKOS, RDF, OWL
from .subject import Subject, SubjectCorpus
from .types import Subject, SubjectCorpus


class SubjectFileSKOS(SubjectCorpus):
Expand Down
25 changes: 3 additions & 22 deletions annif/corpus/subject.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,13 @@
"""Classes for supporting subject corpora expressed as directories or files"""

import glob
import os.path
import annif.util
import numpy as np
from annif import logger
from .types import Subject, SubjectCorpus
from .convert import SubjectToDocumentCorpusMixin
from .types import Subject


class SubjectDirectory(SubjectCorpus, SubjectToDocumentCorpusMixin):
"""A subject corpus in the form of a directory with .txt files."""

def __init__(self, path):
self.path = path
self._filenames = sorted(glob.glob(os.path.join(path, '*.txt')))

@property
def subjects(self):
for filename in self._filenames:
with open(filename, encoding='utf-8') as subjfile:
uri, label = subjfile.readline().strip().split(' ', 1)
text = ' '.join(subjfile.readlines())
yield Subject(uri=uri, label=label, text=text)


class SubjectFileTSV(SubjectCorpus, SubjectToDocumentCorpusMixin):
"""A subject corpus stored in a TSV file."""
class SubjectFileTSV:
"""A subject vocabulary stored in a TSV file."""

def __init__(self, path):
self.path = path
Expand Down
4 changes: 2 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ def document_corpus(subject_index):


@pytest.fixture(scope='module')
def project(document_corpus):
def project(document_corpus, subject_index):
proj = unittest.mock.Mock()
proj.analyzer = annif.analyzer.get_analyzer('snowball(finnish)')
proj.subjects = annif.corpus.SubjectIndex(document_corpus)
proj.subjects = subject_index
return proj
6 changes: 2 additions & 4 deletions tests/test_backend_tfidf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,10 @@


@pytest.fixture(scope='module')
def project(document_corpus):
def project(document_corpus, subject_index):
proj = unittest.mock.Mock()
proj.analyzer = annif.analyzer.get_analyzer('snowball(finnish)')
proj.subjects = annif.corpus.SubjectIndex(document_corpus)
proj.vectorizer = TfidfVectorizer(tokenizer=proj.analyzer.tokenize_words)
proj.vectorizer.fit([subj.text for subj in document_corpus.subjects])
proj.subjects = subject_index
return proj


Expand Down
Loading