diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py index ec5d6e6d0..c98bd77be 100644 --- a/annif/backend/tfidf.py +++ b/annif/backend/tfidf.py @@ -2,6 +2,7 @@ TF-IDF normalized bag-of-words vector space""" import os.path +import tempfile import joblib import gensim.similarities from gensim.matutils import Sparse2Corpus @@ -12,6 +13,44 @@ from . import backend +class SubjectBuffer: + """A file-backed buffer to store and retrieve subject text.""" + + BUFFER_SIZE = 100 + + def __init__(self, tempdir, subject_id): + filename = '{:08d}.txt'.format(subject_id) + self._path = os.path.join(tempdir, filename) + self._buffer = [] + self._created = False + + def flush(self): + if self._created: + mode = 'a' + else: + mode = 'w' + + with open(self._path, mode, encoding='utf-8') as subjfile: + for text in self._buffer: + print(text, file=subjfile) + + self._buffer = [] + self._created = True + + def write(self, text): + self._buffer.append(text) + if len(self._buffer) >= self.BUFFER_SIZE: + self.flush() + + def read(self): + if not self._created: + # file was never created - we can simply return the buffer content + return "\n".join(self._buffer) + else: + with open(self._path, 'r', encoding='utf-8') as subjfile: + return subjfile.read() + "\n" + "\n".join(self._buffer) + + class TFIDFBackend(backend.AnnifBackend): """TF-IDF vector space similarity based backend for Annif""" name = "tfidf" @@ -24,6 +63,24 @@ class TFIDFBackend(backend.AnnifBackend): VECTORIZER_FILE = 'vectorizer' INDEX_FILE = 'tfidf-index' + def _generate_subjects_from_documents(self, corpus, project): + with tempfile.TemporaryDirectory() as tempdir: + subject_buffer = {} + for subject_id in range(len(project.subjects)): + subject_buffer[subject_id] = SubjectBuffer(tempdir, + subject_id) + + for doc in corpus.documents: + tokens = project.analyzer.tokenize_words(doc.text) + for uri in doc.uris: + subject_id = project.subjects.by_uri(uri) + if subject_id is None: + continue + subject_buffer[subject_id].write(" ".join(tokens)) + + for sid in range(len(project.subjects)): + yield subject_buffer[sid].read() + def _initialize_vectorizer(self): if self._vectorizer is None: path = os.path.join(self.datadir, self.VECTORIZER_FILE) @@ -66,12 +123,10 @@ def train(self, corpus, project): raise NotSupportedException( 'Cannot train tfidf project with no documents') self.info('transforming subject corpus') - subjects = corpus.subjects + subjects = self._generate_subjects_from_documents(corpus, project) self.info('creating vectorizer') - self._vectorizer = TfidfVectorizer( - tokenizer=project.analyzer.tokenize_words) - veccorpus = self._vectorizer.fit_transform( - (subj.text for subj in subjects)) + self._vectorizer = TfidfVectorizer() + veccorpus = self._vectorizer.fit_transform(subjects) annif.util.atomic_save( self._vectorizer, self.datadir, @@ -82,7 +137,8 @@ def train(self, corpus, project): def _suggest(self, text, project, params): self.debug('Suggesting subjects for text "{}..." (len={})'.format( text[:20], len(text))) - vectors = self._vectorizer.transform([text]) + tokens = project.analyzer.tokenize_words(text) + vectors = self._vectorizer.transform([" ".join(tokens)]) docsim = self._index[vectors[0]] fullresult = VectorSuggestionResult(docsim, project.subjects) return fullresult.filter(limit=int(self.params['limit'])) diff --git a/annif/corpus/__init__.py b/annif/corpus/__init__.py index 7d87d565a..b5dad8495 100644 --- a/annif/corpus/__init__.py +++ b/annif/corpus/__init__.py @@ -2,12 +2,12 @@ from .document import DocumentDirectory, DocumentFile, DocumentList -from .subject import Subject, SubjectDirectory, SubjectFileTSV +from .subject import Subject, SubjectFileTSV from .subject import SubjectIndex, SubjectSet from .skos import SubjectFileSKOS from .types import Document from .combine import CombinedCorpus __all__ = [DocumentDirectory, DocumentFile, DocumentList, Subject, - SubjectDirectory, SubjectFileTSV, SubjectIndex, SubjectSet, - SubjectFileSKOS, Document, CombinedCorpus] + SubjectFileTSV, SubjectIndex, SubjectSet, SubjectFileSKOS, + Document, CombinedCorpus] diff --git a/annif/corpus/combine.py b/annif/corpus/combine.py index bb9109acb..119044d40 100644 --- a/annif/corpus/combine.py +++ b/annif/corpus/combine.py @@ -1,29 +1,16 @@ """Class for combining multiple corpora so they behave like a single corpus""" import itertools -from .types import DocumentCorpus, SubjectCorpus, Subject +from .types import DocumentCorpus -class CombinedCorpus(SubjectCorpus, DocumentCorpus): +class CombinedCorpus(DocumentCorpus): """Class for combining multiple corpora so they behave like a single corpus""" def __init__(self, corpora): self._corpora = corpora - @property - def subjects(self): - for source_subjects in zip( - *[corpus.subjects for corpus in self._corpora]): - uri = None - label = None - texts = [] - for subject in source_subjects: - uri = subject.uri - label = subject.label - texts.append(subject.text) - yield Subject(uri=uri, label=label, text=" ".join(texts)) - @property def documents(self): return itertools.chain.from_iterable( diff --git a/annif/corpus/convert.py b/annif/corpus/convert.py deleted file mode 100644 index e3a1fe77a..000000000 --- a/annif/corpus/convert.py +++ /dev/null @@ -1,109 +0,0 @@ -"""Mixin classes for converting between SubjectCorpus and DocumentCorpus""" - -import collections -import os.path -import tempfile -from .types import Document, DocumentCorpus, SubjectCorpus - - -class SubjectWriter: - """Writes a single subject file into a SubjectDirectory, performing - buffering to limit the number of I/O operations.""" - - _buffer = None - - BUFFER_SIZE = 100 - - def __init__(self, path, uri, label): - self._path = path - self._buffer = ["{} {}".format(uri, label)] - self._created = False - - def _flush(self): - if self._created: - mode = 'a' - else: - mode = 'w' - - with open(self._path, mode, encoding='utf-8') as subjfile: - for text in self._buffer: - print(text, file=subjfile) - self._buffer = [] - self._created = True - - def write(self, text): - self._buffer.append(text) - if len(self._buffer) >= self.BUFFER_SIZE: - self._flush() - - def close(self): - self._flush() - - -class DocumentToSubjectCorpusMixin(SubjectCorpus): - """Mixin class for enabling a DocumentCorpus to act as a SubjectCorpus""" - - _subject_corpus = None - _temp_directory = None - _subject_writer = None - - @property - def subjects(self): - if self._subject_corpus is None: - self._generate_corpus_from_documents() - return self._subject_corpus.subjects - - def _subject_filename(self, subject_id): - filename = '{:08d}.txt'.format(subject_id) - return os.path.join(self._temp_directory.name, filename) - - def _create_subject(self, subject_id, uri, label): - filename = self._subject_filename(subject_id) - self._subject_writer[subject_id] = SubjectWriter(filename, uri, label) - - def _add_text_to_subject(self, subject_id, text): - self._subject_writer[subject_id].write(text) - - def _generate_corpus_from_documents(self): - self._temp_directory = tempfile.TemporaryDirectory() - self._subject_writer = {} - - for subject_id, subject_info in enumerate(self._subject_index): - uri, label = subject_info - self._create_subject(subject_id, uri, label) - - for doc in self.documents: - for uri in doc.uris: - subject_id = self._subject_index.by_uri(uri) - if subject_id is None: - continue - self._add_text_to_subject(subject_id, doc.text) - - for subject_id, _ in enumerate(self._subject_index): - self._subject_writer[subject_id].close() - - from .subject import SubjectDirectory - self._subject_corpus = SubjectDirectory(self._temp_directory.name) - - -class SubjectToDocumentCorpusMixin(DocumentCorpus): - """Mixin class for enabling a SubjectCorpus to act as a DocumentCorpus""" - - _document_uris = None - _document_labels = None - - @property - def documents(self): - if self._document_uris is None: - self._generate_corpus_from_subjects() - for text, uris in self._document_uris.items(): - labels = self._document_labels[text] - yield Document(text=text, uris=uris, labels=labels) - - def _generate_corpus_from_subjects(self): - self._document_uris = collections.defaultdict(set) - self._document_labels = collections.defaultdict(set) - for subj in self.subjects: - for line in subj.text.splitlines(): - self._document_uris[line].add(subj.uri) - self._document_labels[line].add(subj.label) diff --git a/annif/corpus/document.py b/annif/corpus/document.py index 773dfc623..3ebee44a1 100644 --- a/annif/corpus/document.py +++ b/annif/corpus/document.py @@ -6,13 +6,12 @@ import gzip import annif.util from .types import DocumentCorpus -from .convert import DocumentToSubjectCorpusMixin from .subject import SubjectSet logger = annif.logger -class DocumentDirectory(DocumentCorpus, DocumentToSubjectCorpusMixin): +class DocumentDirectory(DocumentCorpus): """A directory of files as a full text document corpus""" def __init__(self, path, require_subjects=False): @@ -49,7 +48,7 @@ def documents(self): labels=subjects.subject_labels) -class DocumentFile(DocumentCorpus, DocumentToSubjectCorpusMixin): +class DocumentFile(DocumentCorpus): """A TSV file as a corpus of documents with subjects""" def __init__(self, path): @@ -78,7 +77,7 @@ def _parse_tsv_line(self, line): line.rstrip()) -class DocumentList(DocumentCorpus, DocumentToSubjectCorpusMixin): +class DocumentList(DocumentCorpus): """A document corpus based on a list of other iterable of Document objects""" diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py index 3087da91f..67c2a39e4 100644 --- a/annif/corpus/skos.py +++ b/annif/corpus/skos.py @@ -3,7 +3,7 @@ import rdflib import rdflib.util from rdflib.namespace import SKOS, RDF, OWL -from .subject import Subject, SubjectCorpus +from .types import Subject, SubjectCorpus class SubjectFileSKOS(SubjectCorpus): diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index 96225b9a8..33e1c10ec 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -1,32 +1,13 @@ """Classes for supporting subject corpora expressed as directories or files""" -import glob -import os.path import annif.util import numpy as np from annif import logger -from .types import Subject, SubjectCorpus -from .convert import SubjectToDocumentCorpusMixin +from .types import Subject -class SubjectDirectory(SubjectCorpus, SubjectToDocumentCorpusMixin): - """A subject corpus in the form of a directory with .txt files.""" - - def __init__(self, path): - self.path = path - self._filenames = sorted(glob.glob(os.path.join(path, '*.txt'))) - - @property - def subjects(self): - for filename in self._filenames: - with open(filename, encoding='utf-8') as subjfile: - uri, label = subjfile.readline().strip().split(' ', 1) - text = ' '.join(subjfile.readlines()) - yield Subject(uri=uri, label=label, text=text) - - -class SubjectFileTSV(SubjectCorpus, SubjectToDocumentCorpusMixin): - """A subject corpus stored in a TSV file.""" +class SubjectFileTSV: + """A subject vocabulary stored in a TSV file.""" def __init__(self, path): self.path = path diff --git a/tests/conftest.py b/tests/conftest.py index d10c14d1e..75babdbd9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -74,8 +74,8 @@ def document_corpus(subject_index): @pytest.fixture(scope='module') -def project(document_corpus): +def project(document_corpus, subject_index): proj = unittest.mock.Mock() proj.analyzer = annif.analyzer.get_analyzer('snowball(finnish)') - proj.subjects = annif.corpus.SubjectIndex(document_corpus) + proj.subjects = subject_index return proj diff --git a/tests/test_backend_tfidf.py b/tests/test_backend_tfidf.py index 238909c99..66d008ea7 100644 --- a/tests/test_backend_tfidf.py +++ b/tests/test_backend_tfidf.py @@ -9,12 +9,10 @@ @pytest.fixture(scope='module') -def project(document_corpus): +def project(document_corpus, subject_index): proj = unittest.mock.Mock() proj.analyzer = annif.analyzer.get_analyzer('snowball(finnish)') - proj.subjects = annif.corpus.SubjectIndex(document_corpus) - proj.vectorizer = TfidfVectorizer(tokenizer=proj.analyzer.tokenize_words) - proj.vectorizer.fit([subj.text for subj in document_corpus.subjects]) + proj.subjects = subject_index return proj diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 1b2b81501..a93c399e1 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -148,46 +148,6 @@ def test_docdir_key_as_doccorpus(tmpdir, subject_index): assert docs[1].uris == {'http://www.yso.fi/onto/yso/p13027'} -def test_subjdir(tmpdir): - tmpdir.join('subj1.txt').write("""http://example.org/subj1 subject one - first subject - this is the first thing we know about""") - tmpdir.join('subj2.txt').write("""http://example.org/subj2 subject two - second subject - this is the second thing we know about""") - tmpdir.join('subj3.txt').write("""http://example.org/subj3 subject three - third subject - this is the third thing we know about""") - - subjdir = annif.corpus.SubjectDirectory(str(tmpdir)) - subjects = sorted(list(subjdir.subjects), key=lambda subj: subj.uri) - assert len(subjects) == 3 - assert subjects[0].uri == 'http://example.org/subj1' - assert subjects[0].label == 'subject one' - assert 'first' in subjects[0].text - assert subjects[1].uri == 'http://example.org/subj2' - assert subjects[1].label == 'subject two' - assert 'second' in subjects[1].text - assert subjects[2].uri == 'http://example.org/subj3' - assert subjects[2].label == 'subject three' - assert 'third' in subjects[2].text - - -def test_subjdir_as_doccorpus(tmpdir): - tmpdir.join('subj1.txt').write("""http://example.org/subj1 subject one - first subject - this is the first thing we know about""") - tmpdir.join('subj2.txt').write("""http://example.org/subj2 subject two - second subject - this is the second thing we know about""") - tmpdir.join('subj3.txt').write("""http://example.org/subj3 subject three - third subject - this is the third thing we know about""") - subjdir = annif.corpus.SubjectDirectory(str(tmpdir)) - documents = list(subjdir.documents) - assert len(documents) == 6 - - def test_subject_by_uri(subject_index): subj_id = subject_index.by_uri('http://www.yso.fi/onto/yso/p7141') assert subject_index[subj_id][1] == 'sinetit' @@ -252,7 +212,7 @@ def test_docfile_is_empty(tmpdir): assert docs.is_empty() -def test_combinedcorpus(tmpdir, subject_index): +def test_combinedcorpus(tmpdir): docfile = tmpdir.join('documents.tsv') docfile.write("""Läntinen\t Oulunlinnan\t @@ -262,7 +222,5 @@ def test_combinedcorpus(tmpdir, subject_index): corpus2 = annif.corpus.DocumentFile(str(docfile)) combined = annif.corpus.CombinedCorpus([corpus1, corpus2]) - combined.set_subject_index(subject_index) assert len(list(combined.documents)) == 6 - assert len(list(combined.subjects)) == len(list(corpus1.subjects)) diff --git a/tests/test_suggestion.py b/tests/test_suggestion.py index 460cde70b..50b88cf1a 100644 --- a/tests/test_suggestion.py +++ b/tests/test_suggestion.py @@ -51,8 +51,7 @@ def test_lazy_suggestion_result(subject_index): assert lar._object is not None -def test_list_suggestions_vector(document_corpus): - subjects = SubjectIndex(document_corpus) +def test_list_suggestions_vector(document_corpus, subject_index): suggestions = ListSuggestionResult( [ SubjectSuggestion( @@ -63,26 +62,25 @@ def test_list_suggestions_vector(document_corpus): uri='http://www.yso.fi/onto/yso/p6479', label='viikingit', score=0.5)], - subjects) + subject_index) assert isinstance(suggestions.vector, np.ndarray) - assert len(suggestions.vector) == len(subjects) + assert len(suggestions.vector) == len(subject_index) assert suggestions.vector.sum() == 1.5 for subject_id, score in enumerate(suggestions.vector): - if subjects[subject_id][1] == 'sinetit': + if subject_index[subject_id][1] == 'sinetit': assert score == 1.0 - elif subjects[subject_id][1] == 'viikingit': + elif subject_index[subject_id][1] == 'viikingit': assert score == 0.5 else: assert score == 0.0 -def test_list_suggestions_vector_notfound(document_corpus): - subjects = SubjectIndex(document_corpus) +def test_list_suggestions_vector_notfound(document_corpus, subject_index): suggestions = ListSuggestionResult( [ SubjectSuggestion( uri='http://example.com/notfound', label='not found', score=1.0)], - subjects) + subject_index) assert suggestions.vector.sum() == 0