From 01d1c4367f589b79bb71fb9587353e7043948929 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 4 Oct 2019 16:33:17 +0300 Subject: [PATCH 1/6] Remove dead code: conversion from SubjectCorpus to DocumentCorpus was only used by a unit test, not real code --- annif/corpus/convert.py | 23 ----------------------- annif/corpus/subject.py | 5 ++--- tests/test_corpus.py | 15 --------------- 3 files changed, 2 insertions(+), 41 deletions(-) diff --git a/annif/corpus/convert.py b/annif/corpus/convert.py index e3a1fe77a..0648cef2d 100644 --- a/annif/corpus/convert.py +++ b/annif/corpus/convert.py @@ -84,26 +84,3 @@ def _generate_corpus_from_documents(self): from .subject import SubjectDirectory self._subject_corpus = SubjectDirectory(self._temp_directory.name) - - -class SubjectToDocumentCorpusMixin(DocumentCorpus): - """Mixin class for enabling a SubjectCorpus to act as a DocumentCorpus""" - - _document_uris = None - _document_labels = None - - @property - def documents(self): - if self._document_uris is None: - self._generate_corpus_from_subjects() - for text, uris in self._document_uris.items(): - labels = self._document_labels[text] - yield Document(text=text, uris=uris, labels=labels) - - def _generate_corpus_from_subjects(self): - self._document_uris = collections.defaultdict(set) - self._document_labels = collections.defaultdict(set) - for subj in self.subjects: - for line in subj.text.splitlines(): - self._document_uris[line].add(subj.uri) - self._document_labels[line].add(subj.label) diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index 96225b9a8..82fb30e13 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -6,10 +6,9 @@ import numpy as np from annif import logger from .types import Subject, SubjectCorpus -from .convert import SubjectToDocumentCorpusMixin -class SubjectDirectory(SubjectCorpus, SubjectToDocumentCorpusMixin): +class SubjectDirectory(SubjectCorpus): """A subject corpus in the form of a directory with .txt files.""" def __init__(self, path): @@ -25,7 +24,7 @@ def subjects(self): yield Subject(uri=uri, label=label, text=text) -class SubjectFileTSV(SubjectCorpus, SubjectToDocumentCorpusMixin): +class SubjectFileTSV(SubjectCorpus): """A subject corpus stored in a TSV file.""" def __init__(self, path): diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 1b2b81501..1006a9d8a 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -173,21 +173,6 @@ def test_subjdir(tmpdir): assert 'third' in subjects[2].text -def test_subjdir_as_doccorpus(tmpdir): - tmpdir.join('subj1.txt').write("""http://example.org/subj1 subject one - first subject - this is the first thing we know about""") - tmpdir.join('subj2.txt').write("""http://example.org/subj2 subject two - second subject - this is the second thing we know about""") - tmpdir.join('subj3.txt').write("""http://example.org/subj3 subject three - third subject - this is the third thing we know about""") - subjdir = annif.corpus.SubjectDirectory(str(tmpdir)) - documents = list(subjdir.documents) - assert len(documents) == 6 - - def test_subject_by_uri(subject_index): subj_id = subject_index.by_uri('http://www.yso.fi/onto/yso/p7141') assert subject_index[subj_id][1] == 'sinetit' From 987c6ddcd2bcb07a5dadc5687bd2da3cb64fc99b Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 4 Oct 2019 16:57:43 +0300 Subject: [PATCH 2/6] Move the conversion from document to subject corpus entirely inside TFIDFBackend since nothing else uses it --- annif/backend/tfidf.py | 73 ++++++++++++++++++++++++++++++- annif/corpus/combine.py | 17 +------- annif/corpus/convert.py | 86 ------------------------------------- annif/corpus/document.py | 7 ++- annif/corpus/subject.py | 6 +-- tests/conftest.py | 4 +- tests/test_backend_tfidf.py | 6 +-- tests/test_corpus.py | 4 +- tests/test_suggestion.py | 16 +++---- 9 files changed, 92 insertions(+), 127 deletions(-) delete mode 100644 annif/corpus/convert.py diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py index ec5d6e6d0..d4afc68fe 100644 --- a/annif/backend/tfidf.py +++ b/annif/backend/tfidf.py @@ -2,16 +2,52 @@ TF-IDF normalized bag-of-words vector space""" import os.path +import tempfile import joblib import gensim.similarities from gensim.matutils import Sparse2Corpus from sklearn.feature_extraction.text import TfidfVectorizer import annif.util +from annif.corpus.subject import SubjectDirectory from annif.suggestion import VectorSuggestionResult from annif.exception import NotInitializedException, NotSupportedException from . import backend +class SubjectWriter: + """Writes a single subject file into a SubjectDirectory, performing + buffering to limit the number of I/O operations.""" + + _buffer = None + + BUFFER_SIZE = 100 + + def __init__(self, path, uri, label): + self._path = path + self._buffer = ["{} {}".format(uri, label)] + self._created = False + + def _flush(self): + if self._created: + mode = 'a' + else: + mode = 'w' + + with open(self._path, mode, encoding='utf-8') as subjfile: + for text in self._buffer: + print(text, file=subjfile) + self._buffer = [] + self._created = True + + def write(self, text): + self._buffer.append(text) + if len(self._buffer) >= self.BUFFER_SIZE: + self._flush() + + def close(self): + self._flush() + + class TFIDFBackend(backend.AnnifBackend): """TF-IDF vector space similarity based backend for Annif""" name = "tfidf" @@ -24,6 +60,40 @@ class TFIDFBackend(backend.AnnifBackend): VECTORIZER_FILE = 'vectorizer' INDEX_FILE = 'tfidf-index' + _temp_directory = None + _subject_writer = None + + def _subject_filename(self, subject_id): + filename = '{:08d}.txt'.format(subject_id) + return os.path.join(self._temp_directory.name, filename) + + def _create_subject(self, subject_id, uri, label): + filename = self._subject_filename(subject_id) + self._subject_writer[subject_id] = SubjectWriter(filename, uri, label) + + def _add_text_to_subject(self, subject_id, text): + self._subject_writer[subject_id].write(text) + + def _generate_subjects_from_documents(self, corpus, project): + self._temp_directory = tempfile.TemporaryDirectory() + self._subject_writer = {} + + for subject_id, subject_info in enumerate(project.subjects): + uri, label = subject_info + self._create_subject(subject_id, uri, label) + + for doc in corpus.documents: + for uri in doc.uris: + subject_id = project.subjects.by_uri(uri) + if subject_id is None: + continue + self._add_text_to_subject(subject_id, doc.text) + + for subject_id, _ in enumerate(project.subjects): + self._subject_writer[subject_id].close() + + return SubjectDirectory(self._temp_directory.name) + def _initialize_vectorizer(self): if self._vectorizer is None: path = os.path.join(self.datadir, self.VECTORIZER_FILE) @@ -66,7 +136,8 @@ def train(self, corpus, project): raise NotSupportedException( 'Cannot train tfidf project with no documents') self.info('transforming subject corpus') - subjects = corpus.subjects + subjects = self._generate_subjects_from_documents( + corpus, project).subjects self.info('creating vectorizer') self._vectorizer = TfidfVectorizer( tokenizer=project.analyzer.tokenize_words) diff --git a/annif/corpus/combine.py b/annif/corpus/combine.py index bb9109acb..119044d40 100644 --- a/annif/corpus/combine.py +++ b/annif/corpus/combine.py @@ -1,29 +1,16 @@ """Class for combining multiple corpora so they behave like a single corpus""" import itertools -from .types import DocumentCorpus, SubjectCorpus, Subject +from .types import DocumentCorpus -class CombinedCorpus(SubjectCorpus, DocumentCorpus): +class CombinedCorpus(DocumentCorpus): """Class for combining multiple corpora so they behave like a single corpus""" def __init__(self, corpora): self._corpora = corpora - @property - def subjects(self): - for source_subjects in zip( - *[corpus.subjects for corpus in self._corpora]): - uri = None - label = None - texts = [] - for subject in source_subjects: - uri = subject.uri - label = subject.label - texts.append(subject.text) - yield Subject(uri=uri, label=label, text=" ".join(texts)) - @property def documents(self): return itertools.chain.from_iterable( diff --git a/annif/corpus/convert.py b/annif/corpus/convert.py deleted file mode 100644 index 0648cef2d..000000000 --- a/annif/corpus/convert.py +++ /dev/null @@ -1,86 +0,0 @@ -"""Mixin classes for converting between SubjectCorpus and DocumentCorpus""" - -import collections -import os.path -import tempfile -from .types import Document, DocumentCorpus, SubjectCorpus - - -class SubjectWriter: - """Writes a single subject file into a SubjectDirectory, performing - buffering to limit the number of I/O operations.""" - - _buffer = None - - BUFFER_SIZE = 100 - - def __init__(self, path, uri, label): - self._path = path - self._buffer = ["{} {}".format(uri, label)] - self._created = False - - def _flush(self): - if self._created: - mode = 'a' - else: - mode = 'w' - - with open(self._path, mode, encoding='utf-8') as subjfile: - for text in self._buffer: - print(text, file=subjfile) - self._buffer = [] - self._created = True - - def write(self, text): - self._buffer.append(text) - if len(self._buffer) >= self.BUFFER_SIZE: - self._flush() - - def close(self): - self._flush() - - -class DocumentToSubjectCorpusMixin(SubjectCorpus): - """Mixin class for enabling a DocumentCorpus to act as a SubjectCorpus""" - - _subject_corpus = None - _temp_directory = None - _subject_writer = None - - @property - def subjects(self): - if self._subject_corpus is None: - self._generate_corpus_from_documents() - return self._subject_corpus.subjects - - def _subject_filename(self, subject_id): - filename = '{:08d}.txt'.format(subject_id) - return os.path.join(self._temp_directory.name, filename) - - def _create_subject(self, subject_id, uri, label): - filename = self._subject_filename(subject_id) - self._subject_writer[subject_id] = SubjectWriter(filename, uri, label) - - def _add_text_to_subject(self, subject_id, text): - self._subject_writer[subject_id].write(text) - - def _generate_corpus_from_documents(self): - self._temp_directory = tempfile.TemporaryDirectory() - self._subject_writer = {} - - for subject_id, subject_info in enumerate(self._subject_index): - uri, label = subject_info - self._create_subject(subject_id, uri, label) - - for doc in self.documents: - for uri in doc.uris: - subject_id = self._subject_index.by_uri(uri) - if subject_id is None: - continue - self._add_text_to_subject(subject_id, doc.text) - - for subject_id, _ in enumerate(self._subject_index): - self._subject_writer[subject_id].close() - - from .subject import SubjectDirectory - self._subject_corpus = SubjectDirectory(self._temp_directory.name) diff --git a/annif/corpus/document.py b/annif/corpus/document.py index 773dfc623..3ebee44a1 100644 --- a/annif/corpus/document.py +++ b/annif/corpus/document.py @@ -6,13 +6,12 @@ import gzip import annif.util from .types import DocumentCorpus -from .convert import DocumentToSubjectCorpusMixin from .subject import SubjectSet logger = annif.logger -class DocumentDirectory(DocumentCorpus, DocumentToSubjectCorpusMixin): +class DocumentDirectory(DocumentCorpus): """A directory of files as a full text document corpus""" def __init__(self, path, require_subjects=False): @@ -49,7 +48,7 @@ def documents(self): labels=subjects.subject_labels) -class DocumentFile(DocumentCorpus, DocumentToSubjectCorpusMixin): +class DocumentFile(DocumentCorpus): """A TSV file as a corpus of documents with subjects""" def __init__(self, path): @@ -78,7 +77,7 @@ def _parse_tsv_line(self, line): line.rstrip()) -class DocumentList(DocumentCorpus, DocumentToSubjectCorpusMixin): +class DocumentList(DocumentCorpus): """A document corpus based on a list of other iterable of Document objects""" diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index 82fb30e13..c1a1fa853 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -8,7 +8,7 @@ from .types import Subject, SubjectCorpus -class SubjectDirectory(SubjectCorpus): +class SubjectDirectory: """A subject corpus in the form of a directory with .txt files.""" def __init__(self, path): @@ -24,8 +24,8 @@ def subjects(self): yield Subject(uri=uri, label=label, text=text) -class SubjectFileTSV(SubjectCorpus): - """A subject corpus stored in a TSV file.""" +class SubjectFileTSV: + """A subject vocabulary stored in a TSV file.""" def __init__(self, path): self.path = path diff --git a/tests/conftest.py b/tests/conftest.py index d10c14d1e..75babdbd9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -74,8 +74,8 @@ def document_corpus(subject_index): @pytest.fixture(scope='module') -def project(document_corpus): +def project(document_corpus, subject_index): proj = unittest.mock.Mock() proj.analyzer = annif.analyzer.get_analyzer('snowball(finnish)') - proj.subjects = annif.corpus.SubjectIndex(document_corpus) + proj.subjects = subject_index return proj diff --git a/tests/test_backend_tfidf.py b/tests/test_backend_tfidf.py index 238909c99..66d008ea7 100644 --- a/tests/test_backend_tfidf.py +++ b/tests/test_backend_tfidf.py @@ -9,12 +9,10 @@ @pytest.fixture(scope='module') -def project(document_corpus): +def project(document_corpus, subject_index): proj = unittest.mock.Mock() proj.analyzer = annif.analyzer.get_analyzer('snowball(finnish)') - proj.subjects = annif.corpus.SubjectIndex(document_corpus) - proj.vectorizer = TfidfVectorizer(tokenizer=proj.analyzer.tokenize_words) - proj.vectorizer.fit([subj.text for subj in document_corpus.subjects]) + proj.subjects = subject_index return proj diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 1006a9d8a..fe67eccf8 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -237,7 +237,7 @@ def test_docfile_is_empty(tmpdir): assert docs.is_empty() -def test_combinedcorpus(tmpdir, subject_index): +def test_combinedcorpus(tmpdir): docfile = tmpdir.join('documents.tsv') docfile.write("""Läntinen\t Oulunlinnan\t @@ -247,7 +247,5 @@ def test_combinedcorpus(tmpdir, subject_index): corpus2 = annif.corpus.DocumentFile(str(docfile)) combined = annif.corpus.CombinedCorpus([corpus1, corpus2]) - combined.set_subject_index(subject_index) assert len(list(combined.documents)) == 6 - assert len(list(combined.subjects)) == len(list(corpus1.subjects)) diff --git a/tests/test_suggestion.py b/tests/test_suggestion.py index 460cde70b..50b88cf1a 100644 --- a/tests/test_suggestion.py +++ b/tests/test_suggestion.py @@ -51,8 +51,7 @@ def test_lazy_suggestion_result(subject_index): assert lar._object is not None -def test_list_suggestions_vector(document_corpus): - subjects = SubjectIndex(document_corpus) +def test_list_suggestions_vector(document_corpus, subject_index): suggestions = ListSuggestionResult( [ SubjectSuggestion( @@ -63,26 +62,25 @@ def test_list_suggestions_vector(document_corpus): uri='http://www.yso.fi/onto/yso/p6479', label='viikingit', score=0.5)], - subjects) + subject_index) assert isinstance(suggestions.vector, np.ndarray) - assert len(suggestions.vector) == len(subjects) + assert len(suggestions.vector) == len(subject_index) assert suggestions.vector.sum() == 1.5 for subject_id, score in enumerate(suggestions.vector): - if subjects[subject_id][1] == 'sinetit': + if subject_index[subject_id][1] == 'sinetit': assert score == 1.0 - elif subjects[subject_id][1] == 'viikingit': + elif subject_index[subject_id][1] == 'viikingit': assert score == 0.5 else: assert score == 0.0 -def test_list_suggestions_vector_notfound(document_corpus): - subjects = SubjectIndex(document_corpus) +def test_list_suggestions_vector_notfound(document_corpus, subject_index): suggestions = ListSuggestionResult( [ SubjectSuggestion( uri='http://example.com/notfound', label='not found', score=1.0)], - subjects) + subject_index) assert suggestions.vector.sum() == 0 From 2cb42da576cc53e82c5ce17529601a3dd6daa5d8 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 4 Oct 2019 17:21:36 +0300 Subject: [PATCH 3/6] Perform document to subject conversion in memory; remove stale SubjectDirectory class --- annif/backend/tfidf.py | 72 +++++----------------------------------- annif/corpus/__init__.py | 6 ++-- annif/corpus/subject.py | 16 --------- tests/test_corpus.py | 25 -------------- 4 files changed, 11 insertions(+), 108 deletions(-) diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py index d4afc68fe..b64dd6e80 100644 --- a/annif/backend/tfidf.py +++ b/annif/backend/tfidf.py @@ -1,53 +1,18 @@ """Backend that returns most similar subjects based on similarity in sparse TF-IDF normalized bag-of-words vector space""" +import collections import os.path -import tempfile import joblib import gensim.similarities from gensim.matutils import Sparse2Corpus from sklearn.feature_extraction.text import TfidfVectorizer import annif.util -from annif.corpus.subject import SubjectDirectory from annif.suggestion import VectorSuggestionResult from annif.exception import NotInitializedException, NotSupportedException from . import backend -class SubjectWriter: - """Writes a single subject file into a SubjectDirectory, performing - buffering to limit the number of I/O operations.""" - - _buffer = None - - BUFFER_SIZE = 100 - - def __init__(self, path, uri, label): - self._path = path - self._buffer = ["{} {}".format(uri, label)] - self._created = False - - def _flush(self): - if self._created: - mode = 'a' - else: - mode = 'w' - - with open(self._path, mode, encoding='utf-8') as subjfile: - for text in self._buffer: - print(text, file=subjfile) - self._buffer = [] - self._created = True - - def write(self, text): - self._buffer.append(text) - if len(self._buffer) >= self.BUFFER_SIZE: - self._flush() - - def close(self): - self._flush() - - class TFIDFBackend(backend.AnnifBackend): """TF-IDF vector space similarity based backend for Annif""" name = "tfidf" @@ -60,39 +25,20 @@ class TFIDFBackend(backend.AnnifBackend): VECTORIZER_FILE = 'vectorizer' INDEX_FILE = 'tfidf-index' - _temp_directory = None - _subject_writer = None - - def _subject_filename(self, subject_id): - filename = '{:08d}.txt'.format(subject_id) - return os.path.join(self._temp_directory.name, filename) - - def _create_subject(self, subject_id, uri, label): - filename = self._subject_filename(subject_id) - self._subject_writer[subject_id] = SubjectWriter(filename, uri, label) - - def _add_text_to_subject(self, subject_id, text): - self._subject_writer[subject_id].write(text) - def _generate_subjects_from_documents(self, corpus, project): - self._temp_directory = tempfile.TemporaryDirectory() - self._subject_writer = {} - - for subject_id, subject_info in enumerate(project.subjects): - uri, label = subject_info - self._create_subject(subject_id, uri, label) + subject_text = collections.defaultdict(list) for doc in corpus.documents: for uri in doc.uris: subject_id = project.subjects.by_uri(uri) if subject_id is None: continue - self._add_text_to_subject(subject_id, doc.text) + subject_text[subject_id].append(doc.text) - for subject_id, _ in enumerate(project.subjects): - self._subject_writer[subject_id].close() + for subject_id in subject_text: + subject_text[subject_id] = '\n'.join(subject_text[subject_id]) - return SubjectDirectory(self._temp_directory.name) + return (subject_text[sid] for sid in sorted(subject_text.keys())) def _initialize_vectorizer(self): if self._vectorizer is None: @@ -136,13 +82,11 @@ def train(self, corpus, project): raise NotSupportedException( 'Cannot train tfidf project with no documents') self.info('transforming subject corpus') - subjects = self._generate_subjects_from_documents( - corpus, project).subjects + subjects = self._generate_subjects_from_documents(corpus, project) self.info('creating vectorizer') self._vectorizer = TfidfVectorizer( tokenizer=project.analyzer.tokenize_words) - veccorpus = self._vectorizer.fit_transform( - (subj.text for subj in subjects)) + veccorpus = self._vectorizer.fit_transform(subjects) annif.util.atomic_save( self._vectorizer, self.datadir, diff --git a/annif/corpus/__init__.py b/annif/corpus/__init__.py index 7d87d565a..b5dad8495 100644 --- a/annif/corpus/__init__.py +++ b/annif/corpus/__init__.py @@ -2,12 +2,12 @@ from .document import DocumentDirectory, DocumentFile, DocumentList -from .subject import Subject, SubjectDirectory, SubjectFileTSV +from .subject import Subject, SubjectFileTSV from .subject import SubjectIndex, SubjectSet from .skos import SubjectFileSKOS from .types import Document from .combine import CombinedCorpus __all__ = [DocumentDirectory, DocumentFile, DocumentList, Subject, - SubjectDirectory, SubjectFileTSV, SubjectIndex, SubjectSet, - SubjectFileSKOS, Document, CombinedCorpus] + SubjectFileTSV, SubjectIndex, SubjectSet, SubjectFileSKOS, + Document, CombinedCorpus] diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index c1a1fa853..fb23782f2 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -8,22 +8,6 @@ from .types import Subject, SubjectCorpus -class SubjectDirectory: - """A subject corpus in the form of a directory with .txt files.""" - - def __init__(self, path): - self.path = path - self._filenames = sorted(glob.glob(os.path.join(path, '*.txt'))) - - @property - def subjects(self): - for filename in self._filenames: - with open(filename, encoding='utf-8') as subjfile: - uri, label = subjfile.readline().strip().split(' ', 1) - text = ' '.join(subjfile.readlines()) - yield Subject(uri=uri, label=label, text=text) - - class SubjectFileTSV: """A subject vocabulary stored in a TSV file.""" diff --git a/tests/test_corpus.py b/tests/test_corpus.py index fe67eccf8..a93c399e1 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -148,31 +148,6 @@ def test_docdir_key_as_doccorpus(tmpdir, subject_index): assert docs[1].uris == {'http://www.yso.fi/onto/yso/p13027'} -def test_subjdir(tmpdir): - tmpdir.join('subj1.txt').write("""http://example.org/subj1 subject one - first subject - this is the first thing we know about""") - tmpdir.join('subj2.txt').write("""http://example.org/subj2 subject two - second subject - this is the second thing we know about""") - tmpdir.join('subj3.txt').write("""http://example.org/subj3 subject three - third subject - this is the third thing we know about""") - - subjdir = annif.corpus.SubjectDirectory(str(tmpdir)) - subjects = sorted(list(subjdir.subjects), key=lambda subj: subj.uri) - assert len(subjects) == 3 - assert subjects[0].uri == 'http://example.org/subj1' - assert subjects[0].label == 'subject one' - assert 'first' in subjects[0].text - assert subjects[1].uri == 'http://example.org/subj2' - assert subjects[1].label == 'subject two' - assert 'second' in subjects[1].text - assert subjects[2].uri == 'http://example.org/subj3' - assert subjects[2].label == 'subject three' - assert 'third' in subjects[2].text - - def test_subject_by_uri(subject_index): subj_id = subject_index.by_uri('http://www.yso.fi/onto/yso/p7141') assert subject_index[subj_id][1] == 'sinetit' From 3cc830b9aa04c386ec26e56841874221183e5d4f Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 4 Oct 2019 17:46:22 +0300 Subject: [PATCH 4/6] Tokenize text during conversion to subject corpus instead of within TfidfTransformer, to avoid tokenizing the same text many times if it has multiple subjects --- annif/backend/tfidf.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py index b64dd6e80..39d21636c 100644 --- a/annif/backend/tfidf.py +++ b/annif/backend/tfidf.py @@ -26,19 +26,18 @@ class TFIDFBackend(backend.AnnifBackend): INDEX_FILE = 'tfidf-index' def _generate_subjects_from_documents(self, corpus, project): - subject_text = collections.defaultdict(list) + subject_tokens = collections.defaultdict(list) for doc in corpus.documents: + tokens = project.analyzer.tokenize_words(doc.text) for uri in doc.uris: subject_id = project.subjects.by_uri(uri) if subject_id is None: continue - subject_text[subject_id].append(doc.text) + subject_tokens[subject_id].extend(tokens) - for subject_id in subject_text: - subject_text[subject_id] = '\n'.join(subject_text[subject_id]) - - return (subject_text[sid] for sid in sorted(subject_text.keys())) + return (" ".join(subject_tokens[sid]) + for sid in range(len(project.subjects))) def _initialize_vectorizer(self): if self._vectorizer is None: @@ -84,8 +83,7 @@ def train(self, corpus, project): self.info('transforming subject corpus') subjects = self._generate_subjects_from_documents(corpus, project) self.info('creating vectorizer') - self._vectorizer = TfidfVectorizer( - tokenizer=project.analyzer.tokenize_words) + self._vectorizer = TfidfVectorizer() veccorpus = self._vectorizer.fit_transform(subjects) annif.util.atomic_save( self._vectorizer, @@ -97,7 +95,8 @@ def train(self, corpus, project): def _suggest(self, text, project, params): self.debug('Suggesting subjects for text "{}..." (len={})'.format( text[:20], len(text))) - vectors = self._vectorizer.transform([text]) + tokens = project.analyzer.tokenize_words(text) + vectors = self._vectorizer.transform([" ".join(tokens)]) docsim = self._index[vectors[0]] fullresult = VectorSuggestionResult(docsim, project.subjects) return fullresult.filter(limit=int(self.params['limit'])) From fe4bdc55ccab75fe73932777d31e8c01e5047d11 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Mon, 7 Oct 2019 13:23:07 +0300 Subject: [PATCH 5/6] Spool large subject texts into files instead of keeping everything in memory --- annif/backend/tfidf.py | 67 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 55 insertions(+), 12 deletions(-) diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py index 39d21636c..d952c37ee 100644 --- a/annif/backend/tfidf.py +++ b/annif/backend/tfidf.py @@ -3,6 +3,7 @@ import collections import os.path +import tempfile import joblib import gensim.similarities from gensim.matutils import Sparse2Corpus @@ -13,6 +14,44 @@ from . import backend +class SubjectBuffer: + """A file-backed buffer to store and retrieve subject text.""" + + BUFFER_SIZE = 100 + + def __init__(self, tempdir, subject_id): + filename = '{:08d}.txt'.format(subject_id) + self._path = os.path.join(tempdir, filename) + self._buffer = [] + self._created = False + + def flush(self): + if self._created: + mode = 'a' + else: + mode = 'w' + + with open(self._path, mode, encoding='utf-8') as subjfile: + for text in self._buffer: + print(text, file=subjfile) + + self._buffer = [] + self._created = True + + def write(self, text): + self._buffer.append(text) + if len(self._buffer) >= self.BUFFER_SIZE: + self.flush() + + def read(self): + if not self._created: + # file was never created - we can simply return the buffer content + return "\n".join(self._buffer) + else: + with open(self._path, 'r', encoding='utf-8') as subjfile: + return subjfile.read() + "\n" + "\n".join(self._buffer) + + class TFIDFBackend(backend.AnnifBackend): """TF-IDF vector space similarity based backend for Annif""" name = "tfidf" @@ -26,18 +65,22 @@ class TFIDFBackend(backend.AnnifBackend): INDEX_FILE = 'tfidf-index' def _generate_subjects_from_documents(self, corpus, project): - subject_tokens = collections.defaultdict(list) - - for doc in corpus.documents: - tokens = project.analyzer.tokenize_words(doc.text) - for uri in doc.uris: - subject_id = project.subjects.by_uri(uri) - if subject_id is None: - continue - subject_tokens[subject_id].extend(tokens) - - return (" ".join(subject_tokens[sid]) - for sid in range(len(project.subjects))) + with tempfile.TemporaryDirectory() as tempdir: + subject_buffer = {} + for subject_id in range(len(project.subjects)): + subject_buffer[subject_id] = SubjectBuffer(tempdir, + subject_id) + + for doc in corpus.documents: + tokens = project.analyzer.tokenize_words(doc.text) + for uri in doc.uris: + subject_id = project.subjects.by_uri(uri) + if subject_id is None: + continue + subject_buffer[subject_id].write(" ".join(tokens)) + + for sid in range(len(project.subjects)): + yield subject_buffer[sid].read() def _initialize_vectorizer(self): if self._vectorizer is None: From 3238047cad61aae070a9d53741bbf18ecbe3fca6 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Mon, 7 Oct 2019 13:50:31 +0300 Subject: [PATCH 6/6] Cleanup unused imports --- annif/backend/tfidf.py | 1 - annif/corpus/skos.py | 2 +- annif/corpus/subject.py | 4 +--- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py index d952c37ee..c98bd77be 100644 --- a/annif/backend/tfidf.py +++ b/annif/backend/tfidf.py @@ -1,7 +1,6 @@ """Backend that returns most similar subjects based on similarity in sparse TF-IDF normalized bag-of-words vector space""" -import collections import os.path import tempfile import joblib diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py index 3087da91f..67c2a39e4 100644 --- a/annif/corpus/skos.py +++ b/annif/corpus/skos.py @@ -3,7 +3,7 @@ import rdflib import rdflib.util from rdflib.namespace import SKOS, RDF, OWL -from .subject import Subject, SubjectCorpus +from .types import Subject, SubjectCorpus class SubjectFileSKOS(SubjectCorpus): diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index fb23782f2..33e1c10ec 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -1,11 +1,9 @@ """Classes for supporting subject corpora expressed as directories or files""" -import glob -import os.path import annif.util import numpy as np from annif import logger -from .types import Subject, SubjectCorpus +from .types import Subject class SubjectFileTSV: