From 01d1c4367f589b79bb71fb9587353e7043948929 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 4 Oct 2019 16:33:17 +0300
Subject: [PATCH 1/6] Remove dead code: conversion from SubjectCorpus to
 DocumentCorpus was only used by a unit test, not real code

---
 annif/corpus/convert.py | 23 -----------------------
 annif/corpus/subject.py |  5 ++---
 tests/test_corpus.py    | 15 ---------------
 3 files changed, 2 insertions(+), 41 deletions(-)

diff --git a/annif/corpus/convert.py b/annif/corpus/convert.py
index e3a1fe77a..0648cef2d 100644
--- a/annif/corpus/convert.py
+++ b/annif/corpus/convert.py
@@ -84,26 +84,3 @@ def _generate_corpus_from_documents(self):
 
         from .subject import SubjectDirectory
         self._subject_corpus = SubjectDirectory(self._temp_directory.name)
-
-
-class SubjectToDocumentCorpusMixin(DocumentCorpus):
-    """Mixin class for enabling a SubjectCorpus to act as a DocumentCorpus"""
-
-    _document_uris = None
-    _document_labels = None
-
-    @property
-    def documents(self):
-        if self._document_uris is None:
-            self._generate_corpus_from_subjects()
-        for text, uris in self._document_uris.items():
-            labels = self._document_labels[text]
-            yield Document(text=text, uris=uris, labels=labels)
-
-    def _generate_corpus_from_subjects(self):
-        self._document_uris = collections.defaultdict(set)
-        self._document_labels = collections.defaultdict(set)
-        for subj in self.subjects:
-            for line in subj.text.splitlines():
-                self._document_uris[line].add(subj.uri)
-                self._document_labels[line].add(subj.label)
diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py
index 96225b9a8..82fb30e13 100644
--- a/annif/corpus/subject.py
+++ b/annif/corpus/subject.py
@@ -6,10 +6,9 @@
 import numpy as np
 from annif import logger
 from .types import Subject, SubjectCorpus
-from .convert import SubjectToDocumentCorpusMixin
 
 
-class SubjectDirectory(SubjectCorpus, SubjectToDocumentCorpusMixin):
+class SubjectDirectory(SubjectCorpus):
     """A subject corpus in the form of a directory with .txt files."""
 
     def __init__(self, path):
@@ -25,7 +24,7 @@ def subjects(self):
                 yield Subject(uri=uri, label=label, text=text)
 
 
-class SubjectFileTSV(SubjectCorpus, SubjectToDocumentCorpusMixin):
+class SubjectFileTSV(SubjectCorpus):
     """A subject corpus stored in a TSV file."""
 
     def __init__(self, path):
diff --git a/tests/test_corpus.py b/tests/test_corpus.py
index 1b2b81501..1006a9d8a 100644
--- a/tests/test_corpus.py
+++ b/tests/test_corpus.py
@@ -173,21 +173,6 @@ def test_subjdir(tmpdir):
     assert 'third' in subjects[2].text
 
 
-def test_subjdir_as_doccorpus(tmpdir):
-    tmpdir.join('subj1.txt').write("""http://example.org/subj1 subject one
-        first subject
-        this is the first thing we know about""")
-    tmpdir.join('subj2.txt').write("""http://example.org/subj2 subject two
-        second subject
-        this is the second thing we know about""")
-    tmpdir.join('subj3.txt').write("""http://example.org/subj3 subject three
-        third subject
-        this is the third thing we know about""")
-    subjdir = annif.corpus.SubjectDirectory(str(tmpdir))
-    documents = list(subjdir.documents)
-    assert len(documents) == 6
-
-
 def test_subject_by_uri(subject_index):
     subj_id = subject_index.by_uri('http://www.yso.fi/onto/yso/p7141')
     assert subject_index[subj_id][1] == 'sinetit'

From 987c6ddcd2bcb07a5dadc5687bd2da3cb64fc99b Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 4 Oct 2019 16:57:43 +0300
Subject: [PATCH 2/6] Move the conversion from document to subject corpus
 entirely inside TFIDFBackend since nothing else uses it

---
 annif/backend/tfidf.py      | 73 ++++++++++++++++++++++++++++++-
 annif/corpus/combine.py     | 17 +-------
 annif/corpus/convert.py     | 86 -------------------------------------
 annif/corpus/document.py    |  7 ++-
 annif/corpus/subject.py     |  6 +--
 tests/conftest.py           |  4 +-
 tests/test_backend_tfidf.py |  6 +--
 tests/test_corpus.py        |  4 +-
 tests/test_suggestion.py    | 16 +++----
 9 files changed, 92 insertions(+), 127 deletions(-)
 delete mode 100644 annif/corpus/convert.py

diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py
index ec5d6e6d0..d4afc68fe 100644
--- a/annif/backend/tfidf.py
+++ b/annif/backend/tfidf.py
@@ -2,16 +2,52 @@
 TF-IDF normalized bag-of-words vector space"""
 
 import os.path
+import tempfile
 import joblib
 import gensim.similarities
 from gensim.matutils import Sparse2Corpus
 from sklearn.feature_extraction.text import TfidfVectorizer
 import annif.util
+from annif.corpus.subject import SubjectDirectory
 from annif.suggestion import VectorSuggestionResult
 from annif.exception import NotInitializedException, NotSupportedException
 from . import backend
 
 
+class SubjectWriter:
+    """Writes a single subject file into a SubjectDirectory, performing
+    buffering to limit the number of I/O operations."""
+
+    _buffer = None
+
+    BUFFER_SIZE = 100
+
+    def __init__(self, path, uri, label):
+        self._path = path
+        self._buffer = ["{} {}".format(uri, label)]
+        self._created = False
+
+    def _flush(self):
+        if self._created:
+            mode = 'a'
+        else:
+            mode = 'w'
+
+        with open(self._path, mode, encoding='utf-8') as subjfile:
+            for text in self._buffer:
+                print(text, file=subjfile)
+        self._buffer = []
+        self._created = True
+
+    def write(self, text):
+        self._buffer.append(text)
+        if len(self._buffer) >= self.BUFFER_SIZE:
+            self._flush()
+
+    def close(self):
+        self._flush()
+
+
 class TFIDFBackend(backend.AnnifBackend):
     """TF-IDF vector space similarity based backend for Annif"""
     name = "tfidf"
@@ -24,6 +60,40 @@ class TFIDFBackend(backend.AnnifBackend):
     VECTORIZER_FILE = 'vectorizer'
     INDEX_FILE = 'tfidf-index'
 
+    _temp_directory = None
+    _subject_writer = None
+
+    def _subject_filename(self, subject_id):
+        filename = '{:08d}.txt'.format(subject_id)
+        return os.path.join(self._temp_directory.name, filename)
+
+    def _create_subject(self, subject_id, uri, label):
+        filename = self._subject_filename(subject_id)
+        self._subject_writer[subject_id] = SubjectWriter(filename, uri, label)
+
+    def _add_text_to_subject(self, subject_id, text):
+        self._subject_writer[subject_id].write(text)
+
+    def _generate_subjects_from_documents(self, corpus, project):
+        self._temp_directory = tempfile.TemporaryDirectory()
+        self._subject_writer = {}
+
+        for subject_id, subject_info in enumerate(project.subjects):
+            uri, label = subject_info
+            self._create_subject(subject_id, uri, label)
+
+        for doc in corpus.documents:
+            for uri in doc.uris:
+                subject_id = project.subjects.by_uri(uri)
+                if subject_id is None:
+                    continue
+                self._add_text_to_subject(subject_id, doc.text)
+
+        for subject_id, _ in enumerate(project.subjects):
+            self._subject_writer[subject_id].close()
+
+        return SubjectDirectory(self._temp_directory.name)
+
     def _initialize_vectorizer(self):
         if self._vectorizer is None:
             path = os.path.join(self.datadir, self.VECTORIZER_FILE)
@@ -66,7 +136,8 @@ def train(self, corpus, project):
             raise NotSupportedException(
                 'Cannot train tfidf project with no documents')
         self.info('transforming subject corpus')
-        subjects = corpus.subjects
+        subjects = self._generate_subjects_from_documents(
+            corpus, project).subjects
         self.info('creating vectorizer')
         self._vectorizer = TfidfVectorizer(
             tokenizer=project.analyzer.tokenize_words)
diff --git a/annif/corpus/combine.py b/annif/corpus/combine.py
index bb9109acb..119044d40 100644
--- a/annif/corpus/combine.py
+++ b/annif/corpus/combine.py
@@ -1,29 +1,16 @@
 """Class for combining multiple corpora so they behave like a single corpus"""
 
 import itertools
-from .types import DocumentCorpus, SubjectCorpus, Subject
+from .types import DocumentCorpus
 
 
-class CombinedCorpus(SubjectCorpus, DocumentCorpus):
+class CombinedCorpus(DocumentCorpus):
     """Class for combining multiple corpora so they behave like a single
     corpus"""
 
     def __init__(self, corpora):
         self._corpora = corpora
 
-    @property
-    def subjects(self):
-        for source_subjects in zip(
-                *[corpus.subjects for corpus in self._corpora]):
-            uri = None
-            label = None
-            texts = []
-            for subject in source_subjects:
-                uri = subject.uri
-                label = subject.label
-                texts.append(subject.text)
-            yield Subject(uri=uri, label=label, text=" ".join(texts))
-
     @property
     def documents(self):
         return itertools.chain.from_iterable(
diff --git a/annif/corpus/convert.py b/annif/corpus/convert.py
deleted file mode 100644
index 0648cef2d..000000000
--- a/annif/corpus/convert.py
+++ /dev/null
@@ -1,86 +0,0 @@
-"""Mixin classes for converting between SubjectCorpus and DocumentCorpus"""
-
-import collections
-import os.path
-import tempfile
-from .types import Document, DocumentCorpus, SubjectCorpus
-
-
-class SubjectWriter:
-    """Writes a single subject file into a SubjectDirectory, performing
-    buffering to limit the number of I/O operations."""
-
-    _buffer = None
-
-    BUFFER_SIZE = 100
-
-    def __init__(self, path, uri, label):
-        self._path = path
-        self._buffer = ["{} {}".format(uri, label)]
-        self._created = False
-
-    def _flush(self):
-        if self._created:
-            mode = 'a'
-        else:
-            mode = 'w'
-
-        with open(self._path, mode, encoding='utf-8') as subjfile:
-            for text in self._buffer:
-                print(text, file=subjfile)
-        self._buffer = []
-        self._created = True
-
-    def write(self, text):
-        self._buffer.append(text)
-        if len(self._buffer) >= self.BUFFER_SIZE:
-            self._flush()
-
-    def close(self):
-        self._flush()
-
-
-class DocumentToSubjectCorpusMixin(SubjectCorpus):
-    """Mixin class for enabling a DocumentCorpus to act as a SubjectCorpus"""
-
-    _subject_corpus = None
-    _temp_directory = None
-    _subject_writer = None
-
-    @property
-    def subjects(self):
-        if self._subject_corpus is None:
-            self._generate_corpus_from_documents()
-        return self._subject_corpus.subjects
-
-    def _subject_filename(self, subject_id):
-        filename = '{:08d}.txt'.format(subject_id)
-        return os.path.join(self._temp_directory.name, filename)
-
-    def _create_subject(self, subject_id, uri, label):
-        filename = self._subject_filename(subject_id)
-        self._subject_writer[subject_id] = SubjectWriter(filename, uri, label)
-
-    def _add_text_to_subject(self, subject_id, text):
-        self._subject_writer[subject_id].write(text)
-
-    def _generate_corpus_from_documents(self):
-        self._temp_directory = tempfile.TemporaryDirectory()
-        self._subject_writer = {}
-
-        for subject_id, subject_info in enumerate(self._subject_index):
-            uri, label = subject_info
-            self._create_subject(subject_id, uri, label)
-
-        for doc in self.documents:
-            for uri in doc.uris:
-                subject_id = self._subject_index.by_uri(uri)
-                if subject_id is None:
-                    continue
-                self._add_text_to_subject(subject_id, doc.text)
-
-        for subject_id, _ in enumerate(self._subject_index):
-            self._subject_writer[subject_id].close()
-
-        from .subject import SubjectDirectory
-        self._subject_corpus = SubjectDirectory(self._temp_directory.name)
diff --git a/annif/corpus/document.py b/annif/corpus/document.py
index 773dfc623..3ebee44a1 100644
--- a/annif/corpus/document.py
+++ b/annif/corpus/document.py
@@ -6,13 +6,12 @@
 import gzip
 import annif.util
 from .types import DocumentCorpus
-from .convert import DocumentToSubjectCorpusMixin
 from .subject import SubjectSet
 
 logger = annif.logger
 
 
-class DocumentDirectory(DocumentCorpus, DocumentToSubjectCorpusMixin):
+class DocumentDirectory(DocumentCorpus):
     """A directory of files as a full text document corpus"""
 
     def __init__(self, path, require_subjects=False):
@@ -49,7 +48,7 @@ def documents(self):
                                         labels=subjects.subject_labels)
 
 
-class DocumentFile(DocumentCorpus, DocumentToSubjectCorpusMixin):
+class DocumentFile(DocumentCorpus):
     """A TSV file as a corpus of documents with subjects"""
 
     def __init__(self, path):
@@ -78,7 +77,7 @@ def _parse_tsv_line(self, line):
                            line.rstrip())
 
 
-class DocumentList(DocumentCorpus, DocumentToSubjectCorpusMixin):
+class DocumentList(DocumentCorpus):
     """A document corpus based on a list of other iterable of Document
     objects"""
 
diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py
index 82fb30e13..c1a1fa853 100644
--- a/annif/corpus/subject.py
+++ b/annif/corpus/subject.py
@@ -8,7 +8,7 @@
 from .types import Subject, SubjectCorpus
 
 
-class SubjectDirectory(SubjectCorpus):
+class SubjectDirectory:
     """A subject corpus in the form of a directory with .txt files."""
 
     def __init__(self, path):
@@ -24,8 +24,8 @@ def subjects(self):
                 yield Subject(uri=uri, label=label, text=text)
 
 
-class SubjectFileTSV(SubjectCorpus):
-    """A subject corpus stored in a TSV file."""
+class SubjectFileTSV:
+    """A subject vocabulary stored in a TSV file."""
 
     def __init__(self, path):
         self.path = path
diff --git a/tests/conftest.py b/tests/conftest.py
index d10c14d1e..75babdbd9 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -74,8 +74,8 @@ def document_corpus(subject_index):
 
 
 @pytest.fixture(scope='module')
-def project(document_corpus):
+def project(document_corpus, subject_index):
     proj = unittest.mock.Mock()
     proj.analyzer = annif.analyzer.get_analyzer('snowball(finnish)')
-    proj.subjects = annif.corpus.SubjectIndex(document_corpus)
+    proj.subjects = subject_index
     return proj
diff --git a/tests/test_backend_tfidf.py b/tests/test_backend_tfidf.py
index 238909c99..66d008ea7 100644
--- a/tests/test_backend_tfidf.py
+++ b/tests/test_backend_tfidf.py
@@ -9,12 +9,10 @@
 
 
 @pytest.fixture(scope='module')
-def project(document_corpus):
+def project(document_corpus, subject_index):
     proj = unittest.mock.Mock()
     proj.analyzer = annif.analyzer.get_analyzer('snowball(finnish)')
-    proj.subjects = annif.corpus.SubjectIndex(document_corpus)
-    proj.vectorizer = TfidfVectorizer(tokenizer=proj.analyzer.tokenize_words)
-    proj.vectorizer.fit([subj.text for subj in document_corpus.subjects])
+    proj.subjects = subject_index
     return proj
 
 
diff --git a/tests/test_corpus.py b/tests/test_corpus.py
index 1006a9d8a..fe67eccf8 100644
--- a/tests/test_corpus.py
+++ b/tests/test_corpus.py
@@ -237,7 +237,7 @@ def test_docfile_is_empty(tmpdir):
     assert docs.is_empty()
 
 
-def test_combinedcorpus(tmpdir, subject_index):
+def test_combinedcorpus(tmpdir):
     docfile = tmpdir.join('documents.tsv')
     docfile.write("""Läntinen\t<http://www.yso.fi/onto/yso/p2557>
         Oulunlinnan\t<http://www.yso.fi/onto/yso/p7346>
@@ -247,7 +247,5 @@ def test_combinedcorpus(tmpdir, subject_index):
     corpus2 = annif.corpus.DocumentFile(str(docfile))
 
     combined = annif.corpus.CombinedCorpus([corpus1, corpus2])
-    combined.set_subject_index(subject_index)
 
     assert len(list(combined.documents)) == 6
-    assert len(list(combined.subjects)) == len(list(corpus1.subjects))
diff --git a/tests/test_suggestion.py b/tests/test_suggestion.py
index 460cde70b..50b88cf1a 100644
--- a/tests/test_suggestion.py
+++ b/tests/test_suggestion.py
@@ -51,8 +51,7 @@ def test_lazy_suggestion_result(subject_index):
     assert lar._object is not None
 
 
-def test_list_suggestions_vector(document_corpus):
-    subjects = SubjectIndex(document_corpus)
+def test_list_suggestions_vector(document_corpus, subject_index):
     suggestions = ListSuggestionResult(
         [
             SubjectSuggestion(
@@ -63,26 +62,25 @@ def test_list_suggestions_vector(document_corpus):
                 uri='http://www.yso.fi/onto/yso/p6479',
                 label='viikingit',
                 score=0.5)],
-        subjects)
+        subject_index)
     assert isinstance(suggestions.vector, np.ndarray)
-    assert len(suggestions.vector) == len(subjects)
+    assert len(suggestions.vector) == len(subject_index)
     assert suggestions.vector.sum() == 1.5
     for subject_id, score in enumerate(suggestions.vector):
-        if subjects[subject_id][1] == 'sinetit':
+        if subject_index[subject_id][1] == 'sinetit':
             assert score == 1.0
-        elif subjects[subject_id][1] == 'viikingit':
+        elif subject_index[subject_id][1] == 'viikingit':
             assert score == 0.5
         else:
             assert score == 0.0
 
 
-def test_list_suggestions_vector_notfound(document_corpus):
-    subjects = SubjectIndex(document_corpus)
+def test_list_suggestions_vector_notfound(document_corpus, subject_index):
     suggestions = ListSuggestionResult(
         [
             SubjectSuggestion(
                 uri='http://example.com/notfound',
                 label='not found',
                 score=1.0)],
-        subjects)
+        subject_index)
     assert suggestions.vector.sum() == 0

From 2cb42da576cc53e82c5ce17529601a3dd6daa5d8 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 4 Oct 2019 17:21:36 +0300
Subject: [PATCH 3/6] Perform document to subject conversion in memory; remove
 stale SubjectDirectory class

---
 annif/backend/tfidf.py   | 72 +++++-----------------------------------
 annif/corpus/__init__.py |  6 ++--
 annif/corpus/subject.py  | 16 ---------
 tests/test_corpus.py     | 25 --------------
 4 files changed, 11 insertions(+), 108 deletions(-)

diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py
index d4afc68fe..b64dd6e80 100644
--- a/annif/backend/tfidf.py
+++ b/annif/backend/tfidf.py
@@ -1,53 +1,18 @@
 """Backend that returns most similar subjects based on similarity in sparse
 TF-IDF normalized bag-of-words vector space"""
 
+import collections
 import os.path
-import tempfile
 import joblib
 import gensim.similarities
 from gensim.matutils import Sparse2Corpus
 from sklearn.feature_extraction.text import TfidfVectorizer
 import annif.util
-from annif.corpus.subject import SubjectDirectory
 from annif.suggestion import VectorSuggestionResult
 from annif.exception import NotInitializedException, NotSupportedException
 from . import backend
 
 
-class SubjectWriter:
-    """Writes a single subject file into a SubjectDirectory, performing
-    buffering to limit the number of I/O operations."""
-
-    _buffer = None
-
-    BUFFER_SIZE = 100
-
-    def __init__(self, path, uri, label):
-        self._path = path
-        self._buffer = ["{} {}".format(uri, label)]
-        self._created = False
-
-    def _flush(self):
-        if self._created:
-            mode = 'a'
-        else:
-            mode = 'w'
-
-        with open(self._path, mode, encoding='utf-8') as subjfile:
-            for text in self._buffer:
-                print(text, file=subjfile)
-        self._buffer = []
-        self._created = True
-
-    def write(self, text):
-        self._buffer.append(text)
-        if len(self._buffer) >= self.BUFFER_SIZE:
-            self._flush()
-
-    def close(self):
-        self._flush()
-
-
 class TFIDFBackend(backend.AnnifBackend):
     """TF-IDF vector space similarity based backend for Annif"""
     name = "tfidf"
@@ -60,39 +25,20 @@ class TFIDFBackend(backend.AnnifBackend):
     VECTORIZER_FILE = 'vectorizer'
     INDEX_FILE = 'tfidf-index'
 
-    _temp_directory = None
-    _subject_writer = None
-
-    def _subject_filename(self, subject_id):
-        filename = '{:08d}.txt'.format(subject_id)
-        return os.path.join(self._temp_directory.name, filename)
-
-    def _create_subject(self, subject_id, uri, label):
-        filename = self._subject_filename(subject_id)
-        self._subject_writer[subject_id] = SubjectWriter(filename, uri, label)
-
-    def _add_text_to_subject(self, subject_id, text):
-        self._subject_writer[subject_id].write(text)
-
     def _generate_subjects_from_documents(self, corpus, project):
-        self._temp_directory = tempfile.TemporaryDirectory()
-        self._subject_writer = {}
-
-        for subject_id, subject_info in enumerate(project.subjects):
-            uri, label = subject_info
-            self._create_subject(subject_id, uri, label)
+        subject_text = collections.defaultdict(list)
 
         for doc in corpus.documents:
             for uri in doc.uris:
                 subject_id = project.subjects.by_uri(uri)
                 if subject_id is None:
                     continue
-                self._add_text_to_subject(subject_id, doc.text)
+                subject_text[subject_id].append(doc.text)
 
-        for subject_id, _ in enumerate(project.subjects):
-            self._subject_writer[subject_id].close()
+        for subject_id in subject_text:
+            subject_text[subject_id] = '\n'.join(subject_text[subject_id])
 
-        return SubjectDirectory(self._temp_directory.name)
+        return (subject_text[sid] for sid in sorted(subject_text.keys()))
 
     def _initialize_vectorizer(self):
         if self._vectorizer is None:
@@ -136,13 +82,11 @@ def train(self, corpus, project):
             raise NotSupportedException(
                 'Cannot train tfidf project with no documents')
         self.info('transforming subject corpus')
-        subjects = self._generate_subjects_from_documents(
-            corpus, project).subjects
+        subjects = self._generate_subjects_from_documents(corpus, project)
         self.info('creating vectorizer')
         self._vectorizer = TfidfVectorizer(
             tokenizer=project.analyzer.tokenize_words)
-        veccorpus = self._vectorizer.fit_transform(
-            (subj.text for subj in subjects))
+        veccorpus = self._vectorizer.fit_transform(subjects)
         annif.util.atomic_save(
             self._vectorizer,
             self.datadir,
diff --git a/annif/corpus/__init__.py b/annif/corpus/__init__.py
index 7d87d565a..b5dad8495 100644
--- a/annif/corpus/__init__.py
+++ b/annif/corpus/__init__.py
@@ -2,12 +2,12 @@
 
 
 from .document import DocumentDirectory, DocumentFile, DocumentList
-from .subject import Subject, SubjectDirectory, SubjectFileTSV
+from .subject import Subject, SubjectFileTSV
 from .subject import SubjectIndex, SubjectSet
 from .skos import SubjectFileSKOS
 from .types import Document
 from .combine import CombinedCorpus
 
 __all__ = [DocumentDirectory, DocumentFile, DocumentList, Subject,
-           SubjectDirectory, SubjectFileTSV, SubjectIndex, SubjectSet,
-           SubjectFileSKOS, Document, CombinedCorpus]
+           SubjectFileTSV, SubjectIndex, SubjectSet, SubjectFileSKOS,
+           Document, CombinedCorpus]
diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py
index c1a1fa853..fb23782f2 100644
--- a/annif/corpus/subject.py
+++ b/annif/corpus/subject.py
@@ -8,22 +8,6 @@
 from .types import Subject, SubjectCorpus
 
 
-class SubjectDirectory:
-    """A subject corpus in the form of a directory with .txt files."""
-
-    def __init__(self, path):
-        self.path = path
-        self._filenames = sorted(glob.glob(os.path.join(path, '*.txt')))
-
-    @property
-    def subjects(self):
-        for filename in self._filenames:
-            with open(filename, encoding='utf-8') as subjfile:
-                uri, label = subjfile.readline().strip().split(' ', 1)
-                text = ' '.join(subjfile.readlines())
-                yield Subject(uri=uri, label=label, text=text)
-
-
 class SubjectFileTSV:
     """A subject vocabulary stored in a TSV file."""
 
diff --git a/tests/test_corpus.py b/tests/test_corpus.py
index fe67eccf8..a93c399e1 100644
--- a/tests/test_corpus.py
+++ b/tests/test_corpus.py
@@ -148,31 +148,6 @@ def test_docdir_key_as_doccorpus(tmpdir, subject_index):
     assert docs[1].uris == {'http://www.yso.fi/onto/yso/p13027'}
 
 
-def test_subjdir(tmpdir):
-    tmpdir.join('subj1.txt').write("""http://example.org/subj1 subject one
-        first subject
-        this is the first thing we know about""")
-    tmpdir.join('subj2.txt').write("""http://example.org/subj2 subject two
-        second subject
-        this is the second thing we know about""")
-    tmpdir.join('subj3.txt').write("""http://example.org/subj3 subject three
-        third subject
-        this is the third thing we know about""")
-
-    subjdir = annif.corpus.SubjectDirectory(str(tmpdir))
-    subjects = sorted(list(subjdir.subjects), key=lambda subj: subj.uri)
-    assert len(subjects) == 3
-    assert subjects[0].uri == 'http://example.org/subj1'
-    assert subjects[0].label == 'subject one'
-    assert 'first' in subjects[0].text
-    assert subjects[1].uri == 'http://example.org/subj2'
-    assert subjects[1].label == 'subject two'
-    assert 'second' in subjects[1].text
-    assert subjects[2].uri == 'http://example.org/subj3'
-    assert subjects[2].label == 'subject three'
-    assert 'third' in subjects[2].text
-
-
 def test_subject_by_uri(subject_index):
     subj_id = subject_index.by_uri('http://www.yso.fi/onto/yso/p7141')
     assert subject_index[subj_id][1] == 'sinetit'

From 3cc830b9aa04c386ec26e56841874221183e5d4f Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 4 Oct 2019 17:46:22 +0300
Subject: [PATCH 4/6] Tokenize text during conversion to subject corpus instead
 of within TfidfVectorizer, to avoid tokenizing the same text many times if
 it has multiple subjects

---
 annif/backend/tfidf.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py
index b64dd6e80..39d21636c 100644
--- a/annif/backend/tfidf.py
+++ b/annif/backend/tfidf.py
@@ -26,19 +26,18 @@ class TFIDFBackend(backend.AnnifBackend):
     INDEX_FILE = 'tfidf-index'
 
     def _generate_subjects_from_documents(self, corpus, project):
-        subject_text = collections.defaultdict(list)
+        subject_tokens = collections.defaultdict(list)
 
         for doc in corpus.documents:
+            tokens = project.analyzer.tokenize_words(doc.text)
             for uri in doc.uris:
                 subject_id = project.subjects.by_uri(uri)
                 if subject_id is None:
                     continue
-                subject_text[subject_id].append(doc.text)
+                subject_tokens[subject_id].extend(tokens)
 
-        for subject_id in subject_text:
-            subject_text[subject_id] = '\n'.join(subject_text[subject_id])
-
-        return (subject_text[sid] for sid in sorted(subject_text.keys()))
+        return (" ".join(subject_tokens[sid])
+                for sid in range(len(project.subjects)))
 
     def _initialize_vectorizer(self):
         if self._vectorizer is None:
@@ -84,8 +83,7 @@ def train(self, corpus, project):
         self.info('transforming subject corpus')
         subjects = self._generate_subjects_from_documents(corpus, project)
         self.info('creating vectorizer')
-        self._vectorizer = TfidfVectorizer(
-            tokenizer=project.analyzer.tokenize_words)
+        self._vectorizer = TfidfVectorizer()
         veccorpus = self._vectorizer.fit_transform(subjects)
         annif.util.atomic_save(
             self._vectorizer,
@@ -97,7 +95,8 @@ def train(self, corpus, project):
     def _suggest(self, text, project, params):
         self.debug('Suggesting subjects for text "{}..." (len={})'.format(
             text[:20], len(text)))
-        vectors = self._vectorizer.transform([text])
+        tokens = project.analyzer.tokenize_words(text)
+        vectors = self._vectorizer.transform([" ".join(tokens)])
         docsim = self._index[vectors[0]]
         fullresult = VectorSuggestionResult(docsim, project.subjects)
         return fullresult.filter(limit=int(self.params['limit']))

From fe4bdc55ccab75fe73932777d31e8c01e5047d11 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Mon, 7 Oct 2019 13:23:07 +0300
Subject: [PATCH 5/6] Spool large subject texts into files instead of keeping
 everything in memory

---
 annif/backend/tfidf.py | 67 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 55 insertions(+), 12 deletions(-)

diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py
index 39d21636c..d952c37ee 100644
--- a/annif/backend/tfidf.py
+++ b/annif/backend/tfidf.py
@@ -3,6 +3,7 @@
 
 import collections
 import os.path
+import tempfile
 import joblib
 import gensim.similarities
 from gensim.matutils import Sparse2Corpus
@@ -13,6 +14,44 @@
 from . import backend
 
 
+class SubjectBuffer:
+    """A file-backed buffer to store and retrieve subject text."""
+
+    BUFFER_SIZE = 100
+
+    def __init__(self, tempdir, subject_id):
+        filename = '{:08d}.txt'.format(subject_id)
+        self._path = os.path.join(tempdir, filename)
+        self._buffer = []
+        self._created = False
+
+    def flush(self):
+        if self._created:
+            mode = 'a'
+        else:
+            mode = 'w'
+
+        with open(self._path, mode, encoding='utf-8') as subjfile:
+            for text in self._buffer:
+                print(text, file=subjfile)
+
+        self._buffer = []
+        self._created = True
+
+    def write(self, text):
+        self._buffer.append(text)
+        if len(self._buffer) >= self.BUFFER_SIZE:
+            self.flush()
+
+    def read(self):
+        if not self._created:
+            # file was never created - we can simply return the buffer content
+            return "\n".join(self._buffer)
+        else:
+            with open(self._path, 'r', encoding='utf-8') as subjfile:
+                return subjfile.read() + "\n" + "\n".join(self._buffer)
+
+
 class TFIDFBackend(backend.AnnifBackend):
     """TF-IDF vector space similarity based backend for Annif"""
     name = "tfidf"
@@ -26,18 +65,22 @@ class TFIDFBackend(backend.AnnifBackend):
     INDEX_FILE = 'tfidf-index'
 
     def _generate_subjects_from_documents(self, corpus, project):
-        subject_tokens = collections.defaultdict(list)
-
-        for doc in corpus.documents:
-            tokens = project.analyzer.tokenize_words(doc.text)
-            for uri in doc.uris:
-                subject_id = project.subjects.by_uri(uri)
-                if subject_id is None:
-                    continue
-                subject_tokens[subject_id].extend(tokens)
-
-        return (" ".join(subject_tokens[sid])
-                for sid in range(len(project.subjects)))
+        with tempfile.TemporaryDirectory() as tempdir:
+            subject_buffer = {}
+            for subject_id in range(len(project.subjects)):
+                subject_buffer[subject_id] = SubjectBuffer(tempdir,
+                                                           subject_id)
+
+            for doc in corpus.documents:
+                tokens = project.analyzer.tokenize_words(doc.text)
+                for uri in doc.uris:
+                    subject_id = project.subjects.by_uri(uri)
+                    if subject_id is None:
+                        continue
+                    subject_buffer[subject_id].write(" ".join(tokens))
+
+            for sid in range(len(project.subjects)):
+                yield subject_buffer[sid].read()
 
     def _initialize_vectorizer(self):
         if self._vectorizer is None:

From 3238047cad61aae070a9d53741bbf18ecbe3fca6 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Mon, 7 Oct 2019 13:50:31 +0300
Subject: [PATCH 6/6] Clean up unused imports

---
 annif/backend/tfidf.py  | 1 -
 annif/corpus/skos.py    | 2 +-
 annif/corpus/subject.py | 4 +---
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py
index d952c37ee..c98bd77be 100644
--- a/annif/backend/tfidf.py
+++ b/annif/backend/tfidf.py
@@ -1,7 +1,6 @@
 """Backend that returns most similar subjects based on similarity in sparse
 TF-IDF normalized bag-of-words vector space"""
 
-import collections
 import os.path
 import tempfile
 import joblib
diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py
index 3087da91f..67c2a39e4 100644
--- a/annif/corpus/skos.py
+++ b/annif/corpus/skos.py
@@ -3,7 +3,7 @@
 import rdflib
 import rdflib.util
 from rdflib.namespace import SKOS, RDF, OWL
-from .subject import Subject, SubjectCorpus
+from .types import Subject, SubjectCorpus
 
 
 class SubjectFileSKOS(SubjectCorpus):
diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py
index fb23782f2..33e1c10ec 100644
--- a/annif/corpus/subject.py
+++ b/annif/corpus/subject.py
@@ -1,11 +1,9 @@
 """Classes for supporting subject corpora expressed as directories or files"""
 
-import glob
-import os.path
 import annif.util
 import numpy as np
 from annif import logger
-from .types import Subject, SubjectCorpus
+from .types import Subject
 
 
 class SubjectFileTSV: