From a60c62f064749c44082d6e2dfb2e7917ce64dc52 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 6 Feb 2019 16:31:26 +0200 Subject: [PATCH 01/12] Initial support for online learning in vw_multi backend. Part of #225 and #230 --- annif/backend/backend.py | 4 ++++ annif/backend/vw_multi.py | 14 ++++++++++++-- tests/test_backend_vw_multi.py | 12 +++++++++++- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/annif/backend/backend.py b/annif/backend/backend.py index 60fbb966b..b959275fe 100644 --- a/annif/backend/backend.py +++ b/annif/backend/backend.py @@ -33,6 +33,10 @@ def train(self, corpus, project): """train the model on the given document or subject corpus""" pass # default is to do nothing, subclasses may override + def learn(self, corpus, project): + """further train the model on the given document or subject corpus""" + pass # default is to do nothing, subclasses may override + def initialize(self): """This method can be overridden by backends. It should cause the backend to pre-load all data it needs during operation.""" diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py index 1f6d47961..8b66d137f 100644 --- a/annif/backend/vw_multi.py +++ b/annif/backend/vw_multi.py @@ -129,8 +129,7 @@ def _inputs_to_exampletext(self, project, text): return ' '.join(['|{} {}'.format(namespace, featurestr) for namespace, featurestr in namespaces.items()]) - def _create_train_file(self, corpus, project): - self.info('creating VW train file') + def _create_examples(self, corpus, project): examples = [] for doc in corpus.documents: text = self._inputs_to_exampletext(project, doc.text) @@ -138,6 +137,11 @@ def _create_train_file(self, corpus, project): continue examples.extend(self._format_examples(project, text, doc.uris)) random.shuffle(examples) + return examples + + def _create_train_file(self, corpus, project): + self.info('creating VW train file') + examples = self._create_examples(corpus, project) annif.util.atomic_save(examples, self._get_datadir(), self.TRAIN_FILE, @@ -184,6 +188,12 @@ def train(self, corpus, project): self._create_train_file(corpus, project) self._create_model(project) + def learn(self, corpus, project): + for example in self._create_examples(corpus, project): + self._model.learn(example) + modelpath = os.path.join(self._get_datadir(), self.MODEL_FILE) + self._model.save(modelpath) + def _convert_result(self, result, project): if self.algorithm == 'multilabel_oaa': # result is a list of subject IDs - need to vectorize diff --git a/tests/test_backend_vw_multi.py b/tests/test_backend_vw_multi.py index 359fcbdf7..5812b36ac 100644 --- a/tests/test_backend_vw_multi.py +++ b/tests/test_backend_vw_multi.py @@ -29,7 +29,7 @@ def test_vw_multi_analyze_no_model(datadir, project): results = vw.analyze("example text", project) -def test_vw_multi_train(datadir, document_corpus, project): +def test_vw_multi_train_and_learn(datadir, document_corpus, project): vw_type = annif.backend.get_backend('vw_multi') vw = vw_type( backend_id='vw_multi', @@ -44,6 +44,16 @@ def test_vw_multi_train(datadir, document_corpus, project): assert datadir.join('vw-model').exists() assert datadir.join('vw-model').size() > 0 + # test online learning + modelfile = datadir.join('vw-model') + + old_size = modelfile.size() + old_mtime = modelfile.mtime() + + vw.learn(document_corpus, project) + + assert modelfile.size() != old_size or modelfile.mtime() != old_mtime + def test_vw_multi_train_from_project(app, datadir, document_corpus, project): vw_type = 
annif.backend.get_backend('vw_multi') From 84c959946367264535c97be9a3d6e5c14a07cfeb Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 8 Feb 2019 16:50:01 +0200 Subject: [PATCH 02/12] Add learn method to AnnifProject, with test --- annif/project.py | 6 ++++++ tests/test_project.py | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/annif/project.py b/annif/project.py index 893e37932..eae3e1be6 100644 --- a/annif/project.py +++ b/annif/project.py @@ -202,6 +202,12 @@ def train(self, corpus): self._create_vectorizer(corpus) self.backend.train(corpus, project=self) + def learn(self, corpus): + """further train the project using documents from a metadata source""" + + corpus.set_subject_index(self.subjects) + self.backend.train(corpus, project=self) + def dump(self): """return this project as a dict""" return {'project_id': self.project_id, diff --git a/tests/test_project.py b/tests/test_project.py index 08c473d84..5e6dfe531 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -92,6 +92,15 @@ def test_project_train_tfidf(app, document_corpus, testdatadir): assert testdatadir.join('projects/tfidf-fi/tfidf-index').size() > 0 +def test_project_learn_tfidf(app, document_corpus, testdatadir): + with app.app_context(): + project = annif.project.get_project('tfidf-fi') + + project.learn(document_corpus) + # Should assert that the index file changed, but this is not really + # implemented in the tfidf backend yet + + def test_project_load_vocabulary_fasttext(app, vocabulary, testdatadir): pytest.importorskip("annif.backend.fasttext") with app.app_context(): From c8266cd697b1e5a71ff1625f4b37e7943d716294 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 8 Feb 2019 16:52:59 +0200 Subject: [PATCH 03/12] Add learn CLI command --- annif/cli.py | 13 +++++++++++++ tests/test_cli.py | 11 +++++++++++ 2 files changed, 24 insertions(+) diff --git a/annif/cli.py b/annif/cli.py index 3a590904b..d97775a57 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -134,6 +134,19 @@ def run_train(project_id, paths): proj.train(documents) +@cli.command('learn') +@click_log.simple_verbosity_option(logger) +@click.argument('project_id') +@click.argument('paths', type=click.Path(), nargs=-1) +def run_learn(project_id, paths): + """ + Further train an existing project on a collection of documents. + """ + proj = get_project(project_id) + documents = open_documents(paths) + proj.learn(documents) + + @cli.command('analyze') @click_log.simple_verbosity_option(logger) @click.argument('project_id') diff --git a/tests/test_cli.py b/tests/test_cli.py index 5385bb9b4..e7db6d3f7 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -126,6 +126,17 @@ def test_train_multiple(testdatadir): assert testdatadir.join('projects/tfidf-fi/tfidf-index').size() > 0 +def test_learn(testdatadir): + docfile = os.path.join( + os.path.dirname(__file__), + 'corpora', + 'archaeology', + 'documents.tsv') + result = runner.invoke(annif.cli.cli, ['learn', 'tfidf-fi', docfile]) + assert not result.exception + assert result.exit_code == 0 + + def test_analyze(): result = runner.invoke( annif.cli.cli, From 5b9d7df451343c32c33151dabc36680701e4f61e Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 8 Feb 2019 17:26:57 +0200 Subject: [PATCH 04/12] Define REST API learn method. 
Implementation still missing conversion of input --- annif/rest.py | 18 ++++++++++++ annif/swagger/annif.yaml | 63 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 76 insertions(+), 5 deletions(-) diff --git a/annif/rest.py b/annif/rest.py index ce13cea38..da8904942 100644 --- a/annif/rest.py +++ b/annif/rest.py @@ -64,3 +64,21 @@ def analyze(project_id, text, limit, threshold): return server_error(err) hits = hit_filter(result) return {'results': [hit._asdict() for hit in hits]} + + +def learn(project_id, documents): + """learn from documents and return an empty 204 response if succesful""" + + try: + project = annif.project.get_project( + project_id, min_access=Access.hidden) + except ValueError: + return project_not_found_error(project_id) + +# TODO should convert the documents to a corpus object +# try: +# project.learn(documents) +# except AnnifException as err: +# return server_error(err) + + return None, 204 diff --git a/annif/swagger/annif.yaml b/annif/swagger/annif.yaml index a53cb9d83..4738c4a40 100644 --- a/annif/swagger/annif.yaml +++ b/annif/swagger/annif.yaml @@ -90,6 +90,38 @@ paths: $ref: '#/definitions/Problem' tags: - Automatic subject indexing + '/projects/{project_id}/learn': + post: + summary: learn from manually indexed documents + operationId: annif.rest.learn + consumes: + - application/json + produces: + - application/json + - application/problem+json + parameters: + - $ref: '#/parameters/project_id' + - name: documents + in: body + description: documents to learn from + required: true + schema: + type: array + items: + $ref: '#/definitions/IndexedDocument' + responses: + '204': + description: successful operation + '404': + description: Project not found + schema: + $ref: '#/definitions/Problem' + '503': + description: Service Unavailable + schema: + $ref: '#/definitions/Problem' + tags: + - Learning from feedback definitions: ProjectBackend: description: A backend of a project @@ -133,7 +165,7 @@ definitions: example: 'http://example.org/subject1' label: type: string - example: 'Archaeology' + example: Archaeology score: type: number example: 0.85 @@ -148,6 +180,23 @@ definitions: type: array items: $ref: '#/definitions/AnalysisResult' + IndexedDocument: + description: A document with attached, known good subjects + properties: + text: + type: string + example: "A quick brown fox jumped over the lazy dog." + subjects: + type: array + items: + type: object + properties: + uri: + type: string + example: 'http://example.org/subject1' + label: + type: string + example: 'Vulpes vulpes' Problem: type: object properties: @@ -169,8 +218,10 @@ definitions: status: type: integer format: int32 - description: | - The HTTP status code generated by the origin server for this occurrence + description: > + The HTTP status code generated by the origin server for this + occurrence + of the problem. minimum: 100 maximum: 600 @@ -185,6 +236,8 @@ definitions: instance: type: string format: uri - description: | - An absolute URI that identifies the specific occurrence of the problem. + description: > + An absolute URI that identifies the specific occurrence of the + problem. + It may or may not yield further information if dereferenced. 
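
For reference, the learn endpoint defined in this patch can be exercised with a short client snippet along these lines. This is only a sketch: it assumes a local Annif instance serving this API at http://localhost:5000/v1 with an existing 'dummy-en' project; neither the host nor the base path is part of this patch, and the payload simply follows the IndexedDocument schema above.

import requests

# Array of IndexedDocument objects, matching the schema examples in annif.yaml
documents = [{'text': 'A quick brown fox jumped over the lazy dog.',
              'subjects': [{'uri': 'http://example.org/subject1',
                            'label': 'Vulpes vulpes'}]}]

# POST to the learn operation; per the responses defined above, a successful
# call returns 204 with no body.
resp = requests.post('http://localhost:5000/v1/projects/dummy-en/learn',
                     json=documents)
assert resp.status_code == 204
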
From e6e2d1775a1f8ed04c701f73c41a21def3ca4251 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Thu, 14 Feb 2019 13:24:05 +0200 Subject: [PATCH 05/12] adapt to datadir changes on master branch --- annif/backend/vw_multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py index 842f611f7..ec77e4c4d 100644 --- a/annif/backend/vw_multi.py +++ b/annif/backend/vw_multi.py @@ -191,7 +191,7 @@ def train(self, corpus, project): def learn(self, corpus, project): for example in self._create_examples(corpus, project): self._model.learn(example) - modelpath = os.path.join(self._get_datadir(), self.MODEL_FILE) + modelpath = os.path.join(self.datadir, self.MODEL_FILE) self._model.save(modelpath) def _convert_result(self, result, project): From bdef18f783624569179b9e40983c1a63004ba57a Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Thu, 14 Feb 2019 13:32:50 +0200 Subject: [PATCH 06/12] Split off initializing SubjectSet from string into a separate classmethod --- annif/corpus/document.py | 2 +- annif/corpus/subject.py | 26 ++++++++++++-------------- tests/test_corpus.py | 4 ++-- tests/test_eval.py | 2 +- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/annif/corpus/document.py b/annif/corpus/document.py index 44e6f822d..4c884fde3 100644 --- a/annif/corpus/document.py +++ b/annif/corpus/document.py @@ -40,7 +40,7 @@ def documents(self): with open(docfilename, errors='replace') as docfile: text = docfile.read() with open(keyfilename) as keyfile: - subjects = SubjectSet(keyfile.read()) + subjects = SubjectSet.from_string(keyfile.read()) yield Document(text=text, uris=subjects.subject_uris, labels=subjects.subject_labels) diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index d91e4d0df..5bb68105f 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -98,22 +98,20 @@ def load(cls, path): class SubjectSet: """Represents a set of subjects for a document.""" - def __init__(self, subj_data): - """initialize a SubjectSet from either a string representation or a - tuple (URIs, labels)""" - - if isinstance(subj_data, str): - self.subject_uris = set() - self.subject_labels = set() - self._parse(subj_data) - else: - uris, labels = subj_data - self.subject_uris = set(uris) - self.subject_labels = set(labels) + def __init__(self, subj_data=None): + """Create a SubjectSet and optionally initialize it from a tuple + (URIs, labels)""" + + uris, labels = subj_data or ([], []) + self.subject_uris = set(uris) + self.subject_labels = set(labels) - def _parse(self, subj_data): + @classmethod + def from_string(cls, subj_data): + sset = cls() for line in subj_data.splitlines(): - self._parse_line(line) + sset._parse_line(line) + return sset def _parse_line(self, line): vals = line.split("\t") diff --git a/tests/test_corpus.py b/tests/test_corpus.py index b2331e922..ac7f0c755 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -9,7 +9,7 @@ def test_subjectset_uris(): \tanother """ - sset = annif.corpus.SubjectSet(data) + sset = annif.corpus.SubjectSet.from_string(data) assert sset.has_uris() assert len(sset.subject_uris) == 2 assert "http://example.org/dummy" in sset.subject_uris @@ -21,7 +21,7 @@ def test_subjectset_labels(): another """ - sset = annif.corpus.SubjectSet(data) + sset = annif.corpus.SubjectSet.from_string(data) assert not sset.has_uris() assert len(sset.subject_labels) == 2 assert "dummy" in sset.subject_labels diff --git a/tests/test_eval.py b/tests/test_eval.py index 
411dc7fcf..0de8524b0 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -90,7 +90,7 @@ def test_ndcg_empty2(): def test_evaluation_batch(subject_index): batch = annif.eval.EvaluationBatch(subject_index) - gold_set = annif.corpus.SubjectSet( + gold_set = annif.corpus.SubjectSet.from_string( '\tarkeologit') hits1 = annif.hit.ListAnalysisResult([ annif.hit.AnalysisHit( From 3ce498b3ed92fdcff1d4fa8f1353a5a7f7e99eef Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Thu, 14 Feb 2019 15:12:58 +0200 Subject: [PATCH 07/12] Implement learning in REST API by converting the input to a DocumentCorpus --- annif/corpus/__init__.py | 3 ++- annif/corpus/document.py | 12 ++++++++++++ annif/rest.py | 15 ++++++++++----- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/annif/corpus/__init__.py b/annif/corpus/__init__.py index 68fa3ced9..9f749f6cf 100644 --- a/annif/corpus/__init__.py +++ b/annif/corpus/__init__.py @@ -1,8 +1,9 @@ """Annif corpus operations""" -from .document import DocumentDirectory, DocumentFile +from .document import DocumentDirectory, DocumentFile, DocumentList from .subject import Subject, SubjectDirectory, SubjectFileTSV from .subject import SubjectIndex, SubjectSet from .skos import SubjectFileSKOS +from .types import Document from .combine import CombinedCorpus diff --git a/annif/corpus/document.py b/annif/corpus/document.py index 4c884fde3..dc8facb65 100644 --- a/annif/corpus/document.py +++ b/annif/corpus/document.py @@ -66,3 +66,15 @@ def opener(path): subjects = [annif.util.cleanup_uri(uri) for uri in uris.split()] yield Document(text=text, uris=subjects, labels=[]) + + +class DocumentList(DocumentCorpus, DocumentToSubjectCorpusMixin): + """A document corpus based on a list of other iterable of Document + objects""" + + def __init__(self, documents): + self._documents = documents + + @property + def documents(self): + yield from self._documents diff --git a/annif/rest.py b/annif/rest.py index da8904942..8672fdca0 100644 --- a/annif/rest.py +++ b/annif/rest.py @@ -3,6 +3,7 @@ import connexion import annif.project +from annif.corpus import Document, DocumentList from annif.hit import HitFilter from annif.exception import AnnifException from annif.project import Access @@ -75,10 +76,14 @@ def learn(project_id, documents): except ValueError: return project_not_found_error(project_id) -# TODO should convert the documents to a corpus object -# try: -# project.learn(documents) -# except AnnifException as err: -# return server_error(err) + corpus = [Document(text=d['text'], + uris=[subj['uri'] for subj in d['subjects']], + labels=[subj['label'] for subj in d['subjects']]) + for d in documents + if 'text' in d and 'subjects' in d] + try: + project.learn(DocumentList(corpus)) + except AnnifException as err: + return server_error(err) return None, 204 From 972a285262e677eec36146bfc7b21ab678533d86 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 27 Feb 2019 10:51:06 +0200 Subject: [PATCH 08/12] Split AnnifLearningBackend into separate abstract class + add some learning tests --- annif/backend/backend.py | 13 +++++++++---- annif/backend/dummy.py | 19 ++++++++++++++++--- annif/backend/vw_multi.py | 2 +- annif/project.py | 2 +- tests/test_backend.py | 21 +++++++++++++++++++++ tests/test_cli.py | 2 +- tests/test_project.py | 20 ++++++++++++++------ tests/test_rest.py | 32 +++++++++++++++++++++++++++++++- 8 files changed, 94 insertions(+), 17 deletions(-) diff --git a/annif/backend/backend.py b/annif/backend/backend.py index b3e2e520b..5dec8708f 100644 --- 
a/annif/backend/backend.py +++ b/annif/backend/backend.py @@ -24,10 +24,6 @@ def train(self, corpus, project): """train the model on the given document or subject corpus""" pass # default is to do nothing, subclasses may override - def learn(self, corpus, project): - """further train the model on the given document or subject corpus""" - pass # default is to do nothing, subclasses may override - def initialize(self): """This method can be overridden by backends. It should cause the backend to pre-load all data it needs during operation.""" @@ -58,3 +54,12 @@ def info(self, message): def warning(self, message): """Log a warning message from this backend""" logger.warning("Backend {}: {}".format(self.backend_id, message)) + + +class AnnifLearningBackend(AnnifBackend): + """Base class for Annif backends that can perform online learning""" + + @abc.abstractmethod + def learn(self, corpus, project): + """further train the model on the given document or subject corpus""" + pass # pragma: no cover diff --git a/annif/backend/dummy.py b/annif/backend/dummy.py index 6f748c8fa..2f3cee660 100644 --- a/annif/backend/dummy.py +++ b/annif/backend/dummy.py @@ -5,15 +5,28 @@ from . import backend -class DummyBackend(backend.AnnifBackend): +class DummyBackend(backend.AnnifLearningBackend): name = "dummy" initialized = False + uri = 'http://example.org/dummy' + label = 'dummy' def initialize(self): self.initialized = True def _analyze(self, text, project, params): score = float(params.get('score', 1.0)) - return ListAnalysisResult([AnalysisHit(uri='http://example.org/dummy', - label='dummy', score=score)], + return ListAnalysisResult([AnalysisHit(uri=self.uri, + label=self.label, + score=score)], project.subjects) + + def learn(self, corpus, project): + # in this dummy backend we "learn" by picking up the URI and label + # of the first subject of the first document in the learning set + # and using that in subsequent analysis results + for doc in corpus.documents: + if doc.uris and doc.labels: + self.uri = list(doc.uris)[0] + self.label = list(doc.labels)[0] + break diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py index ec77e4c4d..310ac609d 100644 --- a/annif/backend/vw_multi.py +++ b/annif/backend/vw_multi.py @@ -12,7 +12,7 @@ from . 
import mixins -class VWMultiBackend(mixins.ChunkingBackend, backend.AnnifBackend): +class VWMultiBackend(mixins.ChunkingBackend, backend.AnnifLearningBackend): """Vorpal Wabbit multiclass/multilabel backend for Annif""" name = "vw_multi" diff --git a/annif/project.py b/annif/project.py index 630492f3d..4c6685536 100644 --- a/annif/project.py +++ b/annif/project.py @@ -200,7 +200,7 @@ def learn(self, corpus): """further train the project using documents from a metadata source""" corpus.set_subject_index(self.subjects) - self.backend.train(corpus, project=self) + self.backend.learn(corpus, project=self) def dump(self): """return this project as a dict""" diff --git a/tests/test_backend.py b/tests/test_backend.py index 68496d025..0b1073a9b 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -4,6 +4,7 @@ import pytest import annif import annif.backend +import annif.corpus def test_get_backend_nonexistent(): @@ -20,3 +21,23 @@ def test_get_backend_dummy(app, project): assert result[0].uri == 'http://example.org/dummy' assert result[0].label == 'dummy' assert result[0].score == 1.0 + + +def test_learn_dummy(app, project, tmpdir): + dummy_type = annif.backend.get_backend("dummy") + dummy = dummy_type(backend_id='dummy', params={}, + datadir=app.config['DATADIR']) + + tmpdir.join('doc1.txt').write('doc1') + tmpdir.join('doc1.tsv').write('\tkey1') + tmpdir.join('doc2.txt').write('doc2') + tmpdir.join('doc2.tsv').write('\tkey2') + docdir = annif.corpus.DocumentDirectory(str(tmpdir)) + + dummy.learn(docdir, project) + + result = dummy.analyze(text='this is some text', project=project) + assert len(result) == 1 + assert result[0].uri == 'http://example.org/key1' + assert result[0].label == 'key1' + assert result[0].score == 1.0 diff --git a/tests/test_cli.py b/tests/test_cli.py index e7db6d3f7..5b9916d37 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -132,7 +132,7 @@ def test_learn(testdatadir): 'corpora', 'archaeology', 'documents.tsv') - result = runner.invoke(annif.cli.cli, ['learn', 'tfidf-fi', docfile]) + result = runner.invoke(annif.cli.cli, ['learn', 'dummy-fi', docfile]) assert not result.exception assert result.exit_code == 0 diff --git a/tests/test_project.py b/tests/test_project.py index 5e6dfe531..ad4dae86d 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -92,13 +92,21 @@ def test_project_train_tfidf(app, document_corpus, testdatadir): assert testdatadir.join('projects/tfidf-fi/tfidf-index').size() > 0 -def test_project_learn_tfidf(app, document_corpus, testdatadir): - with app.app_context(): - project = annif.project.get_project('tfidf-fi') +def test_project_learn(app, tmpdir): + tmpdir.join('doc1.txt').write('doc1') + tmpdir.join('doc1.tsv').write('\tkey1') + tmpdir.join('doc2.txt').write('doc2') + tmpdir.join('doc2.tsv').write('\tkey2') + docdir = annif.corpus.DocumentDirectory(str(tmpdir)) - project.learn(document_corpus) - # Should assert that the index file changed, but this is not really - # implemented in the tfidf backend yet + with app.app_context(): + project = annif.project.get_project('dummy-fi') + project.learn(docdir) + result = project.analyze('this is some text') + assert len(result) == 1 + assert result[0].uri == 'http://example.org/key1' + assert result[0].label == 'key1' + assert result[0].score == 1.0 def test_project_load_vocabulary_fasttext(app, vocabulary, testdatadir): diff --git a/tests/test_rest.py b/tests/test_rest.py index 0aa657774..135125738 100644 --- a/tests/test_rest.py +++ b/tests/test_rest.py @@ -88,7 +88,7 @@ 
def test_rest_analyze_nonexistent(app): assert result.status_code == 404 -def test_rest_novocab(app): +def test_rest_analyze_novocab(app): with app.app_context(): result = annif.rest.analyze( 'novocab', @@ -96,3 +96,33 @@ def test_rest_novocab(app): limit=10, threshold=0.0) assert result.status_code == 503 + + +def test_rest_learn_empty(app): + with app.app_context(): + response = annif.rest.learn('dummy-en', []) + assert response == (None, 204) # success, no output + + +def test_rest_learn(app): + documents = [{'text': 'the quick brown fox', + 'subjects': [{'uri': 'http://example.org/fox', + 'label': 'fox'}]}] + with app.app_context(): + response = annif.rest.learn('dummy-en', documents) + assert response == (None, 204) # success, no output + + result = annif.rest.analyze( + 'dummy-en', + text='example text', + limit=10, + threshold=0.0) + assert 'results' in result + assert result['results'][0]['uri'] == 'http://example.org/fox' + assert result['results'][0]['label'] == 'fox' + + +def test_rest_learn_novocab(app): + with app.app_context(): + result = annif.rest.learn('novocab', []) + assert result.status_code == 503 From 01fa0b55270a1627fa7e4d7af2996c7e313a5fdd Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 27 Feb 2019 11:00:59 +0200 Subject: [PATCH 09/12] add unit test for learn on nonexistent vocabulary --- tests/test_rest.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_rest.py b/tests/test_rest.py index 135125738..633867860 100644 --- a/tests/test_rest.py +++ b/tests/test_rest.py @@ -126,3 +126,9 @@ def test_rest_learn_novocab(app): with app.app_context(): result = annif.rest.learn('novocab', []) assert result.status_code == 503 + + +def test_rest_learn_nonexistent(app): + with app.app_context(): + result = annif.rest.learn('nonexistent', []) + assert result.status_code == 404 From 16c0959f608cb42a2ebcde3f23410f15119449af Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 27 Feb 2019 13:24:11 +0200 Subject: [PATCH 10/12] REST handling of backends that don't support learning --- annif/exception.py | 7 +++++++ annif/project.py | 10 ++++++++-- tests/test_project.py | 15 ++++++++++++++- tests/test_rest.py | 6 ++++++ 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/annif/exception.py b/annif/exception.py index 10d92a73e..867eb2beb 100644 --- a/annif/exception.py +++ b/annif/exception.py @@ -46,3 +46,10 @@ class ConfigurationException(AnnifException): """Exception raised when a project or backend is misconfigured.""" prefix = "Misconfigured" + + +class NotSupportedException(AnnifException): + """Exception raised when an operation is not supported by a project or + backend.""" + + prefix = "Not supported" diff --git a/annif/project.py b/annif/project.py index 4c6685536..4136ed8a7 100644 --- a/annif/project.py +++ b/annif/project.py @@ -16,7 +16,7 @@ import annif.vocab from annif.datadir import DatadirMixin from annif.exception import AnnifException, ConfigurationException, \ - NotInitializedException + NotInitializedException, NotSupportedException logger = annif.logger @@ -200,7 +200,13 @@ def learn(self, corpus): """further train the project using documents from a metadata source""" corpus.set_subject_index(self.subjects) - self.backend.learn(corpus, project=self) + if isinstance( + self.backend, + annif.backend.backend.AnnifLearningBackend): + self.backend.learn(corpus, project=self) + else: + raise NotSupportedException("Learning not supported by backend", + project_id=self.project_id) def dump(self): """return this project as a dict""" 
diff --git a/tests/test_project.py b/tests/test_project.py index ad4dae86d..879118228 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -4,7 +4,7 @@ import pytest import annif.project import annif.backend.dummy -from annif.exception import ConfigurationException +from annif.exception import ConfigurationException, NotSupportedException from annif.project import Access @@ -109,6 +109,19 @@ def test_project_learn(app, tmpdir): assert result[0].score == 1.0 +def test_project_learn_not_supported(app, tmpdir): + tmpdir.join('doc1.txt').write('doc1') + tmpdir.join('doc1.tsv').write('\tkey1') + tmpdir.join('doc2.txt').write('doc2') + tmpdir.join('doc2.tsv').write('\tkey2') + docdir = annif.corpus.DocumentDirectory(str(tmpdir)) + + with app.app_context(): + project = annif.project.get_project('tfidf-fi') + with pytest.raises(NotSupportedException): + project.learn(docdir) + + def test_project_load_vocabulary_fasttext(app, vocabulary, testdatadir): pytest.importorskip("annif.backend.fasttext") with app.app_context(): diff --git a/tests/test_rest.py b/tests/test_rest.py index 633867860..4ecc57cc4 100644 --- a/tests/test_rest.py +++ b/tests/test_rest.py @@ -132,3 +132,9 @@ def test_rest_learn_nonexistent(app): with app.app_context(): result = annif.rest.learn('nonexistent', []) assert result.status_code == 404 + + +def test_rest_learn_not_supported(app): + with app.app_context(): + result = annif.rest.learn('tfidf-fi', []) + assert result.status_code == 503 From 31577837c8113ec05bc38b8e64f16750b2c68f21 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 27 Feb 2019 13:26:27 +0200 Subject: [PATCH 11/12] CLI unit test for trying to learn when backend doesn't support it --- tests/test_cli.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index 5b9916d37..4d55a0edb 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -137,6 +137,17 @@ def test_learn(testdatadir): assert result.exit_code == 0 +def test_learn_notsupported(testdatadir): + docfile = os.path.join( + os.path.dirname(__file__), + 'corpora', + 'archaeology', + 'documents.tsv') + result = runner.invoke(annif.cli.cli, ['learn', 'tfidf-fi', docfile]) + assert result.exit_code != 0 + assert 'Learning not supported' in result.output + + def test_analyze(): result = runner.invoke( annif.cli.cli, From 7624d21f4a8223ac5aac6f09cbee59e1ec4a3d9b Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 27 Feb 2019 13:40:32 +0200 Subject: [PATCH 12/12] refactor: split off JSON input to document corpus conversion in rest module --- annif/rest.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/annif/rest.py b/annif/rest.py index 8672fdca0..8a038bdb0 100644 --- a/annif/rest.py +++ b/annif/rest.py @@ -67,6 +67,15 @@ def analyze(project_id, text, limit, threshold): return {'results': [hit._asdict() for hit in hits]} +def _documents_to_corpus(documents): + corpus = [Document(text=d['text'], + uris=[subj['uri'] for subj in d['subjects']], + labels=[subj['label'] for subj in d['subjects']]) + for d in documents + if 'text' in d and 'subjects' in d] + return DocumentList(corpus) + + def learn(project_id, documents): """learn from documents and return an empty 204 response if succesful""" @@ -76,13 +85,10 @@ def learn(project_id, documents): except ValueError: return project_not_found_error(project_id) - corpus = [Document(text=d['text'], - uris=[subj['uri'] for subj in d['subjects']], - labels=[subj['label'] for subj in d['subjects']]) - for d in 
documents - if 'text' in d and 'subjects' in d] + corpus = _documents_to_corpus(documents) + try: - project.learn(DocumentList(corpus)) + project.learn(corpus) except AnnifException as err: return server_error(err)