From 112554db6a8ce2b1652a41fb8f9fa9d94905d309 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Mon, 24 Jun 2019 18:07:22 +0300
Subject: [PATCH 01/25] Make sure all projects are initialized when they are
 used for suggesting

---
 annif/backend/backend.py | 1 +
 annif/backend/tfidf.py   | 1 -
 tests/test_project.py    | 1 -
 3 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/annif/backend/backend.py b/annif/backend/backend.py
index b1da11c7c..817b6b23f 100644
--- a/annif/backend/backend.py
+++ b/annif/backend/backend.py
@@ -38,6 +38,7 @@ def _suggest(self, text, project, params):
     def suggest(self, text, project, params=None):
         """Suggest subjects for the input text and return a list of subjects
         represented as a list of SubjectSuggestion objects."""
+        self.initialize()
         beparams = dict(self.params)
         if params:
             beparams.update(params)
diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py
index 923066497..8e7f1dc5e 100644
--- a/annif/backend/tfidf.py
+++ b/annif/backend/tfidf.py
@@ -46,7 +46,6 @@ def train(self, corpus, project):
             self.INDEX_FILE)
 
     def _suggest(self, text, project, params):
-        self.initialize()
         self.debug('Suggesting subjects for text "{}..." (len={})'.format(
             text[:20], len(text)))
         vectors = project.vectorizer.transform([text])
diff --git a/tests/test_project.py b/tests/test_project.py
index 758ba28b3..b70a41af4 100644
--- a/tests/test_project.py
+++ b/tests/test_project.py
@@ -163,7 +163,6 @@ def test_project_not_initialized(app):
     with app.app_context():
         project = annif.project.get_project('dummy-en')
     assert not project.initialized
-    assert not project.backend.initialized
 
 
 def test_project_initialized(app_with_initialize):

From a56be2667db153c87fc5cb03384389e2167ed5fb Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Mon, 24 Jun 2019 18:08:54 +0300
Subject: [PATCH 02/25] Initial implementation of vw_ensemble backend. Fixes
 #235

---
 annif/backend/__init__.py         |   6 +-
 annif/backend/ensemble.py         |  10 +-
 annif/backend/mixins.py           |   1 -
 annif/backend/vw_ensemble.py      | 170 ++++++++++++++++++++++++++++++
 annif/backend/vw_multi.py         |   1 +
 tests/conftest.py                 |  12 +++
 tests/test_backend_vw_ensemble.py |  56 ++++++++++
 7 files changed, 251 insertions(+), 5 deletions(-)
 create mode 100644 annif/backend/vw_ensemble.py
 create mode 100644 tests/test_backend_vw_ensemble.py

diff --git a/annif/backend/__init__.py b/annif/backend/__init__.py
index 1b5b21651..4a5fd82e2 100644
--- a/annif/backend/__init__.py
+++ b/annif/backend/__init__.py
@@ -38,6 +38,8 @@ def get_backend(backend_id):
 try:
     from . import vw_multi
     register_backend(vw_multi.VWMultiBackend)
+    from . import vw_ensemble
+    register_backend(vw_ensemble.VWEnsembleBackend)
 except ImportError:
-    annif.logger.debug(
-        "vowpalwabbit not available, not enabling vw_multi backend")
+    annif.logger.debug("vowpalwabbit not available, not enabling " +
+                       "vw_multi & vw_ensemble backends")
diff --git a/annif/backend/ensemble.py b/annif/backend/ensemble.py
index 90d2712e7..90959979a 100644
--- a/annif/backend/ensemble.py
+++ b/annif/backend/ensemble.py
@@ -30,10 +30,16 @@ def _suggest_with_sources(self, text, sources):
                     hits=norm_hits, weight=weight))
         return hits_from_sources
 
+    def _merge_hits_from_sources(self, hits_from_sources, project, params):
+        """Hook for merging hits from sources. Can be overridden by
+        subclasses."""
+        return annif.util.merge_hits(hits_from_sources, project.subjects)
+
     def _suggest(self, text, project, params):
         sources = annif.util.parse_sources(params['sources'])
         hits_from_sources = self._suggest_with_sources(text, sources)
-        merged_hits = annif.util.merge_hits(
-            hits_from_sources, project.subjects)
+        merged_hits = self._merge_hits_from_sources(hits_from_sources,
+                                                    project,
+                                                    params)
         self.debug('{} hits after merging'.format(len(merged_hits)))
         return merged_hits
diff --git a/annif/backend/mixins.py b/annif/backend/mixins.py
index 976ed028c..04774a58c 100644
--- a/annif/backend/mixins.py
+++ b/annif/backend/mixins.py
@@ -16,7 +16,6 @@ def _suggest_chunks(self, chunktexts, project):
         pass  # pragma: no cover
 
     def _suggest(self, text, project, params):
-        self.initialize()
         self.debug('Suggesting subjects for text "{}..." (len={})'.format(
             text[:20], len(text)))
         sentences = project.analyzer.tokenize_sentences(text)
diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py
new file mode 100644
index 000000000..c43242aac
--- /dev/null
+++ b/annif/backend/vw_ensemble.py
@@ -0,0 +1,170 @@
+"""Annif backend using the Vowpal Wabbit multiclass and multilabel
+classifiers"""
+
+import random
+import os.path
+import annif.util
+from vowpalwabbit import pyvw
+import numpy as np
+from annif.suggestion import VectorSuggestionResult
+from annif.exception import ConfigurationException, NotInitializedException
+from . import backend
+from . import ensemble
+
+
+class VWEnsembleBackend(
+        ensemble.EnsembleBackend,
+        backend.AnnifLearningBackend):
+    """Vowpal Wabbit ensemble backend that combines results from multiple
+    projects and learns how well those projects/backends recognize
+    particular subjects."""
+
+    name = "vw_ensemble"
+
+    VW_PARAMS = {
+        # each param specifier is a pair (allowed_values, default_value)
+        # where allowed_values is either a type or a list of allowed values
+        # and default_value may be None, to let VW decide by itself
+        'bit_precision': (int, None),
+        'learning_rate': (float, None),
+        'loss_function': (['squared', 'logistic', 'hinge'], 'squared'),
+        'l1': (float, None),
+        'l2': (float, None),
+        'passes': (int, None)
+    }
+
+    MODEL_FILE = 'vw-model'
+    TRAIN_FILE = 'vw-train.txt'
+
+    # defaults for uninitialized instances
+    _model = None
+
+    def initialize(self):
+        if self._model is None:
+            path = os.path.join(self.datadir, self.MODEL_FILE)
+            if not os.path.exists(path):
+                raise NotInitializedException(
+                    'model {} not found'.format(path),
+                    backend_id=self.backend_id)
+            self.debug('loading VW model from {}'.format(path))
+            params = self._create_params({'i': path, 'quiet': True})
+            if 'passes' in params:
+                # don't confuse the model with passes
+                del params['passes']
+            self.debug("model parameters: {}".format(params))
+            self._model = pyvw.vw(**params)
+            self.debug('loaded model {}'.format(str(self._model)))
+
+    @staticmethod
+    def _write_train_file(examples, filename):
+        with open(filename, 'w', encoding='utf-8') as trainfile:
+            for ex in examples:
+                print(ex, file=trainfile)
+
+    def _merge_hits_from_sources(self, hits_from_sources, project, params):
+        score_vector = np.array([hits.vector
+                                 for hits, _ in hits_from_sources])
+        result = np.zeros(score_vector.shape[1])
+        for subj_id in range(score_vector.shape[1]):
+            if score_vector[:, subj_id].sum() > 0.0:
+                ex = self._format_example(
+                    subj_id,
+                    score_vector[:, subj_id])
+                score = (self._model.predict(ex) + 1.0) / 2.0
+                result[subj_id] = score
+        return VectorSuggestionResult(result, project.subjects)
+
+    def _format_example(self, subject_id, scores, true=None):
+        if true is None:
+            val = ''
+        elif true:
+            val = 1
+        else:
+            val = -1
+        ex = "{} |{}".format(val, subject_id)
+        for proj_idx, proj in enumerate(self.source_project_ids):
+            ex += " {}:{}".format(proj, scores[proj_idx])
+        return ex
+
+    @property
+    def source_project_ids(self):
+        sources = annif.util.parse_sources(self.params['sources'])
+        return [project_id for project_id, _ in sources]
+
+    def _create_examples(self, corpus, project):
+        source_projects = [annif.project.get_project(project_id)
+                           for project_id in self.source_project_ids]
+        examples = []
+        for doc in corpus.documents:
+            subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
+            true = subjects.as_vector(project.subjects)
+            score_vectors = []
+            for source_project in source_projects:
+                hits = source_project.suggest(doc.text)
+                score_vectors.append(hits.vector)
+            score_vector = np.array(score_vectors)
+            for subj_id in range(len(true)):
+                if true[subj_id] or score_vector[:, subj_id].sum() > 0.0:
+                    ex = self._format_example(
+                        subj_id,
+                        score_vector[:, subj_id],
+                        true[subj_id])
+                    examples.append(ex)
+        random.shuffle(examples)
+        return examples
+
+    def _create_train_file(self, corpus, project):
+        self.info('creating VW train file')
+        examples = self._create_examples(corpus, project)
+        annif.util.atomic_save(examples,
+                               self.datadir,
+                               self.TRAIN_FILE,
+                               method=self._write_train_file)
+
+    def _convert_param(self, param, val):
+        pspec, _ = self.VW_PARAMS[param]
+        if isinstance(pspec, list):
+            if val in pspec:
+                return val
+            raise ConfigurationException(
+                "{} is not a valid value for {} (allowed: {})".format(
+                    val, param, ', '.join(pspec)), backend_id=self.backend_id)
+        try:
+            return pspec(val)
+        except ValueError:
+            raise ConfigurationException(
+                "The {} value {} cannot be converted to {}".format(
+                    param, val, pspec), backend_id=self.backend_id)
+
+    def _create_params(self, params):
+        params.update({param: defaultval
+                       for param, (_, defaultval) in self.VW_PARAMS.items()
+                       if defaultval is not None})
+        params.update({param: self._convert_param(param, val)
+                       for param, val in self.params.items()
+                       if param in self.VW_PARAMS})
+        return params
+
+    def _create_model(self, project):
+        trainpath = os.path.join(self.datadir, self.TRAIN_FILE)
+        params = self._create_params(
+            {'data': trainpath, 'q': '::'})
+        if params.get('passes', 1) > 1:
+            # need a cache file when there are multiple passes
+            params.update({'cache': True, 'kill_cache': True})
+        self.debug("model parameters: {}".format(params))
+        self._model = pyvw.vw(**params)
+        modelpath = os.path.join(self.datadir, self.MODEL_FILE)
+        self._model.save(modelpath)
+
+    def train(self, corpus, project):
+        self.info("creating VW ensemble model")
+        self._create_train_file(corpus, project)
+        self._create_model(project)
+
+    def learn(self, corpus, project):
+        self.initialize()
+        for example in self._create_examples(corpus, project):
+            self._model.learn(example)
+        modelpath = os.path.join(self.datadir, self.MODEL_FILE)
+        self._model.save(modelpath)
diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index 287704d02..e4006128f 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -189,6 +189,7 @@ def train(self, corpus, project):
         self._create_model(project)
 
     def learn(self, corpus, project):
+        self.initialize()
         for example in self._create_examples(corpus, project):
             self._model.learn(example)
         modelpath = os.path.join(self.datadir, self.MODEL_FILE)
diff --git a/tests/conftest.py b/tests/conftest.py
index d10c14d1e..c965d3fac 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -73,6 +73,18 @@ def document_corpus(subject_index):
     return doc_corpus
 
 
+@pytest.fixture(scope='module')
+def fulltext_corpus(subject_index):
+    docdir = os.path.join(
+        os.path.dirname(__file__),
+        'corpora',
+        'archaeology',
+        'fulltext')
+    ft_corpus = annif.corpus.DocumentDirectory(docdir)
+    ft_corpus.set_subject_index(subject_index)
+    return ft_corpus
+
+
 @pytest.fixture(scope='module')
 def project(document_corpus):
     proj = unittest.mock.Mock()
diff --git a/tests/test_backend_vw_ensemble.py b/tests/test_backend_vw_ensemble.py
new file mode 100644
index 000000000..b9308bffb
--- /dev/null
+++ b/tests/test_backend_vw_ensemble.py
@@ -0,0 +1,56 @@
+"""Unit tests for the vw_ensemble backend in Annif"""
+
+import pytest
+import annif.backend
+import annif.corpus
+
+pytest.importorskip("annif.backend.vw_ensemble")
+
+
+def test_vw_ensemble_train(app, datadir, tmpdir, fulltext_corpus, project):
+    vw_ensemble_type = annif.backend.get_backend("vw_ensemble")
+    vw_ensemble = vw_ensemble_type(
+        backend_id='vw_ensemble',
+        params={'sources': 'tfidf-fi'},
+        datadir=str(datadir))
+
+    with app.app_context():
+        vw_ensemble.train(fulltext_corpus, project)
+    assert datadir.join('vw-train.txt').exists()
+    assert datadir.join('vw-train.txt').size() > 0
+    assert datadir.join('vw-model').exists()
+    assert datadir.join('vw-model').size() > 0
+
+
+def test_vw_ensemble_initialize(app, datadir):
+    vw_ensemble_type = annif.backend.get_backend("vw_ensemble")
+    vw_ensemble = vw_ensemble_type(
+        backend_id='vw_ensemble',
+        params={'sources': 'tfidf-fi'},
+        datadir=str(datadir))
+
+    assert vw_ensemble._model is None
+    with app.app_context():
+        vw_ensemble.initialize()
+    assert vw_ensemble._model is not None
+    # initialize a second time - this shouldn't do anything
+    with app.app_context():
+        vw_ensemble.initialize()
+
+
+def test_vw_ensemble_suggest(app, datadir, project):
+    vw_ensemble_type = annif.backend.get_backend("vw_ensemble")
+    vw_ensemble = vw_ensemble_type(
+        backend_id='vw_ensemble',
+        params={'sources': 'tfidf-fi'},
+        datadir=str(datadir))
+
+    results = vw_ensemble.suggest("""Arkeologiaa sanotaan joskus myös
+        muinaistutkimukseksi tai muinaistieteeksi. Se on humanistinen tiede
+        tai oikeammin joukko tieteitä, jotka tutkivat ihmisen menneisyyttä.
+        Tutkimusta tehdään analysoimalla muinaisjäännöksiä eli niitä jälkiä,
+        joita ihmisten toiminta on jättänyt maaperään tai vesistöjen
+        pohjaan.""", project)
+
+    assert vw_ensemble._model is not None
+    assert len(results) > 0

From a6079a0a315fbae6b4f6aa4dee0efe4d8bdfcd15 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Wed, 26 Jun 2019 14:12:33 +0300
Subject: [PATCH 03/25] fix test failure caused by using an uninitialized
 tfidf-fi project (when starting with a clean datadir)

---
 tests/test_backend_vw_ensemble.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/tests/test_backend_vw_ensemble.py b/tests/test_backend_vw_ensemble.py
index b9308bffb..bc091d78e 100644
--- a/tests/test_backend_vw_ensemble.py
+++ b/tests/test_backend_vw_ensemble.py
@@ -3,19 +3,27 @@
 import pytest
 import annif.backend
 import annif.corpus
+import annif.project
 
 pytest.importorskip("annif.backend.vw_ensemble")
 
 
-def test_vw_ensemble_train(app, datadir, tmpdir, fulltext_corpus, project):
+def test_vw_ensemble_train(app, datadir, tmpdir):
     vw_ensemble_type = annif.backend.get_backend("vw_ensemble")
     vw_ensemble = vw_ensemble_type(
         backend_id='vw_ensemble',
-        params={'sources': 'tfidf-fi'},
+        params={'sources': 'dummy-en'},
         datadir=str(datadir))
 
+    tmpfile = tmpdir.join('document.tsv')
+    tmpfile.write("dummy\thttp://example.org/dummy\n" +
+                  "another\thttp://example.org/dummy\n" +
+                  "none\thttp://example.org/none")
+    document_corpus = annif.corpus.DocumentFile(str(tmpfile))
+    project = annif.project.get_project('dummy-en')
+
     with app.app_context():
-        vw_ensemble.train(fulltext_corpus, project)
+        vw_ensemble.train(document_corpus, project)
     assert datadir.join('vw-train.txt').exists()
     assert datadir.join('vw-train.txt').size() > 0
     assert datadir.join('vw-model').exists()
@@ -26,7 +34,7 @@ def test_vw_ensemble_initialize(app, datadir):
     vw_ensemble_type = annif.backend.get_backend("vw_ensemble")
     vw_ensemble = vw_ensemble_type(
         backend_id='vw_ensemble',
-        params={'sources': 'tfidf-fi'},
+        params={'sources': 'dummy-en'},
         datadir=str(datadir))
 
     assert vw_ensemble._model is None
@@ -38,13 +46,15 @@ def test_vw_ensemble_initialize(app, datadir):
         vw_ensemble.initialize()
 
 
-def test_vw_ensemble_suggest(app, datadir, project):
+def test_vw_ensemble_suggest(app, datadir):
     vw_ensemble_type = annif.backend.get_backend("vw_ensemble")
     vw_ensemble = vw_ensemble_type(
         backend_id='vw_ensemble',
-        params={'sources': 'tfidf-fi'},
+        params={'sources': 'dummy-en'},
         datadir=str(datadir))
 
+    project = annif.project.get_project('dummy-en')
+
     results = vw_ensemble.suggest("""Arkeologiaa sanotaan joskus myös
         muinaistutkimukseksi tai muinaistieteeksi. Se on humanistinen tiede
         tai oikeammin joukko tieteitä, jotka tutkivat ihmisen menneisyyttä.

From a3579d6acc2d38157a6b725f6a55dae57e344a38 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Wed, 26 Jun 2019 14:26:47 +0300
Subject: [PATCH 04/25] remove unused fixture

---
 tests/conftest.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index c965d3fac..d10c14d1e 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -73,18 +73,6 @@ def document_corpus(subject_index):
     return doc_corpus
 
 
-@pytest.fixture(scope='module')
-def fulltext_corpus(subject_index):
-    docdir = os.path.join(
-        os.path.dirname(__file__),
-        'corpora',
-        'archaeology',
-        'fulltext')
-    ft_corpus = annif.corpus.DocumentDirectory(docdir)
-    ft_corpus.set_subject_index(subject_index)
-    return ft_corpus
-
-
 @pytest.fixture(scope='module')
 def project(document_corpus):
     proj = unittest.mock.Mock()

From 9d46450738b85d6191c252b351c7890389bafd31 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Wed, 26 Jun 2019 15:02:13 +0300
Subject: [PATCH 05/25] introduce common base class for VW backends

---
 annif/backend/vw_base.py     | 9 +++++++++
 annif/backend/vw_ensemble.py | 4 ++--
 annif/backend/vw_multi.py    | 6 +++---
 3 files changed, 14 insertions(+), 5 deletions(-)
 create mode 100644 annif/backend/vw_base.py

diff --git a/annif/backend/vw_base.py b/annif/backend/vw_base.py
new file mode 100644
index 000000000..798a3ef07
--- /dev/null
+++ b/annif/backend/vw_base.py
@@ -0,0 +1,9 @@
+"""Base class for Vowpal Wabbit based Annif backends"""
+
+from . import backend
+
+
+class VWBaseBackend(backend.AnnifLearningBackend):
+    """Base class for Vowpal Wabbit based Annif backends"""
+
+    pass
diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py
index c43242aac..e46012ea4 100644
--- a/annif/backend/vw_ensemble.py
+++ b/annif/backend/vw_ensemble.py
@@ -8,13 +8,13 @@
 import numpy as np
 from annif.suggestion import VectorSuggestionResult
 from annif.exception import ConfigurationException, NotInitializedException
-from . import backend
+from . import vw_base
 from . import ensemble
 
 
 class VWEnsembleBackend(
         ensemble.EnsembleBackend,
-        backend.AnnifLearningBackend):
+        vw_base.VWBaseBackend):
     """Vowpal Wabbit ensemble backend that combines results from multiple
     projects and learns how well those projects/backends recognize
     particular subjects."""
diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index e4006128f..dbf8a97a5 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -8,12 +8,12 @@
 import numpy as np
 from annif.suggestion import ListSuggestionResult, VectorSuggestionResult
 from annif.exception import ConfigurationException, NotInitializedException
-from . import backend
+from . import vw_base
 from . import mixins
 
 
-class VWMultiBackend(mixins.ChunkingBackend, backend.AnnifLearningBackend):
-    """Vorpal Wabbit multiclass/multilabel backend for Annif"""
+class VWMultiBackend(mixins.ChunkingBackend, vw_base.VWBaseBackend):
+    """Vowpal Wabbit multiclass/multilabel backend for Annif"""
 
     name = "vw_multi"
     needs_subject_index = True

From 9503aa824b65fb0f6783e2d65e224538e84068a9 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Wed, 26 Jun 2019 15:09:17 +0300
Subject: [PATCH 06/25] refactor: move initialize method to vw_base

---
 annif/backend/vw_base.py     | 25 ++++++++++++++++++++++++-
 annif/backend/vw_ensemble.py | 24 +-----------------------
 annif/backend/vw_multi.py    | 24 +-----------------------
 3 files changed, 26 insertions(+), 47 deletions(-)

diff --git a/annif/backend/vw_base.py b/annif/backend/vw_base.py
index 798a3ef07..e4ce9c8d5 100644
--- a/annif/backend/vw_base.py
+++ b/annif/backend/vw_base.py
@@ -1,9 +1,32 @@
 """Base class for Vowpal Wabbit based Annif backends"""
 
+import os
+from vowpalwabbit import pyvw
+from annif.exception import NotInitializedException
 from . import backend
 
 
 class VWBaseBackend(backend.AnnifLearningBackend):
     """Base class for Vowpal Wabbit based Annif backends"""
 
-    pass
+    MODEL_FILE = 'vw-model'
+    TRAIN_FILE = 'vw-train.txt'
+
+    # defaults for uninitialized instances
+    _model = None
+
+    def initialize(self):
+        if self._model is None:
+            path = os.path.join(self.datadir, self.MODEL_FILE)
+            if not os.path.exists(path):
+                raise NotInitializedException(
+                    'model {} not found'.format(path),
+                    backend_id=self.backend_id)
+            self.debug('loading VW model from {}'.format(path))
+            params = self._create_params({'i': path, 'quiet': True})
+            if 'passes' in params:
+                # don't confuse the model with passes
+                del params['passes']
+            self.debug("model parameters: {}".format(params))
+            self._model = pyvw.vw(**params)
+            self.debug('loaded model {}'.format(str(self._model)))
diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py
index e46012ea4..3f8626a6b 100644
--- a/annif/backend/vw_ensemble.py
+++ b/annif/backend/vw_ensemble.py
@@ -7,7 +7,7 @@
 from vowpalwabbit import pyvw
 import numpy as np
 from annif.suggestion import VectorSuggestionResult
-from annif.exception import ConfigurationException, NotInitializedException
+from annif.exception import ConfigurationException
 from . import vw_base
 from . import ensemble
 
@@ -33,28 +33,6 @@ class VWEnsembleBackend(
         'passes': (int, None)
     }
 
-    MODEL_FILE = 'vw-model'
-    TRAIN_FILE = 'vw-train.txt'
-
-    # defaults for uninitialized instances
-    _model = None
-
-    def initialize(self):
-        if self._model is None:
-            path = os.path.join(self.datadir, self.MODEL_FILE)
-            if not os.path.exists(path):
-                raise NotInitializedException(
-                    'model {} not found'.format(path),
-                    backend_id=self.backend_id)
-            self.debug('loading VW model from {}'.format(path))
-            params = self._create_params({'i': path, 'quiet': True})
-            if 'passes' in params:
-                # don't confuse the model with passes
-                del params['passes']
-            self.debug("model parameters: {}".format(params))
-            self._model = pyvw.vw(**params)
-            self.debug('loaded model {}'.format(str(self._model)))
-
     @staticmethod
     def _write_train_file(examples, filename):
         with open(filename, 'w', encoding='utf-8') as trainfile:
diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index dbf8a97a5..ca0c3016a 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -7,7 +7,7 @@
 from vowpalwabbit import pyvw
 import numpy as np
 from annif.suggestion import ListSuggestionResult, VectorSuggestionResult
-from annif.exception import ConfigurationException, NotInitializedException
+from annif.exception import ConfigurationException
 from . import vw_base
 from . import mixins
 
@@ -37,28 +37,6 @@ class VWMultiBackend(mixins.ChunkingBackend, vw_base.VWBaseBackend):
 
     DEFAULT_INPUTS = '_text_'
 
-    MODEL_FILE = 'vw-model'
-    TRAIN_FILE = 'vw-train.txt'
-
-    # defaults for uninitialized instances
-    _model = None
-
-    def initialize(self):
-        if self._model is None:
-            path = os.path.join(self.datadir, self.MODEL_FILE)
-            if not os.path.exists(path):
-                raise NotInitializedException(
-                    'model {} not found'.format(path),
-                    backend_id=self.backend_id)
-            self.debug('loading VW model from {}'.format(path))
-            params = self._create_params({'i': path, 'quiet': True})
-            if 'passes' in params:
-                # don't confuse the model with passes
-                del params['passes']
-            self.debug("model parameters: {}".format(params))
-            self._model = pyvw.vw(**params)
-            self.debug('loaded model {}'.format(str(self._model)))
-
     @property
     def algorithm(self):
         algorithm = self.params.get('algorithm', self.DEFAULT_ALGORITHM)

From 098e02ac8ff55efa47ed784ec0ac307e7a53d69a Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Wed, 26 Jun 2019 15:15:42 +0300
Subject: [PATCH 07/25] Refactor: move parameter handling to vw_base

---
 annif/backend/vw_base.py     | 31 +++++++++++++++++++++++++++++++
 annif/backend/vw_ensemble.py | 28 ----------------------------
 annif/backend/vw_multi.py    | 27 ---------------------------
 3 files changed, 31 insertions(+), 55 deletions(-)

diff --git a/annif/backend/vw_base.py b/annif/backend/vw_base.py
index e4ce9c8d5..5600c5d56 100644
--- a/annif/backend/vw_base.py
+++ b/annif/backend/vw_base.py
@@ -2,6 +2,7 @@
 
 import os
 from vowpalwabbit import pyvw
+from annif.exception import ConfigurationException
 from annif.exception import NotInitializedException
 from . import backend
 
@@ -9,6 +10,12 @@
 class VWBaseBackend(backend.AnnifLearningBackend):
     """Base class for Vowpal Wabbit based Annif backends"""
 
+    # Parameters for VW based backends
+    # each param specifier is a pair (allowed_values, default_value)
+    # where allowed_values is either a type or a list of allowed values
+    # and default_value may be None, to let VW decide by itself
+    VW_PARAMS = {}  # needs to be specified in subclasses
+
     MODEL_FILE = 'vw-model'
     TRAIN_FILE = 'vw-train.txt'
 
@@ -30,3 +37,27 @@ def initialize(self):
             self.debug("model parameters: {}".format(params))
             self._model = pyvw.vw(**params)
             self.debug('loaded model {}'.format(str(self._model)))
+
+    def _convert_param(self, param, val):
+        pspec, _ = self.VW_PARAMS[param]
+        if isinstance(pspec, list):
+            if val in pspec:
+                return val
+            raise ConfigurationException(
+                "{} is not a valid value for {} (allowed: {})".format(
+                    val, param, ', '.join(pspec)), backend_id=self.backend_id)
+        try:
+            return pspec(val)
+        except ValueError:
+            raise ConfigurationException(
+                "The {} value {} cannot be converted to {}".format(
+                    param, val, pspec), backend_id=self.backend_id)
+
+    def _create_params(self, params):
+        params.update({param: defaultval
+                       for param, (_, defaultval) in self.VW_PARAMS.items()
+                       if defaultval is not None})
+        params.update({param: self._convert_param(param, val)
+                       for param, val in self.params.items()
+                       if param in self.VW_PARAMS})
+        return params
diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py
index 3f8626a6b..e39987c4a 100644
--- a/annif/backend/vw_ensemble.py
+++ b/annif/backend/vw_ensemble.py
@@ -7,7 +7,6 @@
 from vowpalwabbit import pyvw
 import numpy as np
 from annif.suggestion import VectorSuggestionResult
-from annif.exception import ConfigurationException
 from . import vw_base
 from . import ensemble
 
@@ -22,9 +21,6 @@ class VWEnsembleBackend(
     name = "vw_ensemble"
 
     VW_PARAMS = {
-        # each param specifier is a pair (allowed_values, default_value)
-        # where allowed_values is either a type or a list of allowed values
-        # and default_value may be None, to let VW decide by itself
         'bit_precision': (int, None),
         'learning_rate': (float, None),
         'loss_function': (['squared', 'logistic', 'hinge'], 'squared'),
@@ -99,30 +95,6 @@ def _create_train_file(self, corpus, project):
                                self.TRAIN_FILE,
                                method=self._write_train_file)
 
-    def _convert_param(self, param, val):
-        pspec, _ = self.VW_PARAMS[param]
-        if isinstance(pspec, list):
-            if val in pspec:
-                return val
-            raise ConfigurationException(
-                "{} is not a valid value for {} (allowed: {})".format(
-                    val, param, ', '.join(pspec)), backend_id=self.backend_id)
-        try:
-            return pspec(val)
-        except ValueError:
-            raise ConfigurationException(
-                "The {} value {} cannot be converted to {}".format(
-                    param, val, pspec), backend_id=self.backend_id)
-
-    def _create_params(self, params):
-        params.update({param: defaultval
-                       for param, (_, defaultval) in self.VW_PARAMS.items()
-                       if defaultval is not None})
-        params.update({param: self._convert_param(param, val)
-                       for param, val in self.params.items()
-                       if param in self.VW_PARAMS})
-        return params
-
     def _create_model(self, project):
         trainpath = os.path.join(self.datadir, self.TRAIN_FILE)
         params = self._create_params(
diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index ca0c3016a..252f6717f 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -19,9 +19,6 @@ class VWMultiBackend(mixins.ChunkingBackend, vw_base.VWBaseBackend):
     needs_subject_index = True
 
     VW_PARAMS = {
-        # each param specifier is a pair (allowed_values, default_value)
-        # where allowed_values is either a type or a list of allowed values
-        # and default_value may be None, to let VW decide by itself
         'bit_precision': (int, None),
         'ngram': (lambda x: '_{}'.format(int(x)), None),
         'learning_rate': (float, None),
@@ -125,30 +122,6 @@ def _create_train_file(self, corpus, project):
                                self.TRAIN_FILE,
                                method=self._write_train_file)
 
-    def _convert_param(self, param, val):
-        pspec, _ = self.VW_PARAMS[param]
-        if isinstance(pspec, list):
-            if val in pspec:
-                return val
-            raise ConfigurationException(
-                "{} is not a valid value for {} (allowed: {})".format(
-                    val, param, ', '.join(pspec)), backend_id=self.backend_id)
-        try:
-            return pspec(val)
-        except ValueError:
-            raise ConfigurationException(
-                "The {} value {} cannot be converted to {}".format(
-                    param, val, pspec), backend_id=self.backend_id)
-
-    def _create_params(self, params):
-        params.update({param: defaultval
-                       for param, (_, defaultval) in self.VW_PARAMS.items()
-                       if defaultval is not None})
-        params.update({param: self._convert_param(param, val)
-                       for param, val in self.params.items()
-                       if param in self.VW_PARAMS})
-        return params
-
     def _create_model(self, project):
         self.info('creating VW model (algorithm: {})'.format(self.algorithm))
         trainpath = os.path.join(self.datadir, self.TRAIN_FILE)

From 745601619c5086c9ce8db86dd809f07f5d5e8fa0 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Wed, 26 Jun 2019 15:30:03 +0300
Subject: [PATCH 08/25] Refactor: move learn method to vw_base (and make it an
 abstract base class)

---
 annif/backend/vw_base.py     | 17 ++++++++++++++++-
 annif/backend/vw_ensemble.py |  7 -------
 annif/backend/vw_multi.py    |  7 -------
 3 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/annif/backend/vw_base.py b/annif/backend/vw_base.py
index 5600c5d56..cf84ae03a 100644
--- a/annif/backend/vw_base.py
+++ b/annif/backend/vw_base.py
@@ -1,5 +1,6 @@
 """Base class for Vowpal Wabbit based Annif backends"""
 
+import abc
 import os
 from vowpalwabbit import pyvw
 from annif.exception import ConfigurationException
@@ -7,7 +8,7 @@
 from . import backend
 
 
-class VWBaseBackend(backend.AnnifLearningBackend):
+class VWBaseBackend(backend.AnnifLearningBackend, metaclass=abc.ABCMeta):
     """Base class for Vowpal Wabbit based Annif backends"""
 
     # Parameters for VW based backends
@@ -61,3 +62,17 @@ def _create_params(self, params):
                        for param, val in self.params.items()
                        if param in self.VW_PARAMS})
         return params
+
+    @abc.abstractmethod
+    def _create_examples(self, corpus, project):
+        """This method should be implemented by concrete backends. It
+        should return a sequence of strings formatted according to the VW
+        input format."""
+        pass
+    
+    def learn(self, corpus, project):
+        self.initialize()
+        for example in self._create_examples(corpus, project):
+            self._model.learn(example)
+        modelpath = os.path.join(self.datadir, self.MODEL_FILE)
+        self._model.save(modelpath)
diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py
index e39987c4a..a63cbd864 100644
--- a/annif/backend/vw_ensemble.py
+++ b/annif/backend/vw_ensemble.py
@@ -111,10 +111,3 @@ def train(self, corpus, project):
         self.info("creating VW ensemble model")
         self._create_train_file(corpus, project)
         self._create_model(project)
-
-    def learn(self, corpus, project):
-        self.initialize()
-        for example in self._create_examples(corpus, project):
-            self._model.learn(example)
-        modelpath = os.path.join(self.datadir, self.MODEL_FILE)
-        self._model.save(modelpath)
diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index 252f6717f..bd4432c9d 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -139,13 +139,6 @@ def train(self, corpus, project):
         self._create_train_file(corpus, project)
         self._create_model(project)
 
-    def learn(self, corpus, project):
-        self.initialize()
-        for example in self._create_examples(corpus, project):
-            self._model.learn(example)
-        modelpath = os.path.join(self.datadir, self.MODEL_FILE)
-        self._model.save(modelpath)
-
     def _convert_result(self, result, project):
         if self.algorithm == 'multilabel_oaa':
             # result is a list of subject IDs - need to vectorize

From b53f7c7901d22df637aaf47032fc5c04ad15be61 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Wed, 26 Jun 2019 16:05:16 +0300
Subject: [PATCH 09/25] Refactor: move train method to vw_base

---
 annif/backend/vw_base.py     | 26 ++++++++++++++++++++++++++
 annif/backend/vw_ensemble.py | 19 -------------------
 annif/backend/vw_multi.py    | 20 +-------------------
 3 files changed, 27 insertions(+), 38 deletions(-)

diff --git a/annif/backend/vw_base.py b/annif/backend/vw_base.py
index cf84ae03a..232b14729 100644
--- a/annif/backend/vw_base.py
+++ b/annif/backend/vw_base.py
@@ -3,6 +3,7 @@
 import abc
 import os
 from vowpalwabbit import pyvw
+import annif.util
 from annif.exception import ConfigurationException
 from annif.exception import NotInitializedException
 from . import backend
@@ -63,12 +64,37 @@ def _create_params(self, params):
                        if param in self.VW_PARAMS})
         return params
 
+    @staticmethod
+    def _write_train_file(examples, filename):
+        with open(filename, 'w', encoding='utf-8') as trainfile:
+            for ex in examples:
+                print(ex, file=trainfile)
+
+    def _create_train_file(self, corpus, project):
+        self.info('creating VW train file')
+        examples = self._create_examples(corpus, project)
+        annif.util.atomic_save(examples,
+                               self.datadir,
+                               self.TRAIN_FILE,
+                               method=self._write_train_file)
+
     @abc.abstractmethod
     def _create_examples(self, corpus, project):
         """This method should be implemented by concrete backends. It
         should return a sequence of strings formatted according to the VW
         input format."""
         pass
+
+    @abc.abstractmethod
+    def _create_model(self, project):
+        """This method should be implemented by concrete backends.  It
+        should create an empty (untrained) VW model and save it to disk."""
+        pass
+
+    def train(self, corpus, project):
+        self.info("creating VW model")
+        self._create_train_file(corpus, project)
+        self._create_model(project)
     
     def learn(self, corpus, project):
         self.initialize()
diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py
index a63cbd864..0dd35c9e4 100644
--- a/annif/backend/vw_ensemble.py
+++ b/annif/backend/vw_ensemble.py
@@ -29,12 +29,6 @@ class VWEnsembleBackend(
         'passes': (int, None)
     }
 
-    @staticmethod
-    def _write_train_file(examples, filename):
-        with open(filename, 'w', encoding='utf-8') as trainfile:
-            for ex in examples:
-                print(ex, file=trainfile)
-
     def _merge_hits_from_sources(self, hits_from_sources, project, params):
         score_vector = np.array([hits.vector
                                  for hits, _ in hits_from_sources])
@@ -87,14 +81,6 @@ def _create_examples(self, corpus, project):
         random.shuffle(examples)
         return examples
 
-    def _create_train_file(self, corpus, project):
-        self.info('creating VW train file')
-        examples = self._create_examples(corpus, project)
-        annif.util.atomic_save(examples,
-                               self.datadir,
-                               self.TRAIN_FILE,
-                               method=self._write_train_file)
-
     def _create_model(self, project):
         trainpath = os.path.join(self.datadir, self.TRAIN_FILE)
         params = self._create_params(
@@ -106,8 +92,3 @@ def _create_model(self, project):
         self._model = pyvw.vw(**params)
         modelpath = os.path.join(self.datadir, self.MODEL_FILE)
         self._model.save(modelpath)
-
-    def train(self, corpus, project):
-        self.info("creating VW ensemble model")
-        self._create_train_file(corpus, project)
-        self._create_model(project)
diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index bd4432c9d..89f4027af 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -3,9 +3,9 @@
 
 import random
 import os.path
-import annif.util
 from vowpalwabbit import pyvw
 import numpy as np
+import annif.project
 from annif.suggestion import ListSuggestionResult, VectorSuggestionResult
 from annif.exception import ConfigurationException
 from . import vw_base
@@ -59,12 +59,6 @@ def _normalize_text(project, text):
         ntext = ' '.join(project.analyzer.tokenize_words(text))
         return VWMultiBackend._cleanup_text(ntext)
 
-    @staticmethod
-    def _write_train_file(examples, filename):
-        with open(filename, 'w', encoding='utf-8') as trainfile:
-            for ex in examples:
-                print(ex, file=trainfile)
-
     @staticmethod
     def _uris_to_subject_ids(project, uris):
         subject_ids = []
@@ -114,14 +108,6 @@ def _create_examples(self, corpus, project):
         random.shuffle(examples)
         return examples
 
-    def _create_train_file(self, corpus, project):
-        self.info('creating VW train file')
-        examples = self._create_examples(corpus, project)
-        annif.util.atomic_save(examples,
-                               self.datadir,
-                               self.TRAIN_FILE,
-                               method=self._write_train_file)
-
     def _create_model(self, project):
         self.info('creating VW model (algorithm: {})'.format(self.algorithm))
         trainpath = os.path.join(self.datadir, self.TRAIN_FILE)
@@ -135,10 +121,6 @@ def _create_model(self, project):
         modelpath = os.path.join(self.datadir, self.MODEL_FILE)
         self._model.save(modelpath)
 
-    def train(self, corpus, project):
-        self._create_train_file(corpus, project)
-        self._create_model(project)
-
     def _convert_result(self, result, project):
         if self.algorithm == 'multilabel_oaa':
             # result is a list of subject IDs - need to vectorize

From 393b5076bb399ee83d8d9428eb83a4af6a18ae93 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Wed, 26 Jun 2019 16:17:24 +0300
Subject: [PATCH 10/25] fix whitespace

---
 annif/backend/vw_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/annif/backend/vw_base.py b/annif/backend/vw_base.py
index 232b14729..09a9a2fc3 100644
--- a/annif/backend/vw_base.py
+++ b/annif/backend/vw_base.py
@@ -95,7 +95,7 @@ def train(self, corpus, project):
         self.info("creating VW model")
         self._create_train_file(corpus, project)
         self._create_model(project)
-    
+
     def learn(self, corpus, project):
         self.initialize()
         for example in self._create_examples(corpus, project):

From 78d206ba00e93ce4248cc837e96d973103651ea2 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Wed, 26 Jun 2019 16:25:57 +0300
Subject: [PATCH 11/25] Refactor _create_examples to reduce(?) its complexity

---
 annif/backend/vw_ensemble.py | 45 ++++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py
index 0dd35c9e4..788329053 100644
--- a/annif/backend/vw_ensemble.py
+++ b/annif/backend/vw_ensemble.py
@@ -42,6 +42,11 @@ def _merge_hits_from_sources(self, hits_from_sources, project, params):
                 result[subj_id] = score
         return VectorSuggestionResult(result, project.subjects)
 
+    @property
+    def _source_project_ids(self):
+        sources = annif.util.parse_sources(self.params['sources'])
+        return [project_id for project_id, _ in sources]
+
     def _format_example(self, subject_id, scores, true=None):
         if true is None:
             val = ''
@@ -50,34 +55,34 @@ def _format_example(self, subject_id, scores, true=None):
         else:
             val = -1
         ex = "{} |{}".format(val, subject_id)
-        for proj_idx, proj in enumerate(self.source_project_ids):
+        for proj_idx, proj in enumerate(self._source_project_ids):
             ex += " {}:{}".format(proj, scores[proj_idx])
         return ex
 
-    @property
-    def source_project_ids(self):
-        sources = annif.util.parse_sources(self.params['sources'])
-        return [project_id for project_id, _ in sources]
+    def _doc_to_example(self, doc, project, source_projects):
+        examples = []
+        subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
+        true = subjects.as_vector(project.subjects)
+        score_vectors = []
+        for source_project in source_projects:
+            hits = source_project.suggest(doc.text)
+            score_vectors.append(hits.vector)
+        score_vector = np.array(score_vectors)
+        for subj_id in range(len(true)):
+            if true[subj_id] or score_vector[:, subj_id].sum() > 0.0:
+                ex = self._format_example(
+                    subj_id,
+                    score_vector[:, subj_id],
+                    true[subj_id])
+                examples.append(ex)
+        return examples
 
     def _create_examples(self, corpus, project):
         source_projects = [annif.project.get_project(project_id)
-                           for project_id in self.source_project_ids]
+                           for project_id in self._source_project_ids]
         examples = []
         for doc in corpus.documents:
-            subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
-            true = subjects.as_vector(project.subjects)
-            score_vectors = []
-            for source_project in source_projects:
-                hits = source_project.suggest(doc.text)
-                score_vectors.append(hits.vector)
-            score_vector = np.array(score_vectors)
-            for subj_id in range(len(true)):
-                if true[subj_id] or score_vector[:, subj_id].sum() > 0.0:
-                    ex = self._format_example(
-                        subj_id,
-                        score_vector[:, subj_id],
-                        true[subj_id])
-                    examples.append(ex)
+            examples += self._doc_to_example(doc, project, source_projects)
         random.shuffle(examples)
         return examples
 

From c0cc1c7a7fb11116436c3577faebff155ff937d0 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Thu, 27 Jun 2019 12:37:27 +0300
Subject: [PATCH 12/25] Add missing import (why did it work before?)

---
 annif/backend/vw_ensemble.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py
index 788329053..5d61ddab0 100644
--- a/annif/backend/vw_ensemble.py
+++ b/annif/backend/vw_ensemble.py
@@ -4,6 +4,7 @@
 import random
 import os.path
 import annif.util
+import annif.project
 from vowpalwabbit import pyvw
 import numpy as np
 from annif.suggestion import VectorSuggestionResult

From ff05920684c4d7426f88e7e76d326d9e01b5684a Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Thu, 27 Jun 2019 13:57:04 +0300
Subject: [PATCH 13/25] Refactor: move _create_model to vw_base; disable
 quadratic features for vw_ensemble (no improvement in results)

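---
Review note: _create_model() declares initial_params={} and then writes to
it (initial_params['data'] = trainpath, followed by the update() calls in
_create_params and the cache/kill_cache update). Callers that omit the
argument, such as vw_ensemble via train(), therefore share and mutate one
default dict, so VW parameters can leak between calls and backend instances.
Suggested follow-up: copy before use, e.g.
initial_params = dict(initial_params), or default to None.
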
 annif/backend/vw_base.py     | 16 +++++++++++-----
 annif/backend/vw_ensemble.py | 12 ------------
 annif/backend/vw_multi.py    | 11 +----------
 3 files changed, 12 insertions(+), 27 deletions(-)

diff --git a/annif/backend/vw_base.py b/annif/backend/vw_base.py
index 09a9a2fc3..b172fd653 100644
--- a/annif/backend/vw_base.py
+++ b/annif/backend/vw_base.py
@@ -85,11 +85,17 @@ def _create_examples(self, corpus, project):
         input format."""
         pass
 
-    @abc.abstractmethod
-    def _create_model(self, project):
-        """This method should be implemented by concrete backends.  It
-        should create an empty (untrained) VW model and save it to disk."""
-        pass
+    def _create_model(self, project, initial_params={}):
+        trainpath = os.path.join(self.datadir, self.TRAIN_FILE)
+        initial_params['data'] = trainpath
+        params = self._create_params(initial_params)
+        if params.get('passes', 1) > 1:
+            # need a cache file when there are multiple passes
+            params.update({'cache': True, 'kill_cache': True})
+        self.debug("model parameters: {}".format(params))
+        self._model = pyvw.vw(**params)
+        modelpath = os.path.join(self.datadir, self.MODEL_FILE)
+        self._model.save(modelpath)
 
     def train(self, corpus, project):
         self.info("creating VW model")
diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py
index 5d61ddab0..bc87d16e1 100644
--- a/annif/backend/vw_ensemble.py
+++ b/annif/backend/vw_ensemble.py
@@ -86,15 +86,3 @@ def _create_examples(self, corpus, project):
             examples += self._doc_to_example(doc, project, source_projects)
         random.shuffle(examples)
         return examples
-
-    def _create_model(self, project):
-        trainpath = os.path.join(self.datadir, self.TRAIN_FILE)
-        params = self._create_params(
-            {'data': trainpath, 'q': '::'})
-        if params.get('passes', 1) > 1:
-            # need a cache file when there are multiple passes
-            params.update({'cache': True, 'kill_cache': True})
-        self.debug("model parameters: {}".format(params))
-        self._model = pyvw.vw(**params)
-        modelpath = os.path.join(self.datadir, self.MODEL_FILE)
-        self._model.save(modelpath)
diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index 89f4027af..2511e5504 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -110,16 +110,7 @@ def _create_examples(self, corpus, project):
 
     def _create_model(self, project):
         self.info('creating VW model (algorithm: {})'.format(self.algorithm))
-        trainpath = os.path.join(self.datadir, self.TRAIN_FILE)
-        params = self._create_params(
-            {'data': trainpath, self.algorithm: len(project.subjects)})
-        if params.get('passes', 1) > 1:
-            # need a cache file when there are multiple passes
-            params.update({'cache': True, 'kill_cache': True})
-        self.debug("model parameters: {}".format(params))
-        self._model = pyvw.vw(**params)
-        modelpath = os.path.join(self.datadir, self.MODEL_FILE)
-        self._model.save(modelpath)
+        super()._create_model(project, {self.algorithm: len(project.subjects)})
 
     def _convert_result(self, result, project):
         if self.algorithm == 'multilabel_oaa':

From 9f25308cf0b205e315d6eb0b545e7d3d0d966ccd Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Thu, 27 Jun 2019 14:08:59 +0300
Subject: [PATCH 14/25] Add "pragma: no cover" annotation for abstract method

---
 annif/backend/vw_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/annif/backend/vw_base.py b/annif/backend/vw_base.py
index b172fd653..cc9c32991 100644
--- a/annif/backend/vw_base.py
+++ b/annif/backend/vw_base.py
@@ -83,7 +83,7 @@ def _create_examples(self, corpus, project):
         """This method should be implemented by concrete backends. It
         should return a sequence of strings formatted according to the VW
         input format."""
-        pass
+        pass  # pragma: no cover
 
     def _create_model(self, project, initial_params={}):
         trainpath = os.path.join(self.datadir, self.TRAIN_FILE)

From 2b81385b5618a81aee182e2c40df9ac3e46e3919 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 28 Jun 2019 10:42:16 +0300
Subject: [PATCH 15/25] Add discounting mechanism to vw_ensemble backend

---
 annif/backend/vw_ensemble.py      | 83 +++++++++++++++++++++++++++++--
 tests/test_backend_vw_ensemble.py |  2 +
 2 files changed, 81 insertions(+), 4 deletions(-)

diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py
index bc87d16e1..774ce3610 100644
--- a/annif/backend/vw_ensemble.py
+++ b/annif/backend/vw_ensemble.py
@@ -1,12 +1,15 @@
 """Annif backend using the Vowpal Wabbit multiclass and multilabel
 classifiers"""
 
+import collections
+import json
 import random
 import os.path
 import annif.util
 import annif.project
 from vowpalwabbit import pyvw
 import numpy as np
+from annif.exception import NotInitializedException
 from annif.suggestion import VectorSuggestionResult
 from . import vw_base
 from . import ensemble
@@ -30,6 +33,37 @@ class VWEnsembleBackend(
         'passes': (int, None)
     }
 
+    # number of training examples per subject, stored as a collections.Counter
+    _subject_freq = None
+
+    FREQ_FILE = 'subject-freq.json'
+
+    # The discount rate affects how quickly the ensemble starts to trust its
+    # own judgement when the amount of training data increases, versus using
+    # a simple mean of scores. A higher value will mean that the model
+    # adapts quicker (and possibly makes more errors) while a lower value
+    # will make it more careful so that it will require more training data.
+    DEFAULT_DISCOUNT_RATE = 0.01
+
+    def initialize(self):
+        if self._subject_freq is None:
+            path = os.path.join(self.datadir, self.FREQ_FILE)
+            if not os.path.exists(path):
+                raise NotInitializedException(
+                    'frequency file {} not found'.format(path),
+                    backend_id=self.backend_id)
+            self.debug('loading concept frequencies from {}'.format(path))
+            with open(path) as freqf:
+                # The Counter was serialized like a dictionary, need to
+                # convert it back. Keys that became strings need to be turned
+                # back into integers.
+                self._subject_freq = collections.Counter()
+                for cid, freq in json.load(freqf).items():
+                    self._subject_freq[int(cid)] = freq
+            self.debug('loaded frequencies for {} concepts'.format(
+                len(self._subject_freq)))
+        super().initialize()
+
     def _merge_hits_from_sources(self, hits_from_sources, project, params):
         score_vector = np.array([hits.vector
                                  for hits, _ in hits_from_sources])
@@ -39,8 +73,14 @@ def _merge_hits_from_sources(self, hits_from_sources, project, params):
                 ex = self._format_example(
                     subj_id,
                     score_vector[:, subj_id])
-                score = (self._model.predict(ex) + 1.0) / 2.0
-                result[subj_id] = score
+                discount_rate = self.params.get(
+                    'discount_rate', self.DEFAULT_DISCOUNT_RATE)
+                raw_weight = 1.0 / \
+                    ((discount_rate * self._subject_freq[subj_id]) + 1)
+                raw_score = score_vector[:, subj_id].mean()
+                pred_score = (self._model.predict(ex) + 1.0) / 2.0
+                result[subj_id] = (raw_weight * raw_score) + \
+                    (1.0 - raw_weight) * pred_score
         return VectorSuggestionResult(result, project.subjects)
 
     @property
@@ -71,10 +111,10 @@ def _doc_to_example(self, doc, project, source_projects):
         score_vector = np.array(score_vectors)
         for subj_id in range(len(true)):
             if true[subj_id] or score_vector[:, subj_id].sum() > 0.0:
-                ex = self._format_example(
+                ex = (subj_id, self._format_example(
                     subj_id,
                     score_vector[:, subj_id],
-                    true[subj_id])
+                    true[subj_id]))
                 examples.append(ex)
         return examples
 
@@ -86,3 +126,38 @@ def _create_examples(self, corpus, project):
             examples += self._doc_to_example(doc, project, source_projects)
         random.shuffle(examples)
         return examples
+
+    @staticmethod
+    def _write_freq_file(subject_freq, filename):
+        with open(filename, 'w') as freqfile:
+            json.dump(subject_freq, freqfile)
+
+    def _create_train_file(self, corpus, project):
+        self.info('creating VW train file')
+        exampledata = self._create_examples(corpus, project)
+
+        subjects = [subj_id for subj_id, ex in exampledata]
+        self._subject_freq = collections.Counter(subjects)
+        annif.util.atomic_save(self._subject_freq,
+                               self.datadir,
+                               self.FREQ_FILE,
+                               method=self._write_freq_file)
+
+        examples = [ex for subj_id, ex in exampledata]
+        annif.util.atomic_save(examples,
+                               self.datadir,
+                               self.TRAIN_FILE,
+                               method=self._write_train_file)
+
+    def learn(self, corpus, project):
+        self.initialize()
+        exampledata = self._create_examples(corpus, project)
+        for subj_id, example in exampledata:
+            self._model.learn(example)
+            self._subject_freq[subj_id] += 1
+        modelpath = os.path.join(self.datadir, self.MODEL_FILE)
+        self._model.save(modelpath)
+        annif.util.atomic_save(self._subject_freq,
+                               self.datadir,
+                               self.FREQ_FILE,
+                               method=self._write_freq_file)
diff --git a/tests/test_backend_vw_ensemble.py b/tests/test_backend_vw_ensemble.py
index bc091d78e..ad73b7de2 100644
--- a/tests/test_backend_vw_ensemble.py
+++ b/tests/test_backend_vw_ensemble.py
@@ -26,6 +26,8 @@ def test_vw_ensemble_train(app, datadir, tmpdir):
         vw_ensemble.train(document_corpus, project)
     assert datadir.join('vw-train.txt').exists()
     assert datadir.join('vw-train.txt').size() > 0
+    assert datadir.join('subject-freq.json').exists()
+    assert datadir.join('subject-freq.json').size() > 0
     assert datadir.join('vw-model').exists()
     assert datadir.join('vw-model').size() > 0
 

From dfaaf6d0fe31ee72f19c155287e99123afec40be Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 28 Jun 2019 11:11:25 +0300
Subject: [PATCH 16/25] add more tests for vw_ensemble

---
 tests/test_backend_vw_ensemble.py | 33 ++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/tests/test_backend_vw_ensemble.py b/tests/test_backend_vw_ensemble.py
index ad73b7de2..239b21073 100644
--- a/tests/test_backend_vw_ensemble.py
+++ b/tests/test_backend_vw_ensemble.py
@@ -1,14 +1,28 @@
 """Unit tests for the vw_ensemble backend in Annif"""
 
+import json
+import time
 import pytest
 import annif.backend
 import annif.corpus
 import annif.project
+from annif.exception import NotInitializedException
 
 pytest.importorskip("annif.backend.vw_ensemble")
 
 
-def test_vw_ensemble_train(app, datadir, tmpdir):
+def test_vw_ensemble_suggest_no_model(datadir, project):
+    vw_ensemble_type = annif.backend.get_backend('vw_ensemble')
+    vw_ensemble = vw_ensemble_type(
+        backend_id='vw_ensemble',
+        params={'sources': 'dummy-en'},
+        datadir=str(datadir))
+
+    with pytest.raises(NotInitializedException):
+        results = vw_ensemble.suggest("example text", project)
+
+
+def test_vw_ensemble_train_and_learn(app, datadir, tmpdir):
     vw_ensemble_type = annif.backend.get_backend("vw_ensemble")
     vw_ensemble = vw_ensemble_type(
         backend_id='vw_ensemble',
@@ -31,6 +45,23 @@ def test_vw_ensemble_train(app, datadir, tmpdir):
     assert datadir.join('vw-model').exists()
     assert datadir.join('vw-model').size() > 0
 
+    # test online learning
+    modelfile = datadir.join('vw-model')
+    freqfile = datadir.join('subject-freq.json')
+
+    old_size = modelfile.size()
+    old_mtime = modelfile.mtime()
+    with open(str(freqfile)) as freqf:
+        old_totalfreq = sum(json.load(freqf).values())
+
+    time.sleep(0.1)  # make sure the timestamp has a chance to increase
+
+    vw_ensemble.learn(document_corpus, project)
+
+    assert modelfile.size() != old_size or modelfile.mtime() != old_mtime
+    with open(str(freqfile)) as freqf:
+        assert sum(json.load(freqf).values()) != old_totalfreq
+
 
 def test_vw_ensemble_initialize(app, datadir):
     vw_ensemble_type = annif.backend.get_backend("vw_ensemble")

From b844f6f6bb0ae393b46439cbbbbf336c783b5e41 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 28 Jun 2019 11:23:31 +0300
Subject: [PATCH 17/25] Refactor: split initialize method in vw_ensemble

---
 annif/backend/vw_ensemble.py | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py
index 774ce3610..72224384a 100644
--- a/annif/backend/vw_ensemble.py
+++ b/annif/backend/vw_ensemble.py
@@ -45,23 +45,26 @@ class VWEnsembleBackend(
     # will make it more careful so that it will require more training data.
     DEFAULT_DISCOUNT_RATE = 0.01
 
+    def _load_subject_freq(self):
+        path = os.path.join(self.datadir, self.FREQ_FILE)
+        if not os.path.exists(path):
+            raise NotInitializedException(
+                'frequency file {} not found'.format(path),
+                backend_id=self.backend_id)
+        self.debug('loading concept frequencies from {}'.format(path))
+        with open(path) as freqf:
+            # The Counter was serialized like a dictionary, need to
+            # convert it back. Keys that became strings need to be turned
+            # back into integers.
+            self._subject_freq = collections.Counter()
+            for cid, freq in json.load(freqf).items():
+                self._subject_freq[int(cid)] = freq
+        self.debug('loaded frequencies for {} concepts'.format(
+            len(self._subject_freq)))
+
     def initialize(self):
         if self._subject_freq is None:
-            path = os.path.join(self.datadir, self.FREQ_FILE)
-            if not os.path.exists(path):
-                raise NotInitializedException(
-                    'frequency file {} not found'.format(path),
-                    backend_id=self.backend_id)
-            self.debug('loading concept frequencies from {}'.format(path))
-            with open(path) as freqf:
-                # The Counter was serialized like a dictionary, need to
-                # convert it back. Keys that became strings need to be turned
-                # back into integers.
-                self._subject_freq = collections.Counter()
-                for cid, freq in json.load(freqf).items():
-                    self._subject_freq[int(cid)] = freq
-            self.debug('loaded frequencies for {} concepts'.format(
-                len(self._subject_freq)))
+            self._load_subject_freq()
         super().initialize()
 
     def _merge_hits_from_sources(self, hits_from_sources, project, params):

From 9a1345c5b73bb5226a3e155d4ce42593ca4bb344 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 28 Jun 2019 11:25:56 +0300
Subject: [PATCH 18/25] Refactor: split _doc_to_example method in vw_ensemble

---
 annif/backend/vw_ensemble.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py
index 72224384a..e25b43964 100644
--- a/annif/backend/vw_ensemble.py
+++ b/annif/backend/vw_ensemble.py
@@ -103,15 +103,18 @@ def _format_example(self, subject_id, scores, true=None):
             ex += " {}:{}".format(proj, scores[proj_idx])
         return ex
 
-    def _doc_to_example(self, doc, project, source_projects):
-        examples = []
-        subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
-        true = subjects.as_vector(project.subjects)
+    def _doc_score_vector(self, doc, source_projects):
         score_vectors = []
         for source_project in source_projects:
             hits = source_project.suggest(doc.text)
             score_vectors.append(hits.vector)
-        score_vector = np.array(score_vectors)
+        return np.array(score_vectors)
+
+    def _doc_to_example(self, doc, project, source_projects):
+        examples = []
+        subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
+        true = subjects.as_vector(project.subjects)
+        score_vector = self._doc_score_vector(doc, source_projects)
         for subj_id in range(len(true)):
             if true[subj_id] or score_vector[:, subj_id].sum() > 0.0:
                 ex = (subj_id, self._format_example(

From 34dc5a40b5657cb77c3ae9d02a3c1662c3a09454 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 28 Jun 2019 11:33:28 +0300
Subject: [PATCH 19/25] Refactor: split _merge_hits_from_sources in vw_ensemble

---
 annif/backend/vw_ensemble.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py
index e25b43964..13f188041 100644
--- a/annif/backend/vw_ensemble.py
+++ b/annif/backend/vw_ensemble.py
@@ -67,21 +67,25 @@ def initialize(self):
             self._load_subject_freq()
         super().initialize()
 
+    def _calculate_scores(self, subj_id, subj_score_vector):
+        ex = self._format_example(subj_id, subj_score_vector)
+        raw_score = subj_score_vector.mean()
+        pred_score = (self._model.predict(ex) + 1.0) / 2.0
+        return raw_score, pred_score
+
     def _merge_hits_from_sources(self, hits_from_sources, project, params):
         score_vector = np.array([hits.vector
                                  for hits, _ in hits_from_sources])
+        discount_rate = self.params.get('discount_rate',
+                                        self.DEFAULT_DISCOUNT_RATE)
         result = np.zeros(score_vector.shape[1])
         for subj_id in range(score_vector.shape[1]):
-            if score_vector[:, subj_id].sum() > 0.0:
-                ex = self._format_example(
-                    subj_id,
-                    score_vector[:, subj_id])
-                discount_rate = self.params.get(
-                    'discount_rate', self.DEFAULT_DISCOUNT_RATE)
+            subj_score_vector = score_vector[:, subj_id]
+            if subj_score_vector.sum() > 0.0:
+                raw_score, pred_score = self._calculate_scores(
+                    subj_id, subj_score_vector)
                 raw_weight = 1.0 / \
                     ((discount_rate * self._subject_freq[subj_id]) + 1)
-                raw_score = score_vector[:, subj_id].mean()
-                pred_score = (self._model.predict(ex) + 1.0) / 2.0
                 result[subj_id] = (raw_weight * raw_score) + \
                     (1.0 - raw_weight) * pred_score
         return VectorSuggestionResult(result, project.subjects)

From b162b50a8f56c1a02277c27eb7a96e526ad3675a Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 28 Jun 2019 12:56:55 +0300
Subject: [PATCH 20/25] Refactor: ensure dicts passed as function parameters
 are not mutated

---
 annif/backend/vw_base.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/annif/backend/vw_base.py b/annif/backend/vw_base.py
index cc9c32991..ffc310c07 100644
--- a/annif/backend/vw_base.py
+++ b/annif/backend/vw_base.py
@@ -56,6 +56,7 @@ def _convert_param(self, param, val):
                     param, val, pspec), backend_id=self.backend_id)
 
     def _create_params(self, params):
+        params = params.copy()  # don't mutate the original dict
         params.update({param: defaultval
                        for param, (_, defaultval) in self.VW_PARAMS.items()
                        if defaultval is not None})
@@ -86,6 +87,7 @@ def _create_examples(self, corpus, project):
         pass  # pragma: no cover
 
     def _create_model(self, project, initial_params={}):
+        initial_params = initial_params.copy()  # don't mutate the original
         trainpath = os.path.join(self.datadir, self.TRAIN_FILE)
         initial_params['data'] = trainpath
         params = self._create_params(initial_params)

From 4d7e0f799feae97b7aa47725c438c168c9c55e61 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 28 Jun 2019 12:57:31 +0300
Subject: [PATCH 21/25] remove unused imports

---
 annif/backend/vw_multi.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index 2511e5504..3c3aef9e7 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -2,8 +2,6 @@
 classifiers"""
 
 import random
-import os.path
-from vowpalwabbit import pyvw
 import numpy as np
 import annif.project
 from annif.suggestion import ListSuggestionResult, VectorSuggestionResult

From 0db12a3a2709fd67f48fc4665b445881969aa4bb Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 28 Jun 2019 12:57:55 +0300
Subject: [PATCH 22/25] remove unused imports

---
 annif/backend/vw_ensemble.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py
index 13f188041..0e556886a 100644
--- a/annif/backend/vw_ensemble.py
+++ b/annif/backend/vw_ensemble.py
@@ -7,7 +7,6 @@
 import os.path
 import annif.util
 import annif.project
-from vowpalwabbit import pyvw
 import numpy as np
 from annif.exception import NotInitializedException
 from annif.suggestion import VectorSuggestionResult

From d28f9dea959d814a940e356670c9cc913ffcae81 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 28 Jun 2019 13:34:45 +0300
Subject: [PATCH 23/25] add API documentation templates for the new vw_base and
 vw_ensemble modules

---
 docs/source/annif.backend.rst | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/docs/source/annif.backend.rst b/docs/source/annif.backend.rst
index 2b9cef414..484335b82 100644
--- a/docs/source/annif.backend.rst
+++ b/docs/source/annif.backend.rst
@@ -65,8 +65,24 @@ annif.backend.tfidf module
     :undoc-members:
     :show-inheritance:
 
-annif.backend.vw\_multi module
-------------------------------
+annif.backend.vw_base module
+----------------------------
+
+.. automodule:: annif.backend.vw_base
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+annif.backend.vw_ensemble module
+--------------------------------
+
+.. automodule:: annif.backend.vw_ensemble
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+annif.backend.vw_multi module
+-----------------------------
 
 .. automodule:: annif.backend.vw_multi
     :members:

From d3a26a498a02ced7dfafab48fd3a6122f2f5c32a Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 28 Jun 2019 15:23:33 +0300
Subject: [PATCH 24/25] Avoid scientific notation for weight values in VW train
 file (test to verify)

---
 annif/backend/vw_ensemble.py      |  2 +-
 tests/test_backend_vw_ensemble.py | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py
index 0e556886a..9950e43de 100644
--- a/annif/backend/vw_ensemble.py
+++ b/annif/backend/vw_ensemble.py
@@ -103,7 +103,7 @@ def _format_example(self, subject_id, scores, true=None):
             val = -1
         ex = "{} |{}".format(val, subject_id)
         for proj_idx, proj in enumerate(self._source_project_ids):
-            ex += " {}:{}".format(proj, scores[proj_idx])
+            ex += " {}:{:.6f}".format(proj, scores[proj_idx])
         return ex
 
     def _doc_score_vector(self, doc, source_projects):
diff --git a/tests/test_backend_vw_ensemble.py b/tests/test_backend_vw_ensemble.py
index 239b21073..3f36384be 100644
--- a/tests/test_backend_vw_ensemble.py
+++ b/tests/test_backend_vw_ensemble.py
@@ -97,3 +97,25 @@ def test_vw_ensemble_suggest(app, datadir):
 
     assert vw_ensemble._model is not None
     assert len(results) > 0
+
+
+def test_vw_ensemble_format_example(datadir):
+    vw_ensemble_type = annif.backend.get_backend("vw_ensemble")
+    vw_ensemble = vw_ensemble_type(
+        backend_id='vw_ensemble',
+        params={'sources': 'dummy-en'},
+        datadir=str(datadir))
+
+    ex = vw_ensemble._format_example(0, [0.5])
+    assert ex == ' |0 dummy-en:0.500000'
+
+
+def test_vw_ensemble_format_example_avoid_sci_notation(datadir):
+    vw_ensemble_type = annif.backend.get_backend("vw_ensemble")
+    vw_ensemble = vw_ensemble_type(
+        backend_id='vw_ensemble',
+        params={'sources': 'dummy-en'},
+        datadir=str(datadir))
+
+    ex = vw_ensemble._format_example(0, [7.24e-05])
+    assert ex == ' |0 dummy-en:0.000072'

From 1741cd7f751f72573b389b43ab6b9be8dfce99d2 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 28 Jun 2019 16:04:31 +0300
Subject: [PATCH 25/25] Bugfix: parse discount_rate into a float (with test to
 verify)

---
 annif/backend/vw_ensemble.py      |  4 ++--
 tests/test_backend_vw_ensemble.py | 19 +++++++++++++++++++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py
index 9950e43de..94cf28fa7 100644
--- a/annif/backend/vw_ensemble.py
+++ b/annif/backend/vw_ensemble.py
@@ -75,8 +75,8 @@ def _calculate_scores(self, subj_id, subj_score_vector):
     def _merge_hits_from_sources(self, hits_from_sources, project, params):
         score_vector = np.array([hits.vector
                                  for hits, _ in hits_from_sources])
-        discount_rate = self.params.get('discount_rate',
-                                        self.DEFAULT_DISCOUNT_RATE)
+        discount_rate = float(self.params.get('discount_rate',
+                                              self.DEFAULT_DISCOUNT_RATE))
         result = np.zeros(score_vector.shape[1])
         for subj_id in range(score_vector.shape[1]):
             subj_score_vector = score_vector[:, subj_id]
diff --git a/tests/test_backend_vw_ensemble.py b/tests/test_backend_vw_ensemble.py
index 3f36384be..23b48ff53 100644
--- a/tests/test_backend_vw_ensemble.py
+++ b/tests/test_backend_vw_ensemble.py
@@ -99,6 +99,25 @@ def test_vw_ensemble_suggest(app, datadir):
     assert len(results) > 0
 
 
+def test_vw_ensemble_suggest_set_discount_rate(app, datadir):
+    vw_ensemble_type = annif.backend.get_backend("vw_ensemble")
+    vw_ensemble = vw_ensemble_type(
+        backend_id='vw_ensemble',
+        params={'sources': 'dummy-en', 'discount_rate': '0.02'},
+        datadir=str(datadir))
+
+    project = annif.project.get_project('dummy-en')
+
+    results = vw_ensemble.suggest("""Arkeologiaa sanotaan joskus myös
+        muinaistutkimukseksi tai muinaistieteeksi. Se on humanistinen tiede
+        tai oikeammin joukko tieteitä, jotka tutkivat ihmisen menneisyyttä.
+        Tutkimusta tehdään analysoimalla muinaisjäännöksiä eli niitä jälkiä,
+        joita ihmisten toiminta on jättänyt maaperään tai vesistöjen
+        pohjaan.""", project)
+
+    assert len(results) > 0
+
+
 def test_vw_ensemble_format_example(datadir):
     vw_ensemble_type = annif.backend.get_backend("vw_ensemble")
     vw_ensemble = vw_ensemble_type(