From 794bfb568533ce4f3023e17a6db20aa41b6777be Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Mon, 4 Feb 2019 14:19:54 +0200
Subject: [PATCH 1/9] Move text normalization outside chunking method

---
 annif/backend/fasttext.py | 11 ++++++++++-
 annif/backend/mixins.py   |  5 +----
 annif/backend/vw_multi.py |  5 ++++-
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/annif/backend/fasttext.py b/annif/backend/fasttext.py
index bf3abe719..609664a49 100644
--- a/annif/backend/fasttext.py
+++ b/annif/backend/fasttext.py
@@ -107,9 +107,18 @@ def train(self, corpus, project):
         self._create_train_file(corpus, project)
         self._create_model()
 
+    def _predict_chunks(self, chunktexts, project, limit):
+        normalized_chunks = []
+        for chunktext in chunktexts:
+            normalized = self._normalize_text(project, chunktext)
+            if normalized != '':
+                normalized_chunks.append(normalized)
+        return self._model.predict(normalized_chunks, limit)
+
     def _analyze_chunks(self, chunktexts, project):
         limit = int(self.params['limit'])
-        chunklabels, chunkscores = self._model.predict(chunktexts, limit)
+        chunklabels, chunkscores = self._predict_chunks(
+            chunktexts, project, limit)
         label_scores = collections.defaultdict(float)
         for labels, scores in zip(chunklabels, chunkscores):
             for label, score in zip(labels, scores):
diff --git a/annif/backend/mixins.py b/annif/backend/mixins.py
index f109ac9b5..f89aa5f1c 100644
--- a/annif/backend/mixins.py
+++ b/annif/backend/mixins.py
@@ -24,10 +24,7 @@ def _analyze(self, text, project, params):
         chunksize = int(params['chunksize'])
         chunktexts = []
         for i in range(0, len(sentences), chunksize):
-            chunktext = ' '.join(sentences[i:i + chunksize])
-            normalized = self._normalize_text(project, chunktext)
-            if normalized != '':
-                chunktexts.append(normalized)
+            chunktexts.append(' '.join(sentences[i:i + chunksize]))
         self.debug('Split sentences into {} chunks'.format(len(chunktexts)))
         if len(chunktexts) == 0:  # nothing to analyze, empty result
             return ListAnalysisResult(hits=[], subject_index=project.subjects)
diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index ed97b32e3..738571956 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -152,7 +152,10 @@ def train(self, corpus, project):
     def _analyze_chunks(self, chunktexts, project):
         results = []
        for chunktext in chunktexts:
-            example = ' | {}'.format(chunktext)
+            normalized = self._normalize_text(project, chunktext)
+            if normalized == '':
+                continue
+            example = ' | {}'.format(normalized)
             result = self._model.predict(example)
             if self.algorithm == 'multilabel_oaa':
                 # result is a list of subject IDs - need to vectorize
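
PATCH 1/9 reorders the pipeline so that ChunkingBackend._analyze only splits
sentences into chunks, while each backend normalizes the chunk text itself
right before prediction (fastText in the new _predict_chunks, VW in
_analyze_chunks). A minimal sketch of the resulting chunk-then-normalize
order, using stand-in functions rather than Annif's real classes
(tokenize_words below approximates project.analyzer.tokenize_words):

    # Hypothetical sketch of the split introduced by PATCH 1/9.
    def make_chunks(sentences, chunksize):
        # mixins.py now only joins sentences; no normalization here
        return [' '.join(sentences[i:i + chunksize])
                for i in range(0, len(sentences), chunksize)]

    def normalize(text):
        # each backend normalizes just before predicting
        tokenize_words = str.split  # stand-in for the project analyzer
        return ' '.join(tokenize_words(text))

    chunks = make_chunks(['First sentence.', 'Second one.', 'Third.'], 2)
    normalized = [n for n in map(normalize, chunks) if n != '']

This keeps empty chunks away from the models while letting each backend
apply its own normalization rules.
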
From f77c4a98cb8a6c0b76b0fde602b65a856ff48e28 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Mon, 4 Feb 2019 16:13:49 +0200
Subject: [PATCH 2/9] Make it possible to use input from other
 projects/backends in vw_multi

---
 annif/backend/vw_multi.py | 56 ++++++++++++++++++++++++++++++++-------
 1 file changed, 46 insertions(+), 10 deletions(-)

diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index 738571956..9f6b286b6 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -6,7 +6,7 @@
 import annif.util
 from vowpalwabbit import pyvw
 import numpy as np
-from annif.hit import VectorAnalysisResult
+from annif.hit import ListAnalysisResult, VectorAnalysisResult
 from annif.exception import ConfigurationException, NotInitializedException
 from . import backend
 from . import mixins
@@ -23,7 +23,7 @@ class VWMultiBackend(mixins.ChunkingBackend, backend.AnnifBackend):
         # where allowed_values is either a type or a list of allowed values
         # and default_value may be None, to let VW decide by itself
         'bit_precision': (int, None),
-        'ngram': (int, None),
+        'ngram': (lambda x: '_{}'.format(int(x)), None),
         'learning_rate': (float, None),
         'loss_function': (['squared', 'logistic', 'hinge'], 'logistic'),
         'l1': (float, None),
@@ -35,6 +35,8 @@ class VWMultiBackend(mixins.ChunkingBackend, backend.AnnifBackend):
 
     DEFAULT_ALGORITHM = 'oaa'
     SUPPORTED_ALGORITHMS = ('oaa', 'ect', 'log_multi', 'multilabel_oaa')
 
+    DEFAULT_INPUTS = '_text_'
+
     MODEL_FILE = 'vw-model'
     TRAIN_FILE = 'vw-train.txt'
@@ -67,11 +69,20 @@ def algorithm(self):
             backend_id=self.backend_id)
         return algorithm
 
+    @property
+    def inputs(self):
+        inputs = self.params.get('inputs', self.DEFAULT_INPUTS)
+        return inputs.split(',')
+
+    @staticmethod
+    def _cleanup_text(text):
+        # colon and pipe chars have special meaning in VW and must be avoided
+        return text.replace(':', '').replace('|', '')
+
     @staticmethod
     def _normalize_text(project, text):
         ntext = ' '.join(project.analyzer.tokenize_words(text))
-        # colon and pipe chars have special meaning in VW and must be avoided
-        return ntext.replace(':', '').replace('|', '')
+        return VWMultiBackend._cleanup_text(ntext)
 
     @staticmethod
     def _write_train_file(examples, filename):
@@ -91,16 +102,39 @@ def _uris_to_subject_ids(project, uris):
     def _format_examples(self, project, text, uris):
         subject_ids = self._uris_to_subject_ids(project, uris)
         if self.algorithm == 'multilabel_oaa':
-            yield '{} | {}'.format(','.join(map(str, subject_ids)), text)
+            yield '{} {}'.format(','.join(map(str, subject_ids)), text)
         else:
             for subject_id in subject_ids:
-                yield '{} | {}'.format(subject_id + 1, text)
+                yield '{} {}'.format(subject_id + 1, text)
+
+    def _inputs_to_exampletext(self, project, text):
+        namespaces = {}
+        for input in self.inputs:
+            if input == '_text_':
+                normalized = self._normalize_text(project, text)
+                if normalized != '':
+                    namespaces['_text_'] = normalized
+            else:
+                proj = annif.project.get_project(input)
+                result = proj.analyze(text)
+                features = [
+                    '{}:{}'.format(
+                        self._cleanup_text(
+                            hit.uri),
+                        hit.score) for hit in result.hits]
+                namespaces[input] = ' '.join(features)
+        if not namespaces:
+            return None
+        return ' '.join(['|{} {}'.format(namespace, featurestr)
+                         for namespace, featurestr in namespaces.items()])
 
     def _create_train_file(self, corpus, project):
         self.info('creating VW train file')
         examples = []
         for doc in corpus.documents:
-            text = self._normalize_text(project, doc.text)
+            text = self._inputs_to_exampletext(project, doc.text)
+            if not text:
+                continue
             examples.extend(self._format_examples(project, text, doc.uris))
         random.shuffle(examples)
         annif.util.atomic_save(examples,
@@ -152,10 +186,10 @@ def train(self, corpus, project):
     def _analyze_chunks(self, chunktexts, project):
         results = []
         for chunktext in chunktexts:
-            normalized = self._normalize_text(project, chunktext)
-            if normalized == '':
+            exampletext = self._inputs_to_exampletext(project, chunktext)
+            if not exampletext:
                 continue
-            example = ' | {}'.format(normalized)
+            example = ' {}'.format(exampletext)
             result = self._model.predict(example)
             if self.algorithm == 'multilabel_oaa':
                 # result is a list of subject IDs - need to vectorize
@@ -170,5 +204,7 @@ def _analyze_chunks(self, chunktexts, project):
             else:
                 result = np.array(result)
             results.append(result)
+        if len(results) == 0:  # empty result
+            return ListAnalysisResult(hits=[], subject_index=project.subjects)
         return VectorAnalysisResult(
             np.array(results).mean(axis=0), project.subjects)
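
With PATCH 2/9 applied, a vw_multi backend can combine its own text
namespace with analysis results from other Annif projects via the new
inputs parameter. The hard-coded ' | ' separators disappear from
_format_examples because each namespace now carries its own |name prefix
inside the example text built by _inputs_to_exampletext. A rough
illustration of the strings this produces (the dummy-en project id is
borrowed from the test added in PATCH 3/9; the URI and score values are
invented for illustration):

    # Hypothetical _inputs_to_exampletext output for
    # inputs='_text_,dummy-en'. _cleanup_text has stripped ':' and '|'
    # from the URI, since those characters delimit feature values and
    # namespaces in VW input format; the remaining ':0.5' is the
    # intentional VW feature:value syntax.
    exampletext = ('|_text_ arkeologia kaivaukset '
                   '|dummy-en httpexample.orgdummy:0.5')
    # a one-vs-all training line for 0-based subject ID 41 would then be:
    train_example = '{} {}'.format(41 + 1, exampletext)
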
From c4d5ea4a9819a0efdc757f30fba1e8f0e9fa1433 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 5 Feb 2019 11:10:10 +0200
Subject: [PATCH 3/9] Add unit test for training vw_multi using input from
 another project

---
 tests/test_backend_vw_multi.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/test_backend_vw_multi.py b/tests/test_backend_vw_multi.py
index a0594ae2b..e204490f3 100644
--- a/tests/test_backend_vw_multi.py
+++ b/tests/test_backend_vw_multi.py
@@ -44,6 +44,22 @@ def test_vw_multi_train(datadir, document_corpus, project):
     assert datadir.join('vw-model').size() > 0
 
 
+def test_vw_multi_train_from_project(app, datadir, document_corpus, project):
+    vw_type = annif.backend.get_backend('vw_multi')
+    vw = vw_type(
+        backend_id='vw_multi',
+        params={
+            'chunksize': 4,
+            'inputs': '_text_,dummy-en'},
+        datadir=str(datadir))
+
+    with app.app_context():
+        vw.train(document_corpus, project)
+    assert vw._model is not None
+    assert datadir.join('vw-model').exists()
+    assert datadir.join('vw-model').size() > 0
+
+
 def test_vw_multi_train_multiple_passes(datadir, document_corpus, project):
     vw_type = annif.backend.get_backend('vw_multi')
     vw = vw_type(

From 99201e5664b023dda95695c52168fb1d25b02ac9 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 5 Feb 2019 12:27:39 +0200
Subject: [PATCH 4/9] Add empty vw_multi training doc to ensure 100% test
 coverage

---
 tests/test_backend_vw_multi.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_backend_vw_multi.py b/tests/test_backend_vw_multi.py
index e204490f3..359fcbdf7 100644
--- a/tests/test_backend_vw_multi.py
+++ b/tests/test_backend_vw_multi.py
@@ -13,7 +13,8 @@ def vw_corpus(tmpdir):
     """return a small document corpus for testing VW training"""
     tmpfile = tmpdir.join('document.tsv')
     tmpfile.write("nonexistent\thttp://example.com/nonexistent\n" +
-                  "arkeologia\thttp://www.yso.fi/onto/yso/p1265")
+                  "arkeologia\thttp://www.yso.fi/onto/yso/p1265\n" +
+                  "...\thttp://example.com/none")
     return annif.corpus.DocumentFile(str(tmpfile))
 
 

From 69906fcc998ba07e50e25bfc6196b11cd12c1af4 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 5 Feb 2019 12:32:43 +0200
Subject: [PATCH 5/9] refactor: simplify _predict_chunks using a filter
 expression

---
 annif/backend/fasttext.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/annif/backend/fasttext.py b/annif/backend/fasttext.py
index 609664a49..5c8b6627c 100644
--- a/annif/backend/fasttext.py
+++ b/annif/backend/fasttext.py
@@ -108,12 +108,9 @@ def train(self, corpus, project):
         self._create_model()
 
     def _predict_chunks(self, chunktexts, project, limit):
-        normalized_chunks = []
-        for chunktext in chunktexts:
-            normalized = self._normalize_text(project, chunktext)
-            if normalized != '':
-                normalized_chunks.append(normalized)
-        return self._model.predict(normalized_chunks, limit)
+        return self._model.predict(list(
+            filter(None, [self._normalize_text(project, chunktext)
+                          for chunktext in chunktexts])), limit)
 
     def _analyze_chunks(self, chunktexts, project):
         limit = int(self.params['limit'])
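
PATCH 5/9 leans on the filter(None, ...) idiom to drop chunks whose
normalization came out empty: filter with None as the predicate keeps only
truthy items, and the empty string is falsy. A quick standalone check of
that idiom, with invented chunk texts:

    normalized = ['some normalized text', '', 'another chunk', '']
    assert list(filter(None, normalized)) == ['some normalized text',
                                              'another chunk']
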
From b69034007e3dfd722fdbebd3f5bb775eee58e41b Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 5 Feb 2019 12:36:03 +0200
Subject: [PATCH 6/9] style fixes

---
 annif/backend/vw_multi.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index 9f6b286b6..9f0a2dfaf 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -118,11 +118,10 @@ def _inputs_to_exampletext(self, project, text):
                 proj = annif.project.get_project(input)
                 result = proj.analyze(text)
                 features = [
-                    '{}:{}'.format(
-                        self._cleanup_text(
-                            hit.uri),
-                        hit.score) for hit in result.hits]
-                namespaces[input] = ' '.join(features)
+                    '{}:{}'.format(self._cleanup_text(hit.uri), hit.score)
+                    for hit in result.hits]
+                if features:
+                    namespaces[input] = ' '.join(features)
         if not namespaces:
             return None
         return ' '.join(['|{} {}'.format(namespace, featurestr)
@@ -204,7 +203,7 @@ def _analyze_chunks(self, chunktexts, project):
             else:
                 result = np.array(result)
             results.append(result)
-        if len(results) == 0:  # empty result
+        if not results:  # empty result
             return ListAnalysisResult(hits=[], subject_index=project.subjects)
         return VectorAnalysisResult(
             np.array(results).mean(axis=0), project.subjects)

From 899e8871ad8564b085bbad1c87bd2a485387da5e Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 5 Feb 2019 12:38:44 +0200
Subject: [PATCH 7/9] refactor: split off result conversion from
 _analyze_chunks

---
 annif/backend/vw_multi.py | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index 9f0a2dfaf..df7cd9d87 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -182,6 +182,21 @@ def train(self, corpus, project):
         self._create_train_file(corpus, project)
         self._create_model(project)
 
+    def _convert_result(self, result, project):
+        if self.algorithm == 'multilabel_oaa':
+            # result is a list of subject IDs - need to vectorize
+            mask = np.zeros(len(project.subjects))
+            mask[result] = 1.0
+            return mask
+        elif isinstance(result, int):
+            # result is a single integer - need to one-hot-encode
+            mask = np.zeros(len(project.subjects))
+            mask[result - 1] = 1.0
+            return mask
+        else:
+            # result is a list of scores (probabilities or binary 1/0)
+            return np.array(result)
+
     def _analyze_chunks(self, chunktexts, project):
         results = []
         for chunktext in chunktexts:
@@ -190,19 +205,7 @@ def _analyze_chunks(self, chunktexts, project):
             exampletext = self._inputs_to_exampletext(project, chunktext)
             if not exampletext:
                 continue
             example = ' {}'.format(exampletext)
             result = self._model.predict(example)
-            if self.algorithm == 'multilabel_oaa':
-                # result is a list of subject IDs - need to vectorize
-                mask = np.zeros(len(project.subjects))
-                mask[result] = 1.0
-                result = mask
-            elif isinstance(result, int):
-                # result is a single integer - need to one-hot-encode
-                mask = np.zeros(len(project.subjects))
-                mask[result - 1] = 1.0
-                result = mask
-            else:
-                result = np.array(result)
-            results.append(result)
+            results.append(self._convert_result(result, project))
         if not results:  # empty result
             return ListAnalysisResult(hits=[], subject_index=project.subjects)
         return VectorAnalysisResult(
             np.array(results).mean(axis=0), project.subjects)
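
The _convert_result method extracted in PATCH 7/9 maps the three shapes a
VW prediction can take onto one dense score vector per chunk, so
_analyze_chunks can simply average them. A self-contained numpy sketch of
those same three conversions (the subject count and values are invented):

    import numpy as np

    n_subjects = 5  # stand-in for len(project.subjects)

    # multilabel_oaa: a list of 0-based subject IDs becomes a binary mask
    mask = np.zeros(n_subjects)
    mask[[0, 3]] = 1.0                 # -> [1., 0., 0., 1., 0.]

    # oaa/ect/log_multi: a single 1-based class ID is one-hot encoded
    onehot = np.zeros(n_subjects)
    onehot[2 - 1] = 1.0                # -> [0., 1., 0., 0., 0.]

    # probability outputs are already per-subject score lists
    scores = np.array([0.1, 0.2, 0.3, 0.25, 0.15])

    # the per-chunk vectors are then averaged into the final result
    final = np.array([mask, onehot, scores]).mean(axis=0)
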
From 7edf0f67cccc4f8389cc86ca65e2954c4870c9c7 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 5 Feb 2019 12:42:07 +0200
Subject: [PATCH 8/9] refactor _inputs_to_exampletext: split off _get_input

---
 annif/backend/vw_multi.py | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index df7cd9d87..8b6cd637e 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -107,21 +107,27 @@ def _format_examples(self, project, text, uris):
         for subject_id in subject_ids:
             yield '{} {}'.format(subject_id + 1, text)
 
+    def _get_input(self, input, project, text):
+        if input == '_text_':
+            normalized = self._normalize_text(project, text)
+            if normalized != '':
+                return normalized
+        else:
+            proj = annif.project.get_project(input)
+            result = proj.analyze(text)
+            features = [
+                '{}:{}'.format(self._cleanup_text(hit.uri), hit.score)
+                for hit in result.hits]
+            if features:
+                return ' '.join(features)
+        return None
+
     def _inputs_to_exampletext(self, project, text):
         namespaces = {}
         for input in self.inputs:
-            if input == '_text_':
-                normalized = self._normalize_text(project, text)
-                if normalized != '':
-                    namespaces['_text_'] = normalized
-            else:
-                proj = annif.project.get_project(input)
-                result = proj.analyze(text)
-                features = [
-                    '{}:{}'.format(self._cleanup_text(hit.uri), hit.score)
-                    for hit in result.hits]
-                if features:
-                    namespaces[input] = ' '.join(features)
+            inputtext = self._get_input(input, project, text)
+            if inputtext:
+                namespaces[input] = inputtext
         if not namespaces:
             return None
         return ' '.join(['|{} {}'.format(namespace, featurestr)

From 11943c7c6becef5691e8a394e3962137045135a5 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 5 Feb 2019 12:45:26 +0200
Subject: [PATCH 9/9] further simplify _get_input

---
 annif/backend/vw_multi.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index 8b6cd637e..1f6d47961 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -109,18 +109,14 @@ def _format_examples(self, project, text, uris):
 
     def _get_input(self, input, project, text):
         if input == '_text_':
-            normalized = self._normalize_text(project, text)
-            if normalized != '':
-                return normalized
+            return self._normalize_text(project, text)
         else:
             proj = annif.project.get_project(input)
             result = proj.analyze(text)
             features = [
                 '{}:{}'.format(self._cleanup_text(hit.uri), hit.score)
                 for hit in result.hits]
-            if features:
-                return ' '.join(features)
-            return None
+            return ' '.join(features)
 
     def _inputs_to_exampletext(self, project, text):
         namespaces = {}
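
After PATCH 9/9, _get_input may return an empty string, either from
normalizing empty text or from joining an empty feature list, instead of an
explicit None. That is safe because the caller's `if inputtext:` test in
_inputs_to_exampletext treats '' and None alike, which is what made the
removed checks redundant. A minimal illustration of that contract, with
invented namespace contents:

    namespaces = {}
    for name, inputtext in [('_text_', ''), ('dummy-en', 'httpuri:0.5')]:
        if inputtext:  # mirrors the caller: '' and None are both skipped
            namespaces[name] = inputtext
    assert namespaces == {'dummy-en': 'httpuri:0.5'}
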