From 112554db6a8ce2b1652a41fb8f9fa9d94905d309 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Mon, 24 Jun 2019 18:07:22 +0300 Subject: [PATCH 01/25] Make sure all projects are initialized when they are used for suggesting --- annif/backend/backend.py | 1 + annif/backend/tfidf.py | 1 - tests/test_project.py | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/annif/backend/backend.py b/annif/backend/backend.py index b1da11c7c..817b6b23f 100644 --- a/annif/backend/backend.py +++ b/annif/backend/backend.py @@ -38,6 +38,7 @@ def _suggest(self, text, project, params): def suggest(self, text, project, params=None): """Suggest subjects for the input text and return a list of subjects represented as a list of SubjectSuggestion objects.""" + self.initialize() beparams = dict(self.params) if params: beparams.update(params) diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py index 923066497..8e7f1dc5e 100644 --- a/annif/backend/tfidf.py +++ b/annif/backend/tfidf.py @@ -46,7 +46,6 @@ def train(self, corpus, project): self.INDEX_FILE) def _suggest(self, text, project, params): - self.initialize() self.debug('Suggesting subjects for text "{}..." (len={})'.format( text[:20], len(text))) vectors = project.vectorizer.transform([text]) diff --git a/tests/test_project.py b/tests/test_project.py index 758ba28b3..b70a41af4 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -163,7 +163,6 @@ def test_project_not_initialized(app): with app.app_context(): project = annif.project.get_project('dummy-en') assert not project.initialized - assert not project.backend.initialized def test_project_initialized(app_with_initialize): From a56be2667db153c87fc5cb03384389e2167ed5fb Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Mon, 24 Jun 2019 18:08:54 +0300 Subject: [PATCH 02/25] Initial implementation of vw_ensemble backend. 
Fixes #235 --- annif/backend/__init__.py | 6 +- annif/backend/ensemble.py | 10 +- annif/backend/mixins.py | 1 - annif/backend/vw_ensemble.py | 170 ++++++++++++++++++++++++++++++ annif/backend/vw_multi.py | 1 + tests/conftest.py | 12 +++ tests/test_backend_vw_ensemble.py | 56 ++++++++++ 7 files changed, 251 insertions(+), 5 deletions(-) create mode 100644 annif/backend/vw_ensemble.py create mode 100644 tests/test_backend_vw_ensemble.py diff --git a/annif/backend/__init__.py b/annif/backend/__init__.py index 1b5b21651..4a5fd82e2 100644 --- a/annif/backend/__init__.py +++ b/annif/backend/__init__.py @@ -38,6 +38,8 @@ def get_backend(backend_id): try: from . import vw_multi register_backend(vw_multi.VWMultiBackend) + from . import vw_ensemble + register_backend(vw_ensemble.VWEnsembleBackend) except ImportError: - annif.logger.debug( - "vowpalwabbit not available, not enabling vw_multi backend") + annif.logger.debug("vowpalwabbit not available, not enabling " + + "vw_multi & vw_ensemble backends") diff --git a/annif/backend/ensemble.py b/annif/backend/ensemble.py index 90d2712e7..90959979a 100644 --- a/annif/backend/ensemble.py +++ b/annif/backend/ensemble.py @@ -30,10 +30,16 @@ def _suggest_with_sources(self, text, sources): hits=norm_hits, weight=weight)) return hits_from_sources + def _merge_hits_from_sources(self, hits_from_sources, project, params): + """Hook for merging hits from sources. 
Can be overridden by + subclasses.""" + return annif.util.merge_hits(hits_from_sources, project.subjects) + def _suggest(self, text, project, params): sources = annif.util.parse_sources(params['sources']) hits_from_sources = self._suggest_with_sources(text, sources) - merged_hits = annif.util.merge_hits( - hits_from_sources, project.subjects) + merged_hits = self._merge_hits_from_sources(hits_from_sources, + project, + params) self.debug('{} hits after merging'.format(len(merged_hits))) return merged_hits diff --git a/annif/backend/mixins.py b/annif/backend/mixins.py index 976ed028c..04774a58c 100644 --- a/annif/backend/mixins.py +++ b/annif/backend/mixins.py @@ -16,7 +16,6 @@ def _suggest_chunks(self, chunktexts, project): pass # pragma: no cover def _suggest(self, text, project, params): - self.initialize() self.debug('Suggesting subjects for text "{}..." (len={})'.format( text[:20], len(text))) sentences = project.analyzer.tokenize_sentences(text) diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py new file mode 100644 index 000000000..c43242aac --- /dev/null +++ b/annif/backend/vw_ensemble.py @@ -0,0 +1,170 @@ +"""Annif backend using the Vowpal Wabbit multiclass and multilabel +classifiers""" + +import random +import os.path +import annif.util +from vowpalwabbit import pyvw +import numpy as np +from annif.suggestion import VectorSuggestionResult +from annif.exception import ConfigurationException, NotInitializedException +from . import backend +from . 
import ensemble + + +class VWEnsembleBackend( + ensemble.EnsembleBackend, + backend.AnnifLearningBackend): + """Vowpal Wabbit ensemble backend that combines results from multiple + projects and learns how well those projects/backends recognize + particular subjects.""" + + name = "vw_ensemble" + + VW_PARAMS = { + # each param specifier is a pair (allowed_values, default_value) + # where allowed_values is either a type or a list of allowed values + # and default_value may be None, to let VW decide by itself + 'bit_precision': (int, None), + 'learning_rate': (float, None), + 'loss_function': (['squared', 'logistic', 'hinge'], 'squared'), + 'l1': (float, None), + 'l2': (float, None), + 'passes': (int, None) + } + + MODEL_FILE = 'vw-model' + TRAIN_FILE = 'vw-train.txt' + + # defaults for uninitialized instances + _model = None + + def initialize(self): + if self._model is None: + path = os.path.join(self.datadir, self.MODEL_FILE) + if not os.path.exists(path): + raise NotInitializedException( + 'model {} not found'.format(path), + backend_id=self.backend_id) + self.debug('loading VW model from {}'.format(path)) + params = self._create_params({'i': path, 'quiet': True}) + if 'passes' in params: + # don't confuse the model with passes + del params['passes'] + self.debug("model parameters: {}".format(params)) + self._model = pyvw.vw(**params) + self.debug('loaded model {}'.format(str(self._model))) + + @staticmethod + def _write_train_file(examples, filename): + with open(filename, 'w', encoding='utf-8') as trainfile: + for ex in examples: + print(ex, file=trainfile) + + def _merge_hits_from_sources(self, hits_from_sources, project, params): + score_vector = np.array([hits.vector + for hits, _ in hits_from_sources]) + result = np.zeros(score_vector.shape[1]) + for subj_id in range(score_vector.shape[1]): + if score_vector[:, subj_id].sum() > 0.0: + ex = self._format_example( + subj_id, + score_vector[:, subj_id]) + score = (self._model.predict(ex) + 1.0) / 2.0 + 
result[subj_id] = score + return VectorSuggestionResult(result, project.subjects) + + def _format_example(self, subject_id, scores, true=None): + if true is None: + val = '' + elif true: + val = 1 + else: + val = -1 + ex = "{} |{}".format(val, subject_id) + for proj_idx, proj in enumerate(self.source_project_ids): + ex += " {}:{}".format(proj, scores[proj_idx]) + return ex + + @property + def source_project_ids(self): + sources = annif.util.parse_sources(self.params['sources']) + return [project_id for project_id, _ in sources] + + def _create_examples(self, corpus, project): + source_projects = [annif.project.get_project(project_id) + for project_id in self.source_project_ids] + examples = [] + for doc in corpus.documents: + subjects = annif.corpus.SubjectSet((doc.uris, doc.labels)) + true = subjects.as_vector(project.subjects) + score_vectors = [] + for source_project in source_projects: + hits = source_project.suggest(doc.text) + score_vectors.append(hits.vector) + score_vector = np.array(score_vectors) + for subj_id in range(len(true)): + if true[subj_id] or score_vector[:, subj_id].sum() > 0.0: + ex = self._format_example( + subj_id, + score_vector[:, subj_id], + true[subj_id]) + examples.append(ex) + random.shuffle(examples) + return examples + + def _create_train_file(self, corpus, project): + self.info('creating VW train file') + examples = self._create_examples(corpus, project) + annif.util.atomic_save(examples, + self.datadir, + self.TRAIN_FILE, + method=self._write_train_file) + + def _convert_param(self, param, val): + pspec, _ = self.VW_PARAMS[param] + if isinstance(pspec, list): + if val in pspec: + return val + raise ConfigurationException( + "{} is not a valid value for {} (allowed: {})".format( + val, param, ', '.join(pspec)), backend_id=self.backend_id) + try: + return pspec(val) + except ValueError: + raise ConfigurationException( + "The {} value {} cannot be converted to {}".format( + param, val, pspec), backend_id=self.backend_id) + + def 
_create_params(self, params): + params.update({param: defaultval + for param, (_, defaultval) in self.VW_PARAMS.items() + if defaultval is not None}) + params.update({param: self._convert_param(param, val) + for param, val in self.params.items() + if param in self.VW_PARAMS}) + return params + + def _create_model(self, project): + trainpath = os.path.join(self.datadir, self.TRAIN_FILE) + params = self._create_params( + {'data': trainpath, 'q': '::'}) + if params.get('passes', 1) > 1: + # need a cache file when there are multiple passes + params.update({'cache': True, 'kill_cache': True}) + self.debug("model parameters: {}".format(params)) + self._model = pyvw.vw(**params) + modelpath = os.path.join(self.datadir, self.MODEL_FILE) + self._model.save(modelpath) + + def train(self, corpus, project): + self.info("creating VW ensemble model") + self._create_train_file(corpus, project) + self._create_model(project) + + def learn(self, corpus, project): + self.initialize() + for example in self._create_examples(corpus, project): + self._model.learn(example) + modelpath = os.path.join(self.datadir, self.MODEL_FILE) + self._model.save(modelpath) diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py index 287704d02..e4006128f 100644 --- a/annif/backend/vw_multi.py +++ b/annif/backend/vw_multi.py @@ -189,6 +189,7 @@ def train(self, corpus, project): self._create_model(project) def learn(self, corpus, project): + self.initialize() for example in self._create_examples(corpus, project): self._model.learn(example) modelpath = os.path.join(self.datadir, self.MODEL_FILE) diff --git a/tests/conftest.py b/tests/conftest.py index d10c14d1e..c965d3fac 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -73,6 +73,18 @@ def document_corpus(subject_index): return doc_corpus +@pytest.fixture(scope='module') +def fulltext_corpus(subject_index): + docdir = os.path.join( + os.path.dirname(__file__), + 'corpora', + 'archaeology', + 'fulltext') + ft_corpus = 
annif.corpus.DocumentDirectory(docdir) + ft_corpus.set_subject_index(subject_index) + return ft_corpus + + @pytest.fixture(scope='module') def project(document_corpus): proj = unittest.mock.Mock() diff --git a/tests/test_backend_vw_ensemble.py b/tests/test_backend_vw_ensemble.py new file mode 100644 index 000000000..b9308bffb --- /dev/null +++ b/tests/test_backend_vw_ensemble.py @@ -0,0 +1,56 @@ +"""Unit tests for the vw_ensemble backend in Annif""" + +import pytest +import annif.backend +import annif.corpus + +pytest.importorskip("annif.backend.vw_ensemble") + + +def test_vw_ensemble_train(app, datadir, tmpdir, fulltext_corpus, project): + vw_ensemble_type = annif.backend.get_backend("vw_ensemble") + vw_ensemble = vw_ensemble_type( + backend_id='vw_ensemble', + params={'sources': 'tfidf-fi'}, + datadir=str(datadir)) + + with app.app_context(): + vw_ensemble.train(fulltext_corpus, project) + assert datadir.join('vw-train.txt').exists() + assert datadir.join('vw-train.txt').size() > 0 + assert datadir.join('vw-model').exists() + assert datadir.join('vw-model').size() > 0 + + +def test_vw_ensemble_initialize(app, datadir): + vw_ensemble_type = annif.backend.get_backend("vw_ensemble") + vw_ensemble = vw_ensemble_type( + backend_id='vw_ensemble', + params={'sources': 'tfidf-fi'}, + datadir=str(datadir)) + + assert vw_ensemble._model is None + with app.app_context(): + vw_ensemble.initialize() + assert vw_ensemble._model is not None + # initialize a second time - this shouldn't do anything + with app.app_context(): + vw_ensemble.initialize() + + +def test_vw_ensemble_suggest(app, datadir, project): + vw_ensemble_type = annif.backend.get_backend("vw_ensemble") + vw_ensemble = vw_ensemble_type( + backend_id='vw_ensemble', + params={'sources': 'tfidf-fi'}, + datadir=str(datadir)) + + results = vw_ensemble.suggest("""Arkeologiaa sanotaan joskus myös + muinaistutkimukseksi tai muinaistieteeksi. 
Se on humanistinen tiede + tai oikeammin joukko tieteitä, jotka tutkivat ihmisen menneisyyttä. + Tutkimusta tehdään analysoimalla muinaisjäännöksiä eli niitä jälkiä, + joita ihmisten toiminta on jättänyt maaperään tai vesistöjen + pohjaan.""", project) + + assert vw_ensemble._model is not None + assert len(results) > 0 From a6079a0a315fbae6b4f6aa4dee0efe4d8bdfcd15 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 26 Jun 2019 14:12:33 +0300 Subject: [PATCH 03/25] fix test failure caused by using an uninitialized tfidf-fi project (when starting with a clean datadir) --- tests/test_backend_vw_ensemble.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/tests/test_backend_vw_ensemble.py b/tests/test_backend_vw_ensemble.py index b9308bffb..bc091d78e 100644 --- a/tests/test_backend_vw_ensemble.py +++ b/tests/test_backend_vw_ensemble.py @@ -3,19 +3,27 @@ import pytest import annif.backend import annif.corpus +import annif.project pytest.importorskip("annif.backend.vw_ensemble") -def test_vw_ensemble_train(app, datadir, tmpdir, fulltext_corpus, project): +def test_vw_ensemble_train(app, datadir, tmpdir): vw_ensemble_type = annif.backend.get_backend("vw_ensemble") vw_ensemble = vw_ensemble_type( backend_id='vw_ensemble', - params={'sources': 'tfidf-fi'}, + params={'sources': 'dummy-en'}, datadir=str(datadir)) + tmpfile = tmpdir.join('document.tsv') + tmpfile.write("dummy\thttp://example.org/dummy\n" + + "another\thttp://example.org/dummy\n" + + "none\thttp://example.org/none") + document_corpus = annif.corpus.DocumentFile(str(tmpfile)) + project = annif.project.get_project('dummy-en') + with app.app_context(): - vw_ensemble.train(fulltext_corpus, project) + vw_ensemble.train(document_corpus, project) assert datadir.join('vw-train.txt').exists() assert datadir.join('vw-train.txt').size() > 0 assert datadir.join('vw-model').exists() @@ -26,7 +34,7 @@ def test_vw_ensemble_initialize(app, datadir): vw_ensemble_type = 
annif.backend.get_backend("vw_ensemble") vw_ensemble = vw_ensemble_type( backend_id='vw_ensemble', - params={'sources': 'tfidf-fi'}, + params={'sources': 'dummy-en'}, datadir=str(datadir)) assert vw_ensemble._model is None @@ -38,13 +46,15 @@ def test_vw_ensemble_initialize(app, datadir): vw_ensemble.initialize() -def test_vw_ensemble_suggest(app, datadir, project): +def test_vw_ensemble_suggest(app, datadir): vw_ensemble_type = annif.backend.get_backend("vw_ensemble") vw_ensemble = vw_ensemble_type( backend_id='vw_ensemble', - params={'sources': 'tfidf-fi'}, + params={'sources': 'dummy-en'}, datadir=str(datadir)) + project = annif.project.get_project('dummy-en') + results = vw_ensemble.suggest("""Arkeologiaa sanotaan joskus myös muinaistutkimukseksi tai muinaistieteeksi. Se on humanistinen tiede tai oikeammin joukko tieteitä, jotka tutkivat ihmisen menneisyyttä. From a3579d6acc2d38157a6b725f6a55dae57e344a38 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 26 Jun 2019 14:26:47 +0300 Subject: [PATCH 04/25] remove unused fixture --- tests/conftest.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index c965d3fac..d10c14d1e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -73,18 +73,6 @@ def document_corpus(subject_index): return doc_corpus -@pytest.fixture(scope='module') -def fulltext_corpus(subject_index): - docdir = os.path.join( - os.path.dirname(__file__), - 'corpora', - 'archaeology', - 'fulltext') - ft_corpus = annif.corpus.DocumentDirectory(docdir) - ft_corpus.set_subject_index(subject_index) - return ft_corpus - - @pytest.fixture(scope='module') def project(document_corpus): proj = unittest.mock.Mock() From 9d46450738b85d6191c252b351c7890389bafd31 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 26 Jun 2019 15:02:13 +0300 Subject: [PATCH 05/25] introduce common base class for VW backends --- annif/backend/vw_base.py | 9 +++++++++ annif/backend/vw_ensemble.py | 4 ++-- 
annif/backend/vw_multi.py | 6 +++--- 3 files changed, 14 insertions(+), 5 deletions(-) create mode 100644 annif/backend/vw_base.py diff --git a/annif/backend/vw_base.py b/annif/backend/vw_base.py new file mode 100644 index 000000000..798a3ef07 --- /dev/null +++ b/annif/backend/vw_base.py @@ -0,0 +1,9 @@ +"""Base class for Vowpal Wabbit based Annif backends""" + +from . import backend + + +class VWBaseBackend(backend.AnnifLearningBackend): + """Base class for Vowpal Wabbit based Annif backends""" + + pass diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index c43242aac..e46012ea4 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -8,13 +8,13 @@ import numpy as np from annif.suggestion import VectorSuggestionResult from annif.exception import ConfigurationException, NotInitializedException -from . import backend +from . import vw_base from . import ensemble class VWEnsembleBackend( ensemble.EnsembleBackend, - backend.AnnifLearningBackend): + vw_base.VWBaseBackend): """Vowpal Wabbit ensemble backend that combines results from multiple projects and learns how well those projects/backends recognize particular subjects.""" diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py index e4006128f..dbf8a97a5 100644 --- a/annif/backend/vw_multi.py +++ b/annif/backend/vw_multi.py @@ -8,12 +8,12 @@ import numpy as np from annif.suggestion import ListSuggestionResult, VectorSuggestionResult from annif.exception import ConfigurationException, NotInitializedException -from . import backend +from . import vw_base from . 
import mixins -class VWMultiBackend(mixins.ChunkingBackend, backend.AnnifLearningBackend): - """Vorpal Wabbit multiclass/multilabel backend for Annif""" +class VWMultiBackend(mixins.ChunkingBackend, vw_base.VWBaseBackend): + """Vowpal Wabbit multiclass/multilabel backend for Annif""" name = "vw_multi" needs_subject_index = True From 9503aa824b65fb0f6783e2d65e224538e84068a9 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 26 Jun 2019 15:09:17 +0300 Subject: [PATCH 06/25] refactor: move initialize method to vw_base --- annif/backend/vw_base.py | 25 ++++++++++++++++++++++++- annif/backend/vw_ensemble.py | 24 +----------------------- annif/backend/vw_multi.py | 24 +----------------------- 3 files changed, 26 insertions(+), 47 deletions(-) diff --git a/annif/backend/vw_base.py b/annif/backend/vw_base.py index 798a3ef07..e4ce9c8d5 100644 --- a/annif/backend/vw_base.py +++ b/annif/backend/vw_base.py @@ -1,9 +1,32 @@ """Base class for Vowpal Wabbit based Annif backends""" +import os +from vowpalwabbit import pyvw +from annif.exception import NotInitializedException from . 
import backend class VWBaseBackend(backend.AnnifLearningBackend): """Base class for Vowpal Wabbit based Annif backends""" - pass + MODEL_FILE = 'vw-model' + TRAIN_FILE = 'vw-train.txt' + + # defaults for uninitialized instances + _model = None + + def initialize(self): + if self._model is None: + path = os.path.join(self.datadir, self.MODEL_FILE) + if not os.path.exists(path): + raise NotInitializedException( + 'model {} not found'.format(path), + backend_id=self.backend_id) + self.debug('loading VW model from {}'.format(path)) + params = self._create_params({'i': path, 'quiet': True}) + if 'passes' in params: + # don't confuse the model with passes + del params['passes'] + self.debug("model parameters: {}".format(params)) + self._model = pyvw.vw(**params) + self.debug('loaded model {}'.format(str(self._model))) diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index e46012ea4..3f8626a6b 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -7,7 +7,7 @@ from vowpalwabbit import pyvw import numpy as np from annif.suggestion import VectorSuggestionResult -from annif.exception import ConfigurationException, NotInitializedException +from annif.exception import ConfigurationException from . import vw_base from . 
import ensemble @@ -33,28 +33,6 @@ class VWEnsembleBackend( 'passes': (int, None) } - MODEL_FILE = 'vw-model' - TRAIN_FILE = 'vw-train.txt' - - # defaults for uninitialized instances - _model = None - - def initialize(self): - if self._model is None: - path = os.path.join(self.datadir, self.MODEL_FILE) - if not os.path.exists(path): - raise NotInitializedException( - 'model {} not found'.format(path), - backend_id=self.backend_id) - self.debug('loading VW model from {}'.format(path)) - params = self._create_params({'i': path, 'quiet': True}) - if 'passes' in params: - # don't confuse the model with passes - del params['passes'] - self.debug("model parameters: {}".format(params)) - self._model = pyvw.vw(**params) - self.debug('loaded model {}'.format(str(self._model))) - @staticmethod def _write_train_file(examples, filename): with open(filename, 'w', encoding='utf-8') as trainfile: diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py index dbf8a97a5..ca0c3016a 100644 --- a/annif/backend/vw_multi.py +++ b/annif/backend/vw_multi.py @@ -7,7 +7,7 @@ from vowpalwabbit import pyvw import numpy as np from annif.suggestion import ListSuggestionResult, VectorSuggestionResult -from annif.exception import ConfigurationException, NotInitializedException +from annif.exception import ConfigurationException from . import vw_base from . 
import mixins @@ -37,28 +37,6 @@ class VWMultiBackend(mixins.ChunkingBackend, vw_base.VWBaseBackend): DEFAULT_INPUTS = '_text_' - MODEL_FILE = 'vw-model' - TRAIN_FILE = 'vw-train.txt' - - # defaults for uninitialized instances - _model = None - - def initialize(self): - if self._model is None: - path = os.path.join(self.datadir, self.MODEL_FILE) - if not os.path.exists(path): - raise NotInitializedException( - 'model {} not found'.format(path), - backend_id=self.backend_id) - self.debug('loading VW model from {}'.format(path)) - params = self._create_params({'i': path, 'quiet': True}) - if 'passes' in params: - # don't confuse the model with passes - del params['passes'] - self.debug("model parameters: {}".format(params)) - self._model = pyvw.vw(**params) - self.debug('loaded model {}'.format(str(self._model))) - @property def algorithm(self): algorithm = self.params.get('algorithm', self.DEFAULT_ALGORITHM) From 098e02ac8ff55efa47ed784ec0ac307e7a53d69a Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 26 Jun 2019 15:15:42 +0300 Subject: [PATCH 07/25] Refactor: move parameter handling to vw_base --- annif/backend/vw_base.py | 31 +++++++++++++++++++++++++++++++ annif/backend/vw_ensemble.py | 28 ---------------------------- annif/backend/vw_multi.py | 27 --------------------------- 3 files changed, 31 insertions(+), 55 deletions(-) diff --git a/annif/backend/vw_base.py b/annif/backend/vw_base.py index e4ce9c8d5..5600c5d56 100644 --- a/annif/backend/vw_base.py +++ b/annif/backend/vw_base.py @@ -2,6 +2,7 @@ import os from vowpalwabbit import pyvw +from annif.exception import ConfigurationException from annif.exception import NotInitializedException from . 
import backend @@ -9,6 +10,12 @@ class VWBaseBackend(backend.AnnifLearningBackend): """Base class for Vowpal Wabbit based Annif backends""" + # Parameters for VW based backends + # each param specifier is a pair (allowed_values, default_value) + # where allowed_values is either a type or a list of allowed values + # and default_value may be None, to let VW decide by itself + VW_PARAMS = {} # needs to be specified in subclasses + MODEL_FILE = 'vw-model' TRAIN_FILE = 'vw-train.txt' @@ -30,3 +37,27 @@ def initialize(self): self.debug("model parameters: {}".format(params)) self._model = pyvw.vw(**params) self.debug('loaded model {}'.format(str(self._model))) + + def _convert_param(self, param, val): + pspec, _ = self.VW_PARAMS[param] + if isinstance(pspec, list): + if val in pspec: + return val + raise ConfigurationException( + "{} is not a valid value for {} (allowed: {})".format( + val, param, ', '.join(pspec)), backend_id=self.backend_id) + try: + return pspec(val) + except ValueError: + raise ConfigurationException( + "The {} value {} cannot be converted to {}".format( + param, val, pspec), backend_id=self.backend_id) + + def _create_params(self, params): + params.update({param: defaultval + for param, (_, defaultval) in self.VW_PARAMS.items() + if defaultval is not None}) + params.update({param: self._convert_param(param, val) + for param, val in self.params.items() + if param in self.VW_PARAMS}) + return params diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index 3f8626a6b..e39987c4a 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -7,7 +7,6 @@ from vowpalwabbit import pyvw import numpy as np from annif.suggestion import VectorSuggestionResult -from annif.exception import ConfigurationException from . import vw_base from . 
import ensemble @@ -22,9 +21,6 @@ class VWEnsembleBackend( name = "vw_ensemble" VW_PARAMS = { - # each param specifier is a pair (allowed_values, default_value) - # where allowed_values is either a type or a list of allowed values - # and default_value may be None, to let VW decide by itself 'bit_precision': (int, None), 'learning_rate': (float, None), 'loss_function': (['squared', 'logistic', 'hinge'], 'squared'), @@ -99,30 +95,6 @@ def _create_train_file(self, corpus, project): self.TRAIN_FILE, method=self._write_train_file) - def _convert_param(self, param, val): - pspec, _ = self.VW_PARAMS[param] - if isinstance(pspec, list): - if val in pspec: - return val - raise ConfigurationException( - "{} is not a valid value for {} (allowed: {})".format( - val, param, ', '.join(pspec)), backend_id=self.backend_id) - try: - return pspec(val) - except ValueError: - raise ConfigurationException( - "The {} value {} cannot be converted to {}".format( - param, val, pspec), backend_id=self.backend_id) - - def _create_params(self, params): - params.update({param: defaultval - for param, (_, defaultval) in self.VW_PARAMS.items() - if defaultval is not None}) - params.update({param: self._convert_param(param, val) - for param, val in self.params.items() - if param in self.VW_PARAMS}) - return params - def _create_model(self, project): trainpath = os.path.join(self.datadir, self.TRAIN_FILE) params = self._create_params( diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py index ca0c3016a..252f6717f 100644 --- a/annif/backend/vw_multi.py +++ b/annif/backend/vw_multi.py @@ -19,9 +19,6 @@ class VWMultiBackend(mixins.ChunkingBackend, vw_base.VWBaseBackend): needs_subject_index = True VW_PARAMS = { - # each param specifier is a pair (allowed_values, default_value) - # where allowed_values is either a type or a list of allowed values - # and default_value may be None, to let VW decide by itself 'bit_precision': (int, None), 'ngram': (lambda x: '_{}'.format(int(x)), None), 
'learning_rate': (float, None), @@ -125,30 +122,6 @@ def _create_train_file(self, corpus, project): self.TRAIN_FILE, method=self._write_train_file) - def _convert_param(self, param, val): - pspec, _ = self.VW_PARAMS[param] - if isinstance(pspec, list): - if val in pspec: - return val - raise ConfigurationException( - "{} is not a valid value for {} (allowed: {})".format( - val, param, ', '.join(pspec)), backend_id=self.backend_id) - try: - return pspec(val) - except ValueError: - raise ConfigurationException( - "The {} value {} cannot be converted to {}".format( - param, val, pspec), backend_id=self.backend_id) - - def _create_params(self, params): - params.update({param: defaultval - for param, (_, defaultval) in self.VW_PARAMS.items() - if defaultval is not None}) - params.update({param: self._convert_param(param, val) - for param, val in self.params.items() - if param in self.VW_PARAMS}) - return params - def _create_model(self, project): self.info('creating VW model (algorithm: {})'.format(self.algorithm)) trainpath = os.path.join(self.datadir, self.TRAIN_FILE) From 745601619c5086c9ce8db86dd809f07f5d5e8fa0 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 26 Jun 2019 15:30:03 +0300 Subject: [PATCH 08/25] Refactor: move learn method to vw_base (and make it an abstract base class) --- annif/backend/vw_base.py | 17 ++++++++++++++++- annif/backend/vw_ensemble.py | 7 ------- annif/backend/vw_multi.py | 7 ------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/annif/backend/vw_base.py b/annif/backend/vw_base.py index 5600c5d56..cf84ae03a 100644 --- a/annif/backend/vw_base.py +++ b/annif/backend/vw_base.py @@ -1,5 +1,6 @@ """Base class for Vowpal Wabbit based Annif backends""" +import abc import os from vowpalwabbit import pyvw from annif.exception import ConfigurationException @@ -7,7 +8,7 @@ from . 
import backend -class VWBaseBackend(backend.AnnifLearningBackend): +class VWBaseBackend(backend.AnnifLearningBackend, metaclass=abc.ABCMeta): """Base class for Vowpal Wabbit based Annif backends""" # Parameters for VW based backends @@ -61,3 +62,17 @@ def _create_params(self, params): for param, val in self.params.items() if param in self.VW_PARAMS}) return params + + @abc.abstractmethod + def _create_examples(self, corpus, project): + """This method should be implemented by concrete backends. It + should return a sequence of strings formatted according to the VW + input format.""" + pass + + def learn(self, corpus, project): + self.initialize() + for example in self._create_examples(corpus, project): + self._model.learn(example) + modelpath = os.path.join(self.datadir, self.MODEL_FILE) + self._model.save(modelpath) diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index e39987c4a..a63cbd864 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -111,10 +111,3 @@ def train(self, corpus, project): self.info("creating VW ensemble model") self._create_train_file(corpus, project) self._create_model(project) - - def learn(self, corpus, project): - self.initialize() - for example in self._create_examples(corpus, project): - self._model.learn(example) - modelpath = os.path.join(self.datadir, self.MODEL_FILE) - self._model.save(modelpath) diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py index 252f6717f..bd4432c9d 100644 --- a/annif/backend/vw_multi.py +++ b/annif/backend/vw_multi.py @@ -139,13 +139,6 @@ def train(self, corpus, project): self._create_train_file(corpus, project) self._create_model(project) - def learn(self, corpus, project): - self.initialize() - for example in self._create_examples(corpus, project): - self._model.learn(example) - modelpath = os.path.join(self.datadir, self.MODEL_FILE) - self._model.save(modelpath) - def _convert_result(self, result, project): if self.algorithm == 
'multilabel_oaa': # result is a list of subject IDs - need to vectorize From b53f7c7901d22df637aaf47032fc5c04ad15be61 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 26 Jun 2019 16:05:16 +0300 Subject: [PATCH 09/25] Refactor: move train method to vw_base --- annif/backend/vw_base.py | 26 ++++++++++++++++++++++++++ annif/backend/vw_ensemble.py | 19 ------------------- annif/backend/vw_multi.py | 20 +------------------- 3 files changed, 27 insertions(+), 38 deletions(-) diff --git a/annif/backend/vw_base.py b/annif/backend/vw_base.py index cf84ae03a..232b14729 100644 --- a/annif/backend/vw_base.py +++ b/annif/backend/vw_base.py @@ -3,6 +3,7 @@ import abc import os from vowpalwabbit import pyvw +import annif.util from annif.exception import ConfigurationException from annif.exception import NotInitializedException from . import backend @@ -63,12 +64,37 @@ def _create_params(self, params): if param in self.VW_PARAMS}) return params + @staticmethod + def _write_train_file(examples, filename): + with open(filename, 'w', encoding='utf-8') as trainfile: + for ex in examples: + print(ex, file=trainfile) + + def _create_train_file(self, corpus, project): + self.info('creating VW train file') + examples = self._create_examples(corpus, project) + annif.util.atomic_save(examples, + self.datadir, + self.TRAIN_FILE, + method=self._write_train_file) + @abc.abstractmethod def _create_examples(self, corpus, project): """This method should be implemented by concrete backends. It should return a sequence of strings formatted according to the VW input format.""" pass + + @abc.abstractmethod + def _create_model(self, project): + """This method should be implemented by concrete backends. 
It + should create an empty (untrained) VW model and save it to disk.""" + pass + + def train(self, corpus, project): + self.info("creating VW model") + self._create_train_file(corpus, project) + self._create_model(project) def learn(self, corpus, project): self.initialize() diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index a63cbd864..0dd35c9e4 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -29,12 +29,6 @@ class VWEnsembleBackend( 'passes': (int, None) } - @staticmethod - def _write_train_file(examples, filename): - with open(filename, 'w', encoding='utf-8') as trainfile: - for ex in examples: - print(ex, file=trainfile) - def _merge_hits_from_sources(self, hits_from_sources, project, params): score_vector = np.array([hits.vector for hits, _ in hits_from_sources]) @@ -87,14 +81,6 @@ def _create_examples(self, corpus, project): random.shuffle(examples) return examples - def _create_train_file(self, corpus, project): - self.info('creating VW train file') - examples = self._create_examples(corpus, project) - annif.util.atomic_save(examples, - self.datadir, - self.TRAIN_FILE, - method=self._write_train_file) - def _create_model(self, project): trainpath = os.path.join(self.datadir, self.TRAIN_FILE) params = self._create_params( @@ -106,8 +92,3 @@ def _create_model(self, project): self._model = pyvw.vw(**params) modelpath = os.path.join(self.datadir, self.MODEL_FILE) self._model.save(modelpath) - - def train(self, corpus, project): - self.info("creating VW ensemble model") - self._create_train_file(corpus, project) - self._create_model(project) diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py index bd4432c9d..89f4027af 100644 --- a/annif/backend/vw_multi.py +++ b/annif/backend/vw_multi.py @@ -3,9 +3,9 @@ import random import os.path -import annif.util from vowpalwabbit import pyvw import numpy as np +import annif.project from annif.suggestion import ListSuggestionResult, 
VectorSuggestionResult from annif.exception import ConfigurationException from . import vw_base @@ -59,12 +59,6 @@ def _normalize_text(project, text): ntext = ' '.join(project.analyzer.tokenize_words(text)) return VWMultiBackend._cleanup_text(ntext) - @staticmethod - def _write_train_file(examples, filename): - with open(filename, 'w', encoding='utf-8') as trainfile: - for ex in examples: - print(ex, file=trainfile) - @staticmethod def _uris_to_subject_ids(project, uris): subject_ids = [] @@ -114,14 +108,6 @@ def _create_examples(self, corpus, project): random.shuffle(examples) return examples - def _create_train_file(self, corpus, project): - self.info('creating VW train file') - examples = self._create_examples(corpus, project) - annif.util.atomic_save(examples, - self.datadir, - self.TRAIN_FILE, - method=self._write_train_file) - def _create_model(self, project): self.info('creating VW model (algorithm: {})'.format(self.algorithm)) trainpath = os.path.join(self.datadir, self.TRAIN_FILE) @@ -135,10 +121,6 @@ def _create_model(self, project): modelpath = os.path.join(self.datadir, self.MODEL_FILE) self._model.save(modelpath) - def train(self, corpus, project): - self._create_train_file(corpus, project) - self._create_model(project) - def _convert_result(self, result, project): if self.algorithm == 'multilabel_oaa': # result is a list of subject IDs - need to vectorize From 393b5076bb399ee83d8d9428eb83a4af6a18ae93 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 26 Jun 2019 16:17:24 +0300 Subject: [PATCH 10/25] fix whitespace --- annif/backend/vw_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/backend/vw_base.py b/annif/backend/vw_base.py index 232b14729..09a9a2fc3 100644 --- a/annif/backend/vw_base.py +++ b/annif/backend/vw_base.py @@ -95,7 +95,7 @@ def train(self, corpus, project): self.info("creating VW model") self._create_train_file(corpus, project) self._create_model(project) - + def learn(self, corpus, project): 
self.initialize() for example in self._create_examples(corpus, project): From 78d206ba00e93ce4248cc837e96d973103651ea2 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 26 Jun 2019 16:25:57 +0300 Subject: [PATCH 11/25] Refactor _create_examples to reduce(?) its complexity --- annif/backend/vw_ensemble.py | 45 ++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index 0dd35c9e4..788329053 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -42,6 +42,11 @@ def _merge_hits_from_sources(self, hits_from_sources, project, params): result[subj_id] = score return VectorSuggestionResult(result, project.subjects) + @property + def _source_project_ids(self): + sources = annif.util.parse_sources(self.params['sources']) + return [project_id for project_id, _ in sources] + def _format_example(self, subject_id, scores, true=None): if true is None: val = '' @@ -50,34 +55,34 @@ def _format_example(self, subject_id, scores, true=None): else: val = -1 ex = "{} |{}".format(val, subject_id) - for proj_idx, proj in enumerate(self.source_project_ids): + for proj_idx, proj in enumerate(self._source_project_ids): ex += " {}:{}".format(proj, scores[proj_idx]) return ex - @property - def source_project_ids(self): - sources = annif.util.parse_sources(self.params['sources']) - return [project_id for project_id, _ in sources] + def _doc_to_example(self, doc, project, source_projects): + examples = [] + subjects = annif.corpus.SubjectSet((doc.uris, doc.labels)) + true = subjects.as_vector(project.subjects) + score_vectors = [] + for source_project in source_projects: + hits = source_project.suggest(doc.text) + score_vectors.append(hits.vector) + score_vector = np.array(score_vectors) + for subj_id in range(len(true)): + if true[subj_id] or score_vector[:, subj_id].sum() > 0.0: + ex = self._format_example( + subj_id, + score_vector[:, subj_id], + 
true[subj_id]) + examples.append(ex) + return examples def _create_examples(self, corpus, project): source_projects = [annif.project.get_project(project_id) - for project_id in self.source_project_ids] + for project_id in self._source_project_ids] examples = [] for doc in corpus.documents: - subjects = annif.corpus.SubjectSet((doc.uris, doc.labels)) - true = subjects.as_vector(project.subjects) - score_vectors = [] - for source_project in source_projects: - hits = source_project.suggest(doc.text) - score_vectors.append(hits.vector) - score_vector = np.array(score_vectors) - for subj_id in range(len(true)): - if true[subj_id] or score_vector[:, subj_id].sum() > 0.0: - ex = self._format_example( - subj_id, - score_vector[:, subj_id], - true[subj_id]) - examples.append(ex) + examples += self._doc_to_example(doc, project, source_projects) random.shuffle(examples) return examples From c0cc1c7a7fb11116436c3577faebff155ff937d0 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Thu, 27 Jun 2019 12:37:27 +0300 Subject: [PATCH 12/25] Add missing import (why did it work before?) 
--- annif/backend/vw_ensemble.py | 1 + 1 file changed, 1 insertion(+) diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index 788329053..5d61ddab0 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -4,6 +4,7 @@ import random import os.path import annif.util +import annif.project from vowpalwabbit import pyvw import numpy as np from annif.suggestion import VectorSuggestionResult From ff05920684c4d7426f88e7e76d326d9e01b5684a Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Thu, 27 Jun 2019 13:57:04 +0300 Subject: [PATCH 13/25] Refactor: move _create_model to vw_base; disable quadratic features for vw_ensemble (no improvement in results) --- annif/backend/vw_base.py | 16 +++++++++++----- annif/backend/vw_ensemble.py | 12 ------------ annif/backend/vw_multi.py | 11 +---------- 3 files changed, 12 insertions(+), 27 deletions(-) diff --git a/annif/backend/vw_base.py b/annif/backend/vw_base.py index 09a9a2fc3..b172fd653 100644 --- a/annif/backend/vw_base.py +++ b/annif/backend/vw_base.py @@ -85,11 +85,17 @@ def _create_examples(self, corpus, project): input format.""" pass - @abc.abstractmethod - def _create_model(self, project): - """This method should be implemented by concrete backends. 
It - should create an empty (untrained) VW model and save it to disk.""" - pass + def _create_model(self, project, initial_params={}): + trainpath = os.path.join(self.datadir, self.TRAIN_FILE) + initial_params['data'] = trainpath + params = self._create_params(initial_params) + if params.get('passes', 1) > 1: + # need a cache file when there are multiple passes + params.update({'cache': True, 'kill_cache': True}) + self.debug("model parameters: {}".format(params)) + self._model = pyvw.vw(**params) + modelpath = os.path.join(self.datadir, self.MODEL_FILE) + self._model.save(modelpath) def train(self, corpus, project): self.info("creating VW model") diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index 5d61ddab0..bc87d16e1 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -86,15 +86,3 @@ def _create_examples(self, corpus, project): examples += self._doc_to_example(doc, project, source_projects) random.shuffle(examples) return examples - - def _create_model(self, project): - trainpath = os.path.join(self.datadir, self.TRAIN_FILE) - params = self._create_params( - {'data': trainpath, 'q': '::'}) - if params.get('passes', 1) > 1: - # need a cache file when there are multiple passes - params.update({'cache': True, 'kill_cache': True}) - self.debug("model parameters: {}".format(params)) - self._model = pyvw.vw(**params) - modelpath = os.path.join(self.datadir, self.MODEL_FILE) - self._model.save(modelpath) diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py index 89f4027af..2511e5504 100644 --- a/annif/backend/vw_multi.py +++ b/annif/backend/vw_multi.py @@ -110,16 +110,7 @@ def _create_examples(self, corpus, project): def _create_model(self, project): self.info('creating VW model (algorithm: {})'.format(self.algorithm)) - trainpath = os.path.join(self.datadir, self.TRAIN_FILE) - params = self._create_params( - {'data': trainpath, self.algorithm: len(project.subjects)}) - if params.get('passes', 1) > 1: 
- # need a cache file when there are multiple passes - params.update({'cache': True, 'kill_cache': True}) - self.debug("model parameters: {}".format(params)) - self._model = pyvw.vw(**params) - modelpath = os.path.join(self.datadir, self.MODEL_FILE) - self._model.save(modelpath) + super()._create_model(project, {self.algorithm: len(project.subjects)}) def _convert_result(self, result, project): if self.algorithm == 'multilabel_oaa': From 9f25308cf0b205e315d6eb0b545e7d3d0d966ccd Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Thu, 27 Jun 2019 14:08:59 +0300 Subject: [PATCH 14/25] Add "pragma: no cover" annotation for abstract method --- annif/backend/vw_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/backend/vw_base.py b/annif/backend/vw_base.py index b172fd653..cc9c32991 100644 --- a/annif/backend/vw_base.py +++ b/annif/backend/vw_base.py @@ -83,7 +83,7 @@ def _create_examples(self, corpus, project): """This method should be implemented by concrete backends. 
It should return a sequence of strings formatted according to the VW input format.""" - pass + pass # pragma: no cover def _create_model(self, project, initial_params={}): trainpath = os.path.join(self.datadir, self.TRAIN_FILE) From 2b81385b5618a81aee182e2c40df9ac3e46e3919 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 28 Jun 2019 10:42:16 +0300 Subject: [PATCH 15/25] Add discounting mechanism to vw_ensemble backend --- annif/backend/vw_ensemble.py | 83 +++++++++++++++++++++++++++++-- tests/test_backend_vw_ensemble.py | 2 + 2 files changed, 81 insertions(+), 4 deletions(-) diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index bc87d16e1..774ce3610 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -1,12 +1,15 @@ """Annif backend using the Vowpal Wabbit multiclass and multilabel classifiers""" +import collections +import json import random import os.path import annif.util import annif.project from vowpalwabbit import pyvw import numpy as np +from annif.exception import NotInitializedException from annif.suggestion import VectorSuggestionResult from . import vw_base from . import ensemble @@ -30,6 +33,37 @@ class VWEnsembleBackend( 'passes': (int, None) } + # number of training examples per subject, stored as a collections.Counter + _subject_freq = None + + FREQ_FILE = 'subject-freq.json' + + # The discount rate affects how quickly the ensemble starts to trust its + # own judgement when the amount of training data increases, versus using + # a simple mean of scores. A higher value will mean that the model + # adapts quicker (and possibly makes more errors) while a lower value + # will make it more careful so that it will require more training data. 
+ DEFAULT_DISCOUNT_RATE = 0.01 + + def initialize(self): + if self._subject_freq is None: + path = os.path.join(self.datadir, self.FREQ_FILE) + if not os.path.exists(path): + raise NotInitializedException( + 'frequency file {} not found'.format(path), + backend_id=self.backend_id) + self.debug('loading concept frequencies from {}'.format(path)) + with open(path) as freqf: + # The Counter was serialized like a dictionary, need to + # convert it back. Keys that became strings need to be turned + # back into integers. + self._subject_freq = collections.Counter() + for cid, freq in json.load(freqf).items(): + self._subject_freq[int(cid)] = freq + self.debug('loaded frequencies for {} concepts'.format( + len(self._subject_freq))) + super().initialize() + def _merge_hits_from_sources(self, hits_from_sources, project, params): score_vector = np.array([hits.vector for hits, _ in hits_from_sources]) @@ -39,8 +73,14 @@ def _merge_hits_from_sources(self, hits_from_sources, project, params): ex = self._format_example( subj_id, score_vector[:, subj_id]) - score = (self._model.predict(ex) + 1.0) / 2.0 - result[subj_id] = score + discount_rate = self.params.get( + 'discount_rate', self.DEFAULT_DISCOUNT_RATE) + raw_weight = 1.0 / \ + ((discount_rate * self._subject_freq[subj_id]) + 1) + raw_score = score_vector[:, subj_id].mean() + pred_score = (self._model.predict(ex) + 1.0) / 2.0 + result[subj_id] = (raw_weight * raw_score) + \ + (1.0 - raw_weight) * pred_score return VectorSuggestionResult(result, project.subjects) @property @@ -71,10 +111,10 @@ def _doc_to_example(self, doc, project, source_projects): score_vector = np.array(score_vectors) for subj_id in range(len(true)): if true[subj_id] or score_vector[:, subj_id].sum() > 0.0: - ex = self._format_example( + ex = (subj_id, self._format_example( subj_id, score_vector[:, subj_id], - true[subj_id]) + true[subj_id])) examples.append(ex) return examples @@ -86,3 +126,38 @@ def _create_examples(self, corpus, project): examples += 
self._doc_to_example(doc, project, source_projects) random.shuffle(examples) return examples + + @staticmethod + def _write_freq_file(subject_freq, filename): + with open(filename, 'w') as freqfile: + json.dump(subject_freq, freqfile) + + def _create_train_file(self, corpus, project): + self.info('creating VW train file') + exampledata = self._create_examples(corpus, project) + + subjects = [subj_id for subj_id, ex in exampledata] + self._subject_freq = collections.Counter(subjects) + annif.util.atomic_save(self._subject_freq, + self.datadir, + self.FREQ_FILE, + method=self._write_freq_file) + + examples = [ex for subj_id, ex in exampledata] + annif.util.atomic_save(examples, + self.datadir, + self.TRAIN_FILE, + method=self._write_train_file) + + def learn(self, corpus, project): + self.initialize() + exampledata = self._create_examples(corpus, project) + for subj_id, example in exampledata: + self._model.learn(example) + self._subject_freq[subj_id] += 1 + modelpath = os.path.join(self.datadir, self.MODEL_FILE) + self._model.save(modelpath) + annif.util.atomic_save(self._subject_freq, + self.datadir, + self.FREQ_FILE, + method=self._write_freq_file) diff --git a/tests/test_backend_vw_ensemble.py b/tests/test_backend_vw_ensemble.py index bc091d78e..ad73b7de2 100644 --- a/tests/test_backend_vw_ensemble.py +++ b/tests/test_backend_vw_ensemble.py @@ -26,6 +26,8 @@ def test_vw_ensemble_train(app, datadir, tmpdir): vw_ensemble.train(document_corpus, project) assert datadir.join('vw-train.txt').exists() assert datadir.join('vw-train.txt').size() > 0 + assert datadir.join('subject-freq.json').exists() + assert datadir.join('subject-freq.json').size() > 0 assert datadir.join('vw-model').exists() assert datadir.join('vw-model').size() > 0 From dfaaf6d0fe31ee72f19c155287e99123afec40be Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 28 Jun 2019 11:11:25 +0300 Subject: [PATCH 16/25] add more tests for vw_ensemble --- tests/test_backend_vw_ensemble.py | 33 
++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/tests/test_backend_vw_ensemble.py b/tests/test_backend_vw_ensemble.py index ad73b7de2..239b21073 100644 --- a/tests/test_backend_vw_ensemble.py +++ b/tests/test_backend_vw_ensemble.py @@ -1,14 +1,28 @@ """Unit tests for the vw_ensemble backend in Annif""" +import json +import time import pytest import annif.backend import annif.corpus import annif.project +from annif.exception import NotInitializedException pytest.importorskip("annif.backend.vw_ensemble") -def test_vw_ensemble_train(app, datadir, tmpdir): +def test_vw_ensemble_suggest_no_model(datadir, project): + vw_ensemble_type = annif.backend.get_backend('vw_ensemble') + vw_ensemble = vw_ensemble_type( + backend_id='vw_ensemble', + params={'sources': 'dummy-en'}, + datadir=str(datadir)) + + with pytest.raises(NotInitializedException): + results = vw_ensemble.suggest("example text", project) + + +def test_vw_ensemble_train_and_learn(app, datadir, tmpdir): vw_ensemble_type = annif.backend.get_backend("vw_ensemble") vw_ensemble = vw_ensemble_type( backend_id='vw_ensemble', @@ -31,6 +45,23 @@ def test_vw_ensemble_train(app, datadir, tmpdir): assert datadir.join('vw-model').exists() assert datadir.join('vw-model').size() > 0 + # test online learning + modelfile = datadir.join('vw-model') + freqfile = datadir.join('subject-freq.json') + + old_size = modelfile.size() + old_mtime = modelfile.mtime() + with open(str(freqfile)) as freqf: + old_totalfreq = sum(json.load(freqf).values()) + + time.sleep(0.1) # make sure the timestamp has a chance to increase + + vw_ensemble.learn(document_corpus, project) + + assert modelfile.size() != old_size or modelfile.mtime() != old_mtime + with open(str(freqfile)) as freqf: + assert sum(json.load(freqf).values()) != old_totalfreq + def test_vw_ensemble_initialize(app, datadir): vw_ensemble_type = annif.backend.get_backend("vw_ensemble") From b844f6f6bb0ae393b46439cbbbbf336c783b5e41 Mon Sep 17 
00:00:00 2001 From: Osma Suominen Date: Fri, 28 Jun 2019 11:23:31 +0300 Subject: [PATCH 17/25] Refactor: split initialize method in vw_ensemble --- annif/backend/vw_ensemble.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index 774ce3610..72224384a 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -45,23 +45,26 @@ class VWEnsembleBackend( # will make it more careful so that it will require more training data. DEFAULT_DISCOUNT_RATE = 0.01 + def _load_subject_freq(self): + path = os.path.join(self.datadir, self.FREQ_FILE) + if not os.path.exists(path): + raise NotInitializedException( + 'frequency file {} not found'.format(path), + backend_id=self.backend_id) + self.debug('loading concept frequencies from {}'.format(path)) + with open(path) as freqf: + # The Counter was serialized like a dictionary, need to + # convert it back. Keys that became strings need to be turned + # back into integers. + self._subject_freq = collections.Counter() + for cid, freq in json.load(freqf).items(): + self._subject_freq[int(cid)] = freq + self.debug('loaded frequencies for {} concepts'.format( + len(self._subject_freq))) + def initialize(self): if self._subject_freq is None: - path = os.path.join(self.datadir, self.FREQ_FILE) - if not os.path.exists(path): - raise NotInitializedException( - 'frequency file {} not found'.format(path), - backend_id=self.backend_id) - self.debug('loading concept frequencies from {}'.format(path)) - with open(path) as freqf: - # The Counter was serialized like a dictionary, need to - # convert it back. Keys that became strings need to be turned - # back into integers. 
- self._subject_freq = collections.Counter() - for cid, freq in json.load(freqf).items(): - self._subject_freq[int(cid)] = freq - self.debug('loaded frequencies for {} concepts'.format( - len(self._subject_freq))) + self._load_subject_freq() super().initialize() def _merge_hits_from_sources(self, hits_from_sources, project, params): From 9a1345c5b73bb5226a3e155d4ce42593ca4bb344 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 28 Jun 2019 11:25:56 +0300 Subject: [PATCH 18/25] Refactor: split _doc_to_example method in vw_ensemble --- annif/backend/vw_ensemble.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index 72224384a..e25b43964 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -103,15 +103,18 @@ def _format_example(self, subject_id, scores, true=None): ex += " {}:{}".format(proj, scores[proj_idx]) return ex - def _doc_to_example(self, doc, project, source_projects): - examples = [] - subjects = annif.corpus.SubjectSet((doc.uris, doc.labels)) - true = subjects.as_vector(project.subjects) + def _doc_score_vector(self, doc, source_projects): score_vectors = [] for source_project in source_projects: hits = source_project.suggest(doc.text) score_vectors.append(hits.vector) - score_vector = np.array(score_vectors) + return np.array(score_vectors) + + def _doc_to_example(self, doc, project, source_projects): + examples = [] + subjects = annif.corpus.SubjectSet((doc.uris, doc.labels)) + true = subjects.as_vector(project.subjects) + score_vector = self._doc_score_vector(doc, source_projects) for subj_id in range(len(true)): if true[subj_id] or score_vector[:, subj_id].sum() > 0.0: ex = (subj_id, self._format_example( From 34dc5a40b5657cb77c3ae9d02a3c1662c3a09454 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 28 Jun 2019 11:33:28 +0300 Subject: [PATCH 19/25] Refactor: split _merge_hits_from_sources in vw_ensemble --- 
annif/backend/vw_ensemble.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index e25b43964..13f188041 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -67,21 +67,25 @@ def initialize(self): self._load_subject_freq() super().initialize() + def _calculate_scores(self, subj_id, subj_score_vector): + ex = self._format_example(subj_id, subj_score_vector) + raw_score = subj_score_vector.mean() + pred_score = (self._model.predict(ex) + 1.0) / 2.0 + return raw_score, pred_score + def _merge_hits_from_sources(self, hits_from_sources, project, params): score_vector = np.array([hits.vector for hits, _ in hits_from_sources]) + discount_rate = self.params.get('discount_rate', + self.DEFAULT_DISCOUNT_RATE) result = np.zeros(score_vector.shape[1]) for subj_id in range(score_vector.shape[1]): - if score_vector[:, subj_id].sum() > 0.0: - ex = self._format_example( - subj_id, - score_vector[:, subj_id]) - discount_rate = self.params.get( - 'discount_rate', self.DEFAULT_DISCOUNT_RATE) + subj_score_vector = score_vector[:, subj_id] + if subj_score_vector.sum() > 0.0: + raw_score, pred_score = self._calculate_scores( + subj_id, subj_score_vector) raw_weight = 1.0 / \ ((discount_rate * self._subject_freq[subj_id]) + 1) - raw_score = score_vector[:, subj_id].mean() - pred_score = (self._model.predict(ex) + 1.0) / 2.0 result[subj_id] = (raw_weight * raw_score) + \ (1.0 - raw_weight) * pred_score return VectorSuggestionResult(result, project.subjects) From b162b50a8f56c1a02277c27eb7a96e526ad3675a Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 28 Jun 2019 12:56:55 +0300 Subject: [PATCH 20/25] Refactor: ensure dicts passed as function parameters are not mutated --- annif/backend/vw_base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/annif/backend/vw_base.py b/annif/backend/vw_base.py index cc9c32991..ffc310c07 100644 --- 
a/annif/backend/vw_base.py +++ b/annif/backend/vw_base.py @@ -56,6 +56,7 @@ def _convert_param(self, param, val): param, val, pspec), backend_id=self.backend_id) def _create_params(self, params): + params = params.copy() # don't mutate the original dict params.update({param: defaultval for param, (_, defaultval) in self.VW_PARAMS.items() if defaultval is not None}) @@ -86,6 +87,7 @@ def _create_examples(self, corpus, project): pass # pragma: no cover def _create_model(self, project, initial_params={}): + initial_params = initial_params.copy() # don't mutate the original trainpath = os.path.join(self.datadir, self.TRAIN_FILE) initial_params['data'] = trainpath params = self._create_params(initial_params) From 4d7e0f799feae97b7aa47725c438c168c9c55e61 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 28 Jun 2019 12:57:31 +0300 Subject: [PATCH 21/25] remove unused imports --- annif/backend/vw_multi.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py index 2511e5504..3c3aef9e7 100644 --- a/annif/backend/vw_multi.py +++ b/annif/backend/vw_multi.py @@ -2,8 +2,6 @@ classifiers""" import random -import os.path -from vowpalwabbit import pyvw import numpy as np import annif.project from annif.suggestion import ListSuggestionResult, VectorSuggestionResult From 0db12a3a2709fd67f48fc4665b445881969aa4bb Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 28 Jun 2019 12:57:55 +0300 Subject: [PATCH 22/25] remove unused imports --- annif/backend/vw_ensemble.py | 1 - 1 file changed, 1 deletion(-) diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index 13f188041..0e556886a 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -7,7 +7,6 @@ import os.path import annif.util import annif.project -from vowpalwabbit import pyvw import numpy as np from annif.exception import NotInitializedException from annif.suggestion import VectorSuggestionResult From 
d28f9dea959d814a940e356670c9cc913ffcae81 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 28 Jun 2019 13:34:45 +0300 Subject: [PATCH 23/25] add API documentation templates for the new vw_base and vw_ensemble modules --- docs/source/annif.backend.rst | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/docs/source/annif.backend.rst b/docs/source/annif.backend.rst index 2b9cef414..484335b82 100644 --- a/docs/source/annif.backend.rst +++ b/docs/source/annif.backend.rst @@ -65,8 +65,24 @@ annif.backend.tfidf module :undoc-members: :show-inheritance: -annif.backend.vw\_multi module ------------------------------- +annif.backend.vw_base module +---------------------------- + +.. automodule:: annif.backend.vw_base + :members: + :undoc-members: + :show-inheritance: + +annif.backend.vw_ensemble module +-------------------------------- + +.. automodule:: annif.backend.vw_ensemble + :members: + :undoc-members: + :show-inheritance: + +annif.backend.vw_multi module +----------------------------- ..
automodule:: annif.backend.vw_multi :members: From d3a26a498a02ced7dfafab48fd3a6122f2f5c32a Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 28 Jun 2019 15:23:33 +0300 Subject: [PATCH 24/25] Avoid scientific notation for weight values in VW train file (test to verify) --- annif/backend/vw_ensemble.py | 2 +- tests/test_backend_vw_ensemble.py | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index 0e556886a..9950e43de 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -103,7 +103,7 @@ def _format_example(self, subject_id, scores, true=None): val = -1 ex = "{} |{}".format(val, subject_id) for proj_idx, proj in enumerate(self._source_project_ids): - ex += " {}:{}".format(proj, scores[proj_idx]) + ex += " {}:{:.6f}".format(proj, scores[proj_idx]) return ex def _doc_score_vector(self, doc, source_projects): diff --git a/tests/test_backend_vw_ensemble.py b/tests/test_backend_vw_ensemble.py index 239b21073..3f36384be 100644 --- a/tests/test_backend_vw_ensemble.py +++ b/tests/test_backend_vw_ensemble.py @@ -97,3 +97,25 @@ def test_vw_ensemble_suggest(app, datadir): assert vw_ensemble._model is not None assert len(results) > 0 + + +def test_vw_ensemble_format_example(datadir): + vw_ensemble_type = annif.backend.get_backend("vw_ensemble") + vw_ensemble = vw_ensemble_type( + backend_id='vw_ensemble', + params={'sources': 'dummy-en'}, + datadir=str(datadir)) + + ex = vw_ensemble._format_example(0, [0.5]) + assert ex == ' |0 dummy-en:0.500000' + + +def test_vw_ensemble_format_example_avoid_sci_notation(datadir): + vw_ensemble_type = annif.backend.get_backend("vw_ensemble") + vw_ensemble = vw_ensemble_type( + backend_id='vw_ensemble', + params={'sources': 'dummy-en'}, + datadir=str(datadir)) + + ex = vw_ensemble._format_example(0, [7.24e-05]) + assert ex == ' |0 dummy-en:0.000072' From 1741cd7f751f72573b389b43ab6b9be8dfce99d2 Mon Sep 17 00:00:00 
2001 From: Osma Suominen Date: Fri, 28 Jun 2019 16:04:31 +0300 Subject: [PATCH 25/25] Bugfix: parse discount_rate into a float (with test to verify) --- annif/backend/vw_ensemble.py | 4 ++-- tests/test_backend_vw_ensemble.py | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index 9950e43de..94cf28fa7 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -75,8 +75,8 @@ def _calculate_scores(self, subj_id, subj_score_vector): def _merge_hits_from_sources(self, hits_from_sources, project, params): score_vector = np.array([hits.vector for hits, _ in hits_from_sources]) - discount_rate = self.params.get('discount_rate', - self.DEFAULT_DISCOUNT_RATE) + discount_rate = float(self.params.get('discount_rate', + self.DEFAULT_DISCOUNT_RATE)) result = np.zeros(score_vector.shape[1]) for subj_id in range(score_vector.shape[1]): subj_score_vector = score_vector[:, subj_id] diff --git a/tests/test_backend_vw_ensemble.py b/tests/test_backend_vw_ensemble.py index 3f36384be..23b48ff53 100644 --- a/tests/test_backend_vw_ensemble.py +++ b/tests/test_backend_vw_ensemble.py @@ -99,6 +99,25 @@ def test_vw_ensemble_suggest(app, datadir): assert len(results) > 0 +def test_vw_ensemble_suggest_set_discount_rate(app, datadir): + vw_ensemble_type = annif.backend.get_backend("vw_ensemble") + vw_ensemble = vw_ensemble_type( + backend_id='vw_ensemble', + params={'sources': 'dummy-en', 'discount_rate': '0.02'}, + datadir=str(datadir)) + + project = annif.project.get_project('dummy-en') + + results = vw_ensemble.suggest("""Arkeologiaa sanotaan joskus myös + muinaistutkimukseksi tai muinaistieteeksi. Se on humanistinen tiede + tai oikeammin joukko tieteitä, jotka tutkivat ihmisen menneisyyttä. 
+ Tutkimusta tehdään analysoimalla muinaisjäännöksiä eli niitä jälkiä, + joita ihmisten toiminta on jättänyt maaperään tai vesistöjen + pohjaan.""", project) + + assert len(results) > 0 + + def test_vw_ensemble_format_example(datadir): vw_ensemble_type = annif.backend.get_backend("vw_ensemble") vw_ensemble = vw_ensemble_type(