From 4a28abc59c7aeb279ef2c0587f83192525d0648d Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 3 Dec 2019 09:25:23 +0200 Subject: [PATCH 1/3] Remove vw_ensemble backend. Fixes #362 --- annif/backend/__init__.py | 4 +- annif/backend/vw_ensemble.py | 182 ------------------------------ tests/test_backend_vw_ensemble.py | 157 -------------------------- 3 files changed, 1 insertion(+), 342 deletions(-) delete mode 100644 annif/backend/vw_ensemble.py delete mode 100644 tests/test_backend_vw_ensemble.py diff --git a/annif/backend/__init__.py b/annif/backend/__init__.py index 8a3e52876..901cb4929 100644 --- a/annif/backend/__init__.py +++ b/annif/backend/__init__.py @@ -40,11 +40,9 @@ def get_backend(backend_id): try: from . import vw_multi register_backend(vw_multi.VWMultiBackend) - from . import vw_ensemble - register_backend(vw_ensemble.VWEnsembleBackend) except ImportError: annif.logger.debug("vowpalwabbit not available, not enabling " + - "vw_multi & vw_ensemble backends") + "vw_multi backend") try: from . import nn_ensemble diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py deleted file mode 100644 index e941817cf..000000000 --- a/annif/backend/vw_ensemble.py +++ /dev/null @@ -1,182 +0,0 @@ -"""Annif backend using the Vowpal Wabbit multiclass and multilabel -classifiers""" - -import collections -import json -import random -import os.path -import annif.util -import annif.project -import numpy as np -from annif.exception import NotInitializedException -from annif.suggestion import VectorSuggestionResult -from . import backend -from . import vw_base -from . import ensemble - - -class VWEnsembleBackend( - ensemble.EnsembleBackend, - vw_base.VWBaseBackend): - """Vowpal Wabbit ensemble backend that combines results from multiple - projects and learns how well those projects/backends recognize - particular subjects.""" - - name = "vw_ensemble" - - VW_PARAMS = { - 'bit_precision': (int, None), - 'learning_rate': (float, None), - 'loss_function': (['squared', 'logistic', 'hinge'], 'squared'), - 'l1': (float, None), - 'l2': (float, None), - 'passes': (int, None) - } - - # number of training examples per subject, stored as a collections.Counter - _subject_freq = None - - FREQ_FILE = 'subject-freq.json' - - # The discount rate affects how quickly the ensemble starts to trust its - # own judgement when the amount of training data increases, versus using - # a simple mean of scores. A higher value will mean that the model - # adapts quicker (and possibly makes more errors) while a lower value - # will make it more careful so that it will require more training data. - - DEFAULT_PARAMS = {'discount_rate': 0.01} - - def default_params(self): - params = backend.AnnifBackend.DEFAULT_PARAMS.copy() - params.update(self.DEFAULT_PARAMS) - params.update({param: default_val - for param, (_, default_val) in self.VW_PARAMS.items() - if default_val is not None}) - return params - - def _load_subject_freq(self): - path = os.path.join(self.datadir, self.FREQ_FILE) - if not os.path.exists(path): - raise NotInitializedException( - 'frequency file {} not found'.format(path), - backend_id=self.backend_id) - self.debug('loading concept frequencies from {}'.format(path)) - with open(path) as freqf: - # The Counter was serialized like a dictionary, need to - # convert it back. Keys that became strings need to be turned - # back into integers. - self._subject_freq = collections.Counter() - for cid, freq in json.load(freqf).items(): - self._subject_freq[int(cid)] = freq - self.debug('loaded frequencies for {} concepts'.format( - len(self._subject_freq))) - - def initialize(self): - if self._subject_freq is None: - self._load_subject_freq() - super().initialize() - - def _calculate_scores(self, subj_id, subj_score_vector): - ex = self._format_example(subj_id, subj_score_vector) - raw_score = subj_score_vector.mean() - pred_score = (self._model.predict(ex) + 1.0) / 2.0 - return raw_score, pred_score - - def _merge_hits_from_sources(self, hits_from_sources, params): - score_vector = np.array([hits.vector - for hits, _ in hits_from_sources], - dtype=np.float32) - discount_rate = float(self.params['discount_rate']) - result = np.zeros(score_vector.shape[1], dtype=np.float32) - for subj_id in range(score_vector.shape[1]): - subj_score_vector = score_vector[:, subj_id] - if subj_score_vector.sum() > 0.0: - raw_score, pred_score = self._calculate_scores( - subj_id, subj_score_vector) - raw_weight = 1.0 / \ - ((discount_rate * self._subject_freq[subj_id]) + 1) - result[subj_id] = (raw_weight * raw_score) + \ - (1.0 - raw_weight) * pred_score - return VectorSuggestionResult(result, self.project.subjects) - - @property - def _source_project_ids(self): - sources = annif.util.parse_sources(self.params['sources']) - return [project_id for project_id, _ in sources] - - def _format_example(self, subject_id, scores, true=None): - if true is None: - val = '' - elif true: - val = 1 - else: - val = -1 - ex = "{} |{}".format(val, subject_id) - for proj_idx, proj in enumerate(self._source_project_ids): - ex += " {}:{:.6f}".format(proj, scores[proj_idx]) - return ex - - def _doc_score_vector(self, doc, source_projects): - score_vectors = [] - for source_project in source_projects: - hits = source_project.suggest(doc.text) - score_vectors.append(hits.vector) - return np.array(score_vectors, dtype=np.float32) - - def _doc_to_example(self, doc, source_projects): - examples = [] - subjects = annif.corpus.SubjectSet((doc.uris, doc.labels)) - true = subjects.as_vector(self.project.subjects) - score_vector = self._doc_score_vector(doc, source_projects) - for subj_id in range(len(true)): - if true[subj_id] or score_vector[:, subj_id].sum() > 0.0: - ex = (subj_id, self._format_example( - subj_id, - score_vector[:, subj_id], - true[subj_id])) - examples.append(ex) - return examples - - def _create_examples(self, corpus): - source_projects = [annif.project.get_project(project_id) - for project_id in self._source_project_ids] - examples = [] - for doc in corpus.documents: - examples += self._doc_to_example(doc, source_projects) - random.shuffle(examples) - return examples - - @staticmethod - def _write_freq_file(subject_freq, filename): - with open(filename, 'w') as freqfile: - json.dump(subject_freq, freqfile) - - def _create_train_file(self, corpus): - self.info('creating VW train file') - exampledata = self._create_examples(corpus) - - subjects = [subj_id for subj_id, ex in exampledata] - self._subject_freq = collections.Counter(subjects) - annif.util.atomic_save(self._subject_freq, - self.datadir, - self.FREQ_FILE, - method=self._write_freq_file) - - examples = [ex for subj_id, ex in exampledata] - annif.util.atomic_save(examples, - self.datadir, - self.TRAIN_FILE, - method=self._write_train_file) - - def learn(self, corpus): - self.initialize() - exampledata = self._create_examples(corpus) - for subj_id, example in exampledata: - self._model.learn(example) - self._subject_freq[subj_id] += 1 - modelpath = os.path.join(self.datadir, self.MODEL_FILE) - self._model.save(modelpath) - annif.util.atomic_save(self._subject_freq, - self.datadir, - self.FREQ_FILE, - method=self._write_freq_file) diff --git a/tests/test_backend_vw_ensemble.py b/tests/test_backend_vw_ensemble.py deleted file mode 100644 index b711051c7..000000000 --- a/tests/test_backend_vw_ensemble.py +++ /dev/null @@ -1,157 +0,0 @@ -"""Unit tests for the vw_ensemble backend in Annif""" - -import json -import time -import pytest -import py.path -import annif.backend -import annif.corpus -import annif.project -from annif.exception import NotInitializedException - -pytest.importorskip("annif.backend.vw_ensemble") - - -def test_vw_ensemble_default_params(project): - vw_type = annif.backend.get_backend("vw_ensemble") - vw = vw_type( - backend_id='vw_ensemble', - config_params={}, - project=project) - - expected_default_params = { - 'limit': 100, - 'discount_rate': 0.01, - 'loss_function': 'squared', - } - actual_params = vw.params - for param, val in expected_default_params.items(): - assert param in actual_params and actual_params[param] == val - - -def test_vw_ensemble_suggest_no_model(project): - vw_ensemble_type = annif.backend.get_backend('vw_ensemble') - vw_ensemble = vw_ensemble_type( - backend_id='vw_ensemble', - config_params={'sources': 'dummy-en'}, - project=project) - - with pytest.raises(NotInitializedException): - results = vw_ensemble.suggest("example text") - - -def test_vw_ensemble_train_and_learn(app, tmpdir): - project = annif.project.get_project('dummy-en') - vw_ensemble_type = annif.backend.get_backend("vw_ensemble") - vw_ensemble = vw_ensemble_type( - backend_id='vw_ensemble', - config_params={'sources': 'dummy-en'}, - project=project) - - tmpfile = tmpdir.join('document.tsv') - tmpfile.write("dummy\thttp://example.org/dummy\n" + - "another\thttp://example.org/dummy\n" + - "none\thttp://example.org/none") - document_corpus = annif.corpus.DocumentFile(str(tmpfile)) - - with app.app_context(): - vw_ensemble.train(document_corpus) - datadir = py.path.local(project.datadir) - assert datadir.join('vw-train.txt').exists() - assert datadir.join('vw-train.txt').size() > 0 - assert datadir.join('subject-freq.json').exists() - assert datadir.join('subject-freq.json').size() > 0 - assert datadir.join('vw-model').exists() - assert datadir.join('vw-model').size() > 0 - - # test online learning - modelfile = datadir.join('vw-model') - freqfile = datadir.join('subject-freq.json') - - old_size = modelfile.size() - old_mtime = modelfile.mtime() - with open(str(freqfile)) as freqf: - old_totalfreq = sum(json.load(freqf).values()) - - time.sleep(0.1) # make sure the timestamp has a chance to increase - - vw_ensemble.learn(document_corpus) - - assert modelfile.size() != old_size or modelfile.mtime() != old_mtime - with open(str(freqfile)) as freqf: - assert sum(json.load(freqf).values()) != old_totalfreq - - -def test_vw_ensemble_initialize(app, app_project): - vw_ensemble_type = annif.backend.get_backend("vw_ensemble") - vw_ensemble = vw_ensemble_type( - backend_id='vw_ensemble', - config_params={'sources': 'dummy-en'}, - project=app_project) - - assert vw_ensemble._model is None - with app.app_context(): - vw_ensemble.initialize() - assert vw_ensemble._model is not None - # initialize a second time - this shouldn't do anything - with app.app_context(): - vw_ensemble.initialize() - - -def test_vw_ensemble_suggest(app, app_project): - vw_ensemble_type = annif.backend.get_backend("vw_ensemble") - vw_ensemble = vw_ensemble_type( - backend_id='vw_ensemble', - config_params={'sources': 'dummy-en'}, - project=app_project) - - with app.app_context(): - results = vw_ensemble.suggest("""Arkeologiaa sanotaan joskus myös - muinaistutkimukseksi tai muinaistieteeksi. Se on humanistinen - tiede tai oikeammin joukko tieteitä, jotka tutkivat ihmisen - menneisyyttä. Tutkimusta tehdään analysoimalla muinaisjäännöksiä - eli niitä jälkiä, joita ihmisten toiminta on jättänyt maaperään - tai vesistöjen pohjaan.""") - - assert vw_ensemble._model is not None - assert len(results) > 0 - - -def test_vw_ensemble_suggest_set_discount_rate(app, app_project): - vw_ensemble_type = annif.backend.get_backend("vw_ensemble") - vw_ensemble = vw_ensemble_type( - backend_id='vw_ensemble', - config_params={'sources': 'dummy-en', 'discount_rate': '0.02'}, - project=app_project) - - with app.app_context(): - results = vw_ensemble.suggest("""Arkeologiaa sanotaan joskus myös - muinaistutkimukseksi tai muinaistieteeksi. Se on humanistinen - tiede tai oikeammin joukko tieteitä, jotka tutkivat ihmisen - menneisyyttä. Tutkimusta tehdään analysoimalla muinaisjäännöksiä - eli niitä jälkiä, joita ihmisten toiminta on jättänyt maaperään - tai vesistöjen pohjaan.""") - - assert len(results) > 0 - - -def test_vw_ensemble_format_example(project): - vw_ensemble_type = annif.backend.get_backend("vw_ensemble") - vw_ensemble = vw_ensemble_type( - backend_id='vw_ensemble', - config_params={'sources': 'dummy-en'}, - project=project) - - ex = vw_ensemble._format_example(0, [0.5]) - assert ex == ' |0 dummy-en:0.500000' - - -def test_vw_ensemble_format_example_avoid_sci_notation(project): - vw_ensemble_type = annif.backend.get_backend("vw_ensemble") - vw_ensemble = vw_ensemble_type( - backend_id='vw_ensemble', - config_params={'sources': 'dummy-en'}, - project=project) - - ex = vw_ensemble._format_example(0, [7.24e-05]) - assert ex == ' |0 dummy-en:0.000072' From bb28d6f50bfce76bfdc67c46eeb2464af200c579 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 3 Dec 2019 09:34:42 +0200 Subject: [PATCH 2/3] Merge vw_base with vw_multi backend; there is no use for this base class anymore after the removal of vw_ensemble --- annif/backend/vw_base.py | 112 -------------------------------------- annif/backend/vw_multi.py | 90 +++++++++++++++++++++++++++++- 2 files changed, 88 insertions(+), 114 deletions(-) delete mode 100644 annif/backend/vw_base.py diff --git a/annif/backend/vw_base.py b/annif/backend/vw_base.py deleted file mode 100644 index 609bfec9a..000000000 --- a/annif/backend/vw_base.py +++ /dev/null @@ -1,112 +0,0 @@ -"""Base class for Vowpal Wabbit based Annif backends""" - -import abc -import os -from vowpalwabbit import pyvw -import annif.util -from annif.exception import ConfigurationException -from annif.exception import NotInitializedException -from . import backend - - -class VWBaseBackend(backend.AnnifLearningBackend, metaclass=abc.ABCMeta): - """Base class for Vowpal Wabbit based Annif backends""" - - # Parameters for VW based backends - # each param specifier is a pair (allowed_values, default_value) - # where allowed_values is either a type or a list of allowed values - # and default_value may be None, to let VW decide by itself - VW_PARAMS = {} # needs to be specified in subclasses - - MODEL_FILE = 'vw-model' - TRAIN_FILE = 'vw-train.txt' - - # defaults for uninitialized instances - _model = None - - def initialize(self): - if self._model is None: - path = os.path.join(self.datadir, self.MODEL_FILE) - if not os.path.exists(path): - raise NotInitializedException( - 'model {} not found'.format(path), - backend_id=self.backend_id) - self.debug('loading VW model from {}'.format(path)) - params = self._create_params({'i': path, 'quiet': True}) - if 'passes' in params: - # don't confuse the model with passes - del params['passes'] - self.debug("model parameters: {}".format(params)) - self._model = pyvw.vw(**params) - self.debug('loaded model {}'.format(str(self._model))) - - def _convert_param(self, param, val): - pspec, _ = self.VW_PARAMS[param] - if isinstance(pspec, list): - if val in pspec: - return val - raise ConfigurationException( - "{} is not a valid value for {} (allowed: {})".format( - val, param, ', '.join(pspec)), backend_id=self.backend_id) - try: - return pspec(val) - except ValueError: - raise ConfigurationException( - "The {} value {} cannot be converted to {}".format( - param, val, pspec), backend_id=self.backend_id) - - def _create_params(self, params): - params = params.copy() # don't mutate the original dict - params.update({param: defaultval - for param, (_, defaultval) in self.VW_PARAMS.items() - if defaultval is not None}) - params.update({param: self._convert_param(param, val) - for param, val in self.params.items() - if param in self.VW_PARAMS}) - return params - - @staticmethod - def _write_train_file(examples, filename): - with open(filename, 'w', encoding='utf-8') as trainfile: - for ex in examples: - print(ex, file=trainfile) - - def _create_train_file(self, corpus): - self.info('creating VW train file') - examples = self._create_examples(corpus) - annif.util.atomic_save(examples, - self.datadir, - self.TRAIN_FILE, - method=self._write_train_file) - - @abc.abstractmethod - def _create_examples(self, corpus): - """This method should be implemented by concrete backends. It - should return a sequence of strings formatted according to the VW - input format.""" - pass # pragma: no cover - - def _create_model(self, initial_params={}): - initial_params = initial_params.copy() # don't mutate the original - trainpath = os.path.join(self.datadir, self.TRAIN_FILE) - initial_params['data'] = trainpath - params = self._create_params(initial_params) - if params.get('passes', 1) > 1: - # need a cache file when there are multiple passes - params.update({'cache': True, 'kill_cache': True}) - self.debug("model parameters: {}".format(params)) - self._model = pyvw.vw(**params) - modelpath = os.path.join(self.datadir, self.MODEL_FILE) - self._model.save(modelpath) - - def train(self, corpus): - self.info("creating VW model") - self._create_train_file(corpus) - self._create_model() - - def learn(self, corpus): - self.initialize() - for example in self._create_examples(corpus): - self._model.learn(example) - modelpath = os.path.join(self.datadir, self.MODEL_FILE) - self._model.save(modelpath) diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py index 468e23d4e..a4ecd48ca 100644 --- a/annif/backend/vw_multi.py +++ b/annif/backend/vw_multi.py @@ -1,22 +1,31 @@ """Annif backend using the Vowpal Wabbit multiclass and multilabel classifiers""" +import os import random import numpy as np +from vowpalwabbit import pyvw import annif.project from annif.suggestion import ListSuggestionResult, VectorSuggestionResult from annif.exception import ConfigurationException +from annif.exception import NotInitializedException from . import vw_base from . import backend from . import mixins -class VWMultiBackend(mixins.ChunkingBackend, vw_base.VWBaseBackend): +class VWMultiBackend(mixins.ChunkingBackend, backend.AnnifLearningBackend): """Vowpal Wabbit multiclass/multilabel backend for Annif""" name = "vw_multi" needs_subject_index = True + MODEL_FILE = 'vw-model' + TRAIN_FILE = 'vw-train.txt' + + # defaults for uninitialized instances + _model = None + VW_PARAMS = { 'bit_precision': (int, None), 'ngram': (lambda x: '_{}'.format(int(x)), None), @@ -34,6 +43,47 @@ class VWMultiBackend(mixins.ChunkingBackend, vw_base.VWBaseBackend): DEFAULT_PARAMS = {'algorithm': 'oaa'} + def initialize(self): + if self._model is None: + path = os.path.join(self.datadir, self.MODEL_FILE) + if not os.path.exists(path): + raise NotInitializedException( + 'model {} not found'.format(path), + backend_id=self.backend_id) + self.debug('loading VW model from {}'.format(path)) + params = self._create_params({'i': path, 'quiet': True}) + if 'passes' in params: + # don't confuse the model with passes + del params['passes'] + self.debug("model parameters: {}".format(params)) + self._model = pyvw.vw(**params) + self.debug('loaded model {}'.format(str(self._model))) + + def _convert_param(self, param, val): + pspec, _ = self.VW_PARAMS[param] + if isinstance(pspec, list): + if val in pspec: + return val + raise ConfigurationException( + "{} is not a valid value for {} (allowed: {})".format( + val, param, ', '.join(pspec)), backend_id=self.backend_id) + try: + return pspec(val) + except ValueError: + raise ConfigurationException( + "The {} value {} cannot be converted to {}".format( + param, val, pspec), backend_id=self.backend_id) + + def _create_params(self, params): + params = params.copy() # don't mutate the original dict + params.update({param: defaultval + for param, (_, defaultval) in self.VW_PARAMS.items() + if defaultval is not None}) + params.update({param: self._convert_param(param, val) + for param, val in self.params.items() + if param in self.VW_PARAMS}) + return params + def default_params(self): params = backend.AnnifBackend.DEFAULT_PARAMS.copy() params.update(mixins.ChunkingBackend.DEFAULT_PARAMS) @@ -117,7 +167,17 @@ def _create_examples(self, corpus): def _create_model(self): self.info('creating VW model (algorithm: {})'.format(self.algorithm)) - super()._create_model({self.algorithm: len(self.project.subjects)}) + trainpath = os.path.join(self.datadir, self.TRAIN_FILE) + initial_params = {'data': trainpath, + self.algorithm: len(self.project.subjects)} + params = self._create_params(initial_params) + if params.get('passes', 1) > 1: + # need a cache file when there are multiple passes + params.update({'cache': True, 'kill_cache': True}) + self.debug("model parameters: {}".format(params)) + self._model = pyvw.vw(**params) + modelpath = os.path.join(self.datadir, self.MODEL_FILE) + self._model.save(modelpath) def _convert_result(self, result): if self.algorithm == 'multilabel_oaa': @@ -150,3 +210,29 @@ def _suggest_chunks(self, chunktexts): return VectorSuggestionResult( np.array(results, dtype=np.float32).mean(axis=0), self.project.subjects) + + @staticmethod + def _write_train_file(examples, filename): + with open(filename, 'w', encoding='utf-8') as trainfile: + for ex in examples: + print(ex, file=trainfile) + + def _create_train_file(self, corpus): + self.info('creating VW train file') + examples = self._create_examples(corpus) + annif.util.atomic_save(examples, + self.datadir, + self.TRAIN_FILE, + method=self._write_train_file) + + def train(self, corpus): + self.info("creating VW model") + self._create_train_file(corpus) + self._create_model() + + def learn(self, corpus): + self.initialize() + for example in self._create_examples(corpus): + self._model.learn(example) + modelpath = os.path.join(self.datadir, self.MODEL_FILE) + self._model.save(modelpath) From 841b0adea9c3d20f8d960272e3accb0173aaa351 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 3 Dec 2019 11:24:38 +0200 Subject: [PATCH 3/3] Remove invalid import --- annif/backend/vw_multi.py | 1 - 1 file changed, 1 deletion(-) diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py index a4ecd48ca..cd6171b57 100644 --- a/annif/backend/vw_multi.py +++ b/annif/backend/vw_multi.py @@ -9,7 +9,6 @@ from annif.suggestion import ListSuggestionResult, VectorSuggestionResult from annif.exception import ConfigurationException from annif.exception import NotInitializedException -from . import vw_base from . import backend from . import mixins