
Add vw_ensemble backend #284

Merged: 28 commits, Jun 28, 2019
Commits
112554d  Make sure all projects are initialized when they are used for suggesting (osma, Jun 24, 2019)
a56be26  Initial implementation of vw_ensemble backend. Fixes #235 (osma, Jun 24, 2019)
2c1e9b5  Merge branch 'master' of https://github.com/NatLibFi/Annif into issue… (osma, Jun 24, 2019)
a6079a0  fix test failure caused by using an uninitialized tfidf-fi project (w… (osma, Jun 26, 2019)
a3579d6  remove unused fixture (osma, Jun 26, 2019)
9d46450  introduce common base class for VW backends (osma, Jun 26, 2019)
9503aa8  refactor: move initialize method to vw_base (osma, Jun 26, 2019)
098e02a  Refactor: move parameter handling to vw_base (osma, Jun 26, 2019)
7456016  Refactor: move learn method to vw_base (and make it an abstract base … (osma, Jun 26, 2019)
b53f7c7  Refactor: move train method to vw_base (osma, Jun 26, 2019)
393b507  fix whitespace (osma, Jun 26, 2019)
78d206b  Refactor _create_examples to reduce(?) its complexity (osma, Jun 26, 2019)
9a49fc2  Merge branch 'master' into issue235-vw-ensemble-backend (osma, Jun 27, 2019)
c0cc1c7  Add missing import (why did it work before?) (osma, Jun 27, 2019)
ff05920  Refactor: move _create_model to vw_base; disable quadratic features f… (osma, Jun 27, 2019)
9f25308  Add "pragma: no cover" annotation for abstract method (osma, Jun 27, 2019)
2b81385  Add discounting mechanism to vw_ensemble backend (osma, Jun 28, 2019)
dfaaf6d  add more tests for vw_ensemble (osma, Jun 28, 2019)
b844f6f  Refactor: split initialize method in vw_ensemble (osma, Jun 28, 2019)
9a1345c  Refactor: split _doc_to_example method in vw_ensemble (osma, Jun 28, 2019)
34dc5a4  Refactor: split _merge_hits_from_sources in vw_ensemble (osma, Jun 28, 2019)
b162b50  Refactor: ensure dicts passed as function parameters are not mutated (osma, Jun 28, 2019)
4d7e0f7  remove unused imports (osma, Jun 28, 2019)
0db12a3  remove unused imports (osma, Jun 28, 2019)
d28f9de  add API documentation templates for the new vw_base and vw_multi modules (osma, Jun 28, 2019)
d3a26a4  Avoid scientific notation for weight values in VW train file (test to… (osma, Jun 28, 2019)
1741cd7  Bugfix: parse discount_rate into a float (with test to verify) (osma, Jun 28, 2019)
70d3973  Merge branch 'master' into issue235-vw-ensemble-backend (osma, Jun 28, 2019)
6 changes: 4 additions & 2 deletions annif/backend/__init__.py
@@ -38,6 +38,8 @@ def get_backend(backend_id):
 try:
     from . import vw_multi
     register_backend(vw_multi.VWMultiBackend)
+    from . import vw_ensemble
+    register_backend(vw_ensemble.VWEnsembleBackend)
 except ImportError:
-    annif.logger.debug(
-        "vowpalwabbit not available, not enabling vw_multi backend")
+    annif.logger.debug("vowpalwabbit not available, not enabling " +
+                       "vw_multi & vw_ensemble backends")
1 change: 1 addition & 0 deletions annif/backend/backend.py
@@ -38,6 +38,7 @@ def _suggest(self, text, project, params):
     def suggest(self, text, project, params=None):
         """Suggest subjects for the input text and return a list of subjects
         represented as a list of SubjectSuggestion objects."""
+        self.initialize()
         beparams = dict(self.params)
         if params:
             beparams.update(params)
10 changes: 8 additions & 2 deletions annif/backend/ensemble.py
@@ -30,10 +30,16 @@ def _suggest_with_sources(self, text, sources):
                 hits=norm_hits, weight=weight))
         return hits_from_sources
 
+    def _merge_hits_from_sources(self, hits_from_sources, project, params):
+        """Hook for merging hits from sources. Can be overridden by
+        subclasses."""
+        return annif.util.merge_hits(hits_from_sources, project.subjects)
+
     def _suggest(self, text, project, params):
         sources = annif.util.parse_sources(params['sources'])
         hits_from_sources = self._suggest_with_sources(text, sources)
-        merged_hits = annif.util.merge_hits(
-            hits_from_sources, project.subjects)
+        merged_hits = self._merge_hits_from_sources(hits_from_sources,
+                                                    project,
+                                                    params)
         self.debug('{} hits after merging'.format(len(merged_hits)))
         return merged_hits
1 change: 0 additions & 1 deletion annif/backend/mixins.py
@@ -16,7 +16,6 @@ def _suggest_chunks(self, chunktexts, project):
         pass  # pragma: no cover
 
     def _suggest(self, text, project, params):
-        self.initialize()
         self.debug('Suggesting subjects for text "{}..." (len={})'.format(
             text[:20], len(text)))
         sentences = project.analyzer.tokenize_sentences(text)
1 change: 0 additions & 1 deletion annif/backend/tfidf.py
@@ -46,7 +46,6 @@ def train(self, corpus, project):
             self.INDEX_FILE)
 
     def _suggest(self, text, project, params):
-        self.initialize()
         self.debug('Suggesting subjects for text "{}..." (len={})'.format(
             text[:20], len(text)))
         vectors = project.vectorizer.transform([text])
112 changes: 112 additions & 0 deletions annif/backend/vw_base.py
@@ -0,0 +1,112 @@
"""Base class for Vowpal Wabbit based Annif backends"""

import abc
import os
from vowpalwabbit import pyvw
import annif.util
from annif.exception import ConfigurationException
from annif.exception import NotInitializedException
from . import backend


class VWBaseBackend(backend.AnnifLearningBackend, metaclass=abc.ABCMeta):
"""Base class for Vowpal Wabbit based Annif backends"""

# Parameters for VW based backends
# each param specifier is a pair (allowed_values, default_value)
# where allowed_values is either a type or a list of allowed values
# and default_value may be None, to let VW decide by itself
VW_PARAMS = {} # needs to be specified in subclasses
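    # For illustration, a subclass declaration might look like:
    #   VW_PARAMS = {'passes': (int, None),
    #                'loss_function': (['squared', 'logistic', 'hinge'],
    #                                  'squared')}
    # (both specifiers are taken from VWEnsembleBackend further below)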

    MODEL_FILE = 'vw-model'
    TRAIN_FILE = 'vw-train.txt'

    # defaults for uninitialized instances
    _model = None

    def initialize(self):
        if self._model is None:
            path = os.path.join(self.datadir, self.MODEL_FILE)
            if not os.path.exists(path):
                raise NotInitializedException(
                    'model {} not found'.format(path),
                    backend_id=self.backend_id)
            self.debug('loading VW model from {}'.format(path))
            params = self._create_params({'i': path, 'quiet': True})
            if 'passes' in params:
                # don't confuse the model with passes
                del params['passes']
            self.debug("model parameters: {}".format(params))
            self._model = pyvw.vw(**params)
            self.debug('loaded model {}'.format(str(self._model)))

    def _convert_param(self, param, val):
        pspec, _ = self.VW_PARAMS[param]
        if isinstance(pspec, list):
            if val in pspec:
                return val
            raise ConfigurationException(
                "{} is not a valid value for {} (allowed: {})".format(
                    val, param, ', '.join(pspec)), backend_id=self.backend_id)
        try:
            return pspec(val)
        except ValueError:
            raise ConfigurationException(
                "The {} value {} cannot be converted to {}".format(
                    param, val, pspec), backend_id=self.backend_id)

    def _create_params(self, params):
        params = params.copy()  # don't mutate the original dict
        params.update({param: defaultval
                       for param, (_, defaultval) in self.VW_PARAMS.items()
                       if defaultval is not None})
        params.update({param: self._convert_param(param, val)
                       for param, val in self.params.items()
                       if param in self.VW_PARAMS})
        return params

    @staticmethod
    def _write_train_file(examples, filename):
        with open(filename, 'w', encoding='utf-8') as trainfile:
            for ex in examples:
                print(ex, file=trainfile)

    def _create_train_file(self, corpus, project):
        self.info('creating VW train file')
        examples = self._create_examples(corpus, project)
        annif.util.atomic_save(examples,
                               self.datadir,
                               self.TRAIN_FILE,
                               method=self._write_train_file)

    @abc.abstractmethod
    def _create_examples(self, corpus, project):
        """This method should be implemented by concrete backends. It
        should return a sequence of strings formatted according to the VW
        input format."""
        pass  # pragma: no cover

    def _create_model(self, project, initial_params={}):
        initial_params = initial_params.copy()  # don't mutate the original
        trainpath = os.path.join(self.datadir, self.TRAIN_FILE)
        initial_params['data'] = trainpath
        params = self._create_params(initial_params)
        if params.get('passes', 1) > 1:
            # need a cache file when there are multiple passes
            params.update({'cache': True, 'kill_cache': True})
        self.debug("model parameters: {}".format(params))
        self._model = pyvw.vw(**params)
        modelpath = os.path.join(self.datadir, self.MODEL_FILE)
        self._model.save(modelpath)

    def train(self, corpus, project):
        self.info("creating VW model")
        self._create_train_file(corpus, project)
        self._create_model(project)

    def learn(self, corpus, project):
        self.initialize()
        for example in self._create_examples(corpus, project):
            self._model.learn(example)
        modelpath = os.path.join(self.datadir, self.MODEL_FILE)
        self._model.save(modelpath)
172 changes: 172 additions & 0 deletions annif/backend/vw_ensemble.py
@@ -0,0 +1,172 @@
"""Annif backend using the Vowpal Wabbit multiclass and multilabel
classifiers"""

import collections
import json
import random
import os.path
import annif.util
import annif.project
import numpy as np
from annif.exception import NotInitializedException
from annif.suggestion import VectorSuggestionResult
from . import vw_base
from . import ensemble


class VWEnsembleBackend(
ensemble.EnsembleBackend,
vw_base.VWBaseBackend):
"""Vowpal Wabbit ensemble backend that combines results from multiple
projects and learns how well those projects/backends recognize
particular subjects."""

    name = "vw_ensemble"

    VW_PARAMS = {
        'bit_precision': (int, None),
        'learning_rate': (float, None),
        'loss_function': (['squared', 'logistic', 'hinge'], 'squared'),
        'l1': (float, None),
        'l2': (float, None),
        'passes': (int, None)
    }
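
    # For illustration, a projects.cfg entry using this backend might look
    # like the following sketch (project ids are hypothetical and other keys
    # such as the vocabulary may be required; see the Annif documentation):
    #
    #   [vw-ensemble-fi]
    #   name=VW ensemble Finnish
    #   language=fi
    #   backend=vw_ensemble
    #   sources=tfidf-fi,fasttext-fi
    #   discount_rate=0.01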

    # number of training examples per subject, stored as a collections.Counter
    _subject_freq = None

    FREQ_FILE = 'subject-freq.json'

    # The discount rate affects how quickly the ensemble starts to trust its
    # own judgement when the amount of training data increases, versus using
    # a simple mean of scores. A higher value means the model adapts more
    # quickly (and possibly makes more errors), while a lower value makes it
    # more cautious, requiring more training data before the learned
    # predictions dominate.
    DEFAULT_DISCOUNT_RATE = 0.01
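    # Illustrative arithmetic: with the default rate 0.01, a subject seen in
    # 100 training examples gets raw_weight = 1 / (0.01 * 100 + 1) = 0.5, so
    # the plain mean of the source scores and the learned prediction carry
    # equal weight; after 900 examples the learned prediction already carries
    # 90% of the weight (see _merge_hits_from_sources below).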

    def _load_subject_freq(self):
        path = os.path.join(self.datadir, self.FREQ_FILE)
        if not os.path.exists(path):
            raise NotInitializedException(
                'frequency file {} not found'.format(path),
                backend_id=self.backend_id)
        self.debug('loading concept frequencies from {}'.format(path))
        with open(path) as freqf:
            # The Counter was serialized like a dictionary, need to
            # convert it back. Keys that became strings need to be turned
            # back into integers.
            self._subject_freq = collections.Counter()
            for cid, freq in json.load(freqf).items():
                self._subject_freq[int(cid)] = freq
        self.debug('loaded frequencies for {} concepts'.format(
            len(self._subject_freq)))
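        # (Round trip illustration: json.dumps(collections.Counter({42: 3}))
        # gives '{"42": 3}', so the integer keys come back as strings.)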

    def initialize(self):
        if self._subject_freq is None:
            self._load_subject_freq()
        super().initialize()

    def _calculate_scores(self, subj_id, subj_score_vector):
        ex = self._format_example(subj_id, subj_score_vector)
        raw_score = subj_score_vector.mean()
        pred_score = (self._model.predict(ex) + 1.0) / 2.0
        return raw_score, pred_score

    def _merge_hits_from_sources(self, hits_from_sources, project, params):
        score_vector = np.array([hits.vector
                                 for hits, _ in hits_from_sources])
        discount_rate = float(self.params.get('discount_rate',
                                              self.DEFAULT_DISCOUNT_RATE))
        result = np.zeros(score_vector.shape[1])
        for subj_id in range(score_vector.shape[1]):
            subj_score_vector = score_vector[:, subj_id]
            if subj_score_vector.sum() > 0.0:
                raw_score, pred_score = self._calculate_scores(
                    subj_id, subj_score_vector)
                raw_weight = 1.0 / \
                    ((discount_rate * self._subject_freq[subj_id]) + 1)
                result[subj_id] = (raw_weight * raw_score) + \
                    (1.0 - raw_weight) * pred_score
        return VectorSuggestionResult(result, project.subjects)

    @property
    def _source_project_ids(self):
        sources = annif.util.parse_sources(self.params['sources'])
        return [project_id for project_id, _ in sources]

    def _format_example(self, subject_id, scores, true=None):
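        # Example (hypothetical project ids): subject_id=42,
        # scores=[0.5, 0.25] and true=True yield the VW input line
        # "1 |42 proja:0.500000 projb:0.250000"; with true=None the label
        # slot is left empty so the line can be used for prediction.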
        if true is None:
            val = ''
        elif true:
            val = 1
        else:
            val = -1
        ex = "{} |{}".format(val, subject_id)
        for proj_idx, proj in enumerate(self._source_project_ids):
            ex += " {}:{:.6f}".format(proj, scores[proj_idx])
        return ex

    def _doc_score_vector(self, doc, source_projects):
        score_vectors = []
        for source_project in source_projects:
            hits = source_project.suggest(doc.text)
            score_vectors.append(hits.vector)
        return np.array(score_vectors)

    def _doc_to_example(self, doc, project, source_projects):
        examples = []
        subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        true = subjects.as_vector(project.subjects)
        score_vector = self._doc_score_vector(doc, source_projects)
        for subj_id in range(len(true)):
            if true[subj_id] or score_vector[:, subj_id].sum() > 0.0:
                ex = (subj_id, self._format_example(
                    subj_id,
                    score_vector[:, subj_id],
                    true[subj_id]))
                examples.append(ex)
        return examples

    def _create_examples(self, corpus, project):
        source_projects = [annif.project.get_project(project_id)
                           for project_id in self._source_project_ids]
        examples = []
        for doc in corpus.documents:
            examples += self._doc_to_example(doc, project, source_projects)
        random.shuffle(examples)
        return examples

    @staticmethod
    def _write_freq_file(subject_freq, filename):
        with open(filename, 'w') as freqfile:
            json.dump(subject_freq, freqfile)

    def _create_train_file(self, corpus, project):
        self.info('creating VW train file')
        exampledata = self._create_examples(corpus, project)

        subjects = [subj_id for subj_id, ex in exampledata]
        self._subject_freq = collections.Counter(subjects)
        annif.util.atomic_save(self._subject_freq,
                               self.datadir,
                               self.FREQ_FILE,
                               method=self._write_freq_file)

        examples = [ex for subj_id, ex in exampledata]
        annif.util.atomic_save(examples,
                               self.datadir,
                               self.TRAIN_FILE,
                               method=self._write_train_file)

    def learn(self, corpus, project):
        self.initialize()
        exampledata = self._create_examples(corpus, project)
        for subj_id, example in exampledata:
            self._model.learn(example)
            self._subject_freq[subj_id] += 1
        modelpath = os.path.join(self.datadir, self.MODEL_FILE)
        self._model.save(modelpath)
        annif.util.atomic_save(self._subject_freq,
                               self.datadir,
                               self.FREQ_FILE,
                               method=self._write_freq_file)