Skip to content

Commit

Permalink
Merge pull request #324 from NatLibFi/issue273-default-values-for-configuration-settings
Browse files Browse the repository at this point in the history

Issue273 default values for configuration settings
  • Loading branch information
juhoinkinen authored Sep 30, 2019
2 parents e449223 + d22cd2d commit 244db95
Show file tree
Hide file tree
Showing 17 changed files with 257 additions and 55 deletions.
16 changes: 14 additions & 2 deletions annif/backend/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,25 @@ class AnnifBackend(metaclass=abc.ABCMeta):
needs_subject_index = False
needs_subject_vectorizer = False

def __init__(self, backend_id, params, datadir):
DEFAULT_PARAMS = {'limit': 100}

def __init__(self, backend_id, config_params, datadir):
"""Initialize backend with specific parameters. The
parameters are a dict. Keys and values depend on the specific
backend type."""
self.backend_id = backend_id
self.params = params
self.datadir = datadir
self.config_params = config_params

def default_params(self):
    """Return the default parameters of this backend as a new dict.

    A copy of ``DEFAULT_PARAMS`` is returned rather than the class-level
    dict itself, so callers may modify the result without altering the
    defaults shared by every instance.

    NOTE: ``self.DEFAULT_PARAMS`` resolves to the most derived class that
    defines the attribute. A subclass that wants to layer its own defaults
    on top of those of other classes has to override this method.
    """
    return dict(self.DEFAULT_PARAMS)

@property
def params(self):
    """Effective parameters: defaults overlaid with configured values.

    A new dict is built on every access. It starts from
    ``default_params()`` and is then updated with ``config_params``, so a
    configured value takes precedence over a default of the same name.
    """
    merged = dict(self.default_params())
    merged.update(self.config_params)
    return merged

def train(self, corpus, project):
"""train the model on the given document or subject corpus"""
Expand Down
3 changes: 3 additions & 0 deletions annif/backend/dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ class DummyBackend(backend.AnnifLearningBackend):
uri = 'http://example.org/dummy'
label = 'dummy'

def default_params(self):
    """Return a copy of the AnnifBackend class-level default parameters.

    The copy keeps callers from modifying the dict that is shared at class
    level by all backends.
    """
    return backend.AnnifBackend.DEFAULT_PARAMS.copy()

def initialize(self):
self.initialized = True

Expand Down
13 changes: 13 additions & 0 deletions annif/backend/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,25 @@ class FastTextBackend(mixins.ChunkingBackend, backend.AnnifBackend):
't': float
}

DEFAULT_PARAMS = {
'dim': 100,
'lr': 0.25,
'epoch': 5,
'loss': 'hs',
}

MODEL_FILE = 'fasttext-model'
TRAIN_FILE = 'fasttext-train.txt'

# defaults for uninitialized instances
_model = None

def default_params(self):
    """Return the layered default parameters as a new dict.

    Layers are applied in order, later ones overriding earlier ones:
    the AnnifBackend defaults, the ChunkingBackend defaults and finally
    this class's own ``DEFAULT_PARAMS``.
    """
    layers = (
        backend.AnnifBackend.DEFAULT_PARAMS,
        mixins.ChunkingBackend.DEFAULT_PARAMS,
        self.DEFAULT_PARAMS,
    )
    merged = {}
    for layer in layers:
        merged.update(layer)
    return merged

def initialize(self):
if self._model is None:
path = os.path.join(self.datadir, self.MODEL_FILE)
Expand Down
5 changes: 5 additions & 0 deletions annif/backend/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@
class ChunkingBackend(metaclass=abc.ABCMeta):
"""Annif backend mixin that implements chunking of input"""

DEFAULT_PARAMS = {'chunksize': 1}

def default_params(self):
    """Return the default parameters as a new dict.

    A copy of ``DEFAULT_PARAMS`` is returned instead of the class-level
    dict itself, so modifying the result cannot change the shared defaults.
    """
    return dict(self.DEFAULT_PARAMS)

@abc.abstractmethod
def _suggest_chunks(self, chunktexts, project):
"""Suggest subjects for the chunked text; should be implemented by
Expand Down
2 changes: 2 additions & 0 deletions annif/backend/pav.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ class PAVBackend(ensemble.EnsembleBackend):
# defaults for uninitialized instances
_models = None

DEFAULT_PARAMS = {'min-docs': 10}

def initialize(self):
if self._models is not None:
return # already initialized
Expand Down
15 changes: 12 additions & 3 deletions annif/backend/vw_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import numpy as np
from annif.exception import NotInitializedException
from annif.suggestion import VectorSuggestionResult
from . import backend
from . import vw_base
from . import ensemble

Expand Down Expand Up @@ -42,7 +43,16 @@ class VWEnsembleBackend(
# a simple mean of scores. A higher value will mean that the model
# adapts quicker (and possibly makes more errors) while a lower value
# will make it more careful so that it will require more training data.
DEFAULT_DISCOUNT_RATE = 0.01

DEFAULT_PARAMS = {'discount_rate': 0.01}

def default_params(self):
    """Return the layered default parameters as a new dict.

    Starts from the AnnifBackend defaults, applies this class's
    ``DEFAULT_PARAMS`` and finally every ``VW_PARAMS`` entry whose
    default value (second element of the entry's pair) is not None.
    """
    merged = backend.AnnifBackend.DEFAULT_PARAMS.copy()
    merged.update(self.DEFAULT_PARAMS)
    vw_defaults = {}
    for name, (_, default_val) in self.VW_PARAMS.items():
        if default_val is None:
            continue  # no default declared for this parameter
        vw_defaults[name] = default_val
    merged.update(vw_defaults)
    return merged

def _load_subject_freq(self):
path = os.path.join(self.datadir, self.FREQ_FILE)
Expand Down Expand Up @@ -75,8 +85,7 @@ def _calculate_scores(self, subj_id, subj_score_vector):
def _merge_hits_from_sources(self, hits_from_sources, project, params):
score_vector = np.array([hits.vector
for hits, _ in hits_from_sources])
discount_rate = float(self.params.get('discount_rate',
self.DEFAULT_DISCOUNT_RATE))
discount_rate = float(self.params['discount_rate'])
result = np.zeros(score_vector.shape[1])
for subj_id in range(score_vector.shape[1]):
subj_score_vector = score_vector[:, subj_id]
Expand Down
15 changes: 13 additions & 2 deletions annif/backend/vw_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from annif.suggestion import ListSuggestionResult, VectorSuggestionResult
from annif.exception import ConfigurationException
from . import vw_base
from . import backend
from . import mixins


Expand All @@ -27,14 +28,24 @@ class VWMultiBackend(mixins.ChunkingBackend, vw_base.VWBaseBackend):
'probabilities': (bool, None)
}

DEFAULT_ALGORITHM = 'oaa'
SUPPORTED_ALGORITHMS = ('oaa', 'ect', 'log_multi', 'multilabel_oaa')

DEFAULT_INPUTS = '_text_'

DEFAULT_PARAMS = {'algorithm': 'oaa'}

def default_params(self):
    """Return the layered default parameters as a new dict.

    Layers, in order of increasing precedence: the AnnifBackend defaults,
    the ChunkingBackend defaults, this class's ``DEFAULT_PARAMS`` and
    finally every ``VW_PARAMS`` entry whose default value (second element
    of the entry's pair) is not None.
    """
    merged = backend.AnnifBackend.DEFAULT_PARAMS.copy()
    for layer in (mixins.ChunkingBackend.DEFAULT_PARAMS,
                  self.DEFAULT_PARAMS):
        merged.update(layer)
    vw_defaults = {}
    for name, (_, default_val) in self.VW_PARAMS.items():
        if default_val is None:
            continue  # no default declared for this parameter
        vw_defaults[name] = default_val
    merged.update(vw_defaults)
    return merged

@property
def algorithm(self):
algorithm = self.params.get('algorithm', self.DEFAULT_ALGORITHM)
algorithm = self.params['algorithm']
if algorithm not in self.SUPPORTED_ALGORITHMS:
raise ConfigurationException(
"{} is not a valid algorithm (allowed: {})".format(
Expand Down
5 changes: 3 additions & 2 deletions annif/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class AnnifProject(DatadirMixin):
def __init__(self, project_id, config, datadir):
DatadirMixin.__init__(self, datadir, 'projects', project_id)
self.project_id = project_id
self.name = config['name']
self.name = config.get('name', project_id)
self.language = config['language']
self.analyzer_spec = config.get('analyzer', None)
self.vocab_id = config.get('vocab', None)
Expand Down Expand Up @@ -144,7 +144,8 @@ def backend(self):
try:
backend_class = annif.backend.get_backend(backend_id)
self._backend = backend_class(
backend_id, params=self.config, datadir=self.datadir)
backend_id, config_params=self.config,
datadir=self.datadir)
except ValueError:
logger.warning(
"Could not create backend %s, "
Expand Down
20 changes: 20 additions & 0 deletions tests/projects.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,26 @@ language=en
vocab=dummy
analyzer=snowball(english)

[noname]
language=en
backend=tfidf
vocab=dummy
analyzer=snowball(english)

[noparams-tfidf-fi]
name=TF-IDF Finnish using default params
language=fi
backend=tfidf
analyzer=snowball(finnish)
vocab=yso-fi

[noparams-fasttext-fi]
name=fastText Finnish using default params
language=fi
backend=fasttext
analyzer=snowball(finnish)
vocab=yso-fi

[pav]
name=PAV Ensemble Finnish
language=fi
Expand Down
13 changes: 11 additions & 2 deletions tests/test_backend.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Unit tests for backends in Annif"""

import pytest
import logging
import annif
import annif.backend
import annif.corpus
Expand All @@ -13,7 +14,7 @@ def test_get_backend_nonexistent():

def test_get_backend_dummy(app, project):
dummy_type = annif.backend.get_backend("dummy")
dummy = dummy_type(backend_id='dummy', params={},
dummy = dummy_type(backend_id='dummy', config_params={},
datadir=app.config['DATADIR'])
result = dummy.suggest(text='this is some text', project=project)
assert len(result) == 1
Expand All @@ -24,7 +25,7 @@ def test_get_backend_dummy(app, project):

def test_learn_dummy(app, project, tmpdir):
dummy_type = annif.backend.get_backend("dummy")
dummy = dummy_type(backend_id='dummy', params={},
dummy = dummy_type(backend_id='dummy', config_params={},
datadir=app.config['DATADIR'])

tmpdir.join('doc1.txt').write('doc1')
Expand All @@ -40,3 +41,11 @@ def test_learn_dummy(app, project, tmpdir):
assert result[0].uri == 'http://example.org/key1'
assert result[0].label == 'key1'
assert result[0].score == 1.0


def test_fill_params_with_defaults(app):
    """With an empty configuration, params consist of the defaults only."""
    backend_class = annif.backend.get_backend('dummy')
    dummy_backend = backend_class(
        backend_id='dummy',
        config_params={},
        datadir=app.config['DATADIR'])
    # 'limit' comes from the AnnifBackend class defaults
    assert {'limit': 100} == dummy_backend.params
28 changes: 24 additions & 4 deletions tests/test_backend_fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,31 @@
fasttext = pytest.importorskip("annif.backend.fasttext")


def test_fasttext_default_params(datadir, project):
    """An unconfigured fastText backend exposes the expected defaults."""
    backend_class = annif.backend.get_backend("fasttext")
    ft_backend = backend_class(
        backend_id='fasttext',
        config_params={},
        datadir=str(datadir))

    expected_default_params = {
        'limit': 100,
        'chunksize': 1,
        'dim': 100,
        'lr': 0.25,
        'epoch': 5,
        'loss': 'hs',
    }
    actual_params = ft_backend.params
    # Only the listed keys are checked; additional params are allowed.
    for param, val in expected_default_params.items():
        assert param in actual_params
        assert actual_params[param] == val


def test_fasttext_train(datadir, document_corpus, project):
fasttext_type = annif.backend.get_backend("fasttext")
fasttext = fasttext_type(
backend_id='fasttext',
params={
config_params={
'limit': 50,
'dim': 100,
'lr': 0.25,
Expand All @@ -30,7 +50,7 @@ def test_fasttext_train_unknown_subject(tmpdir, datadir, project):
fasttext_type = annif.backend.get_backend("fasttext")
fasttext = fasttext_type(
backend_id='fasttext',
params={
config_params={
'limit': 50,
'dim': 100,
'lr': 0.25,
Expand All @@ -53,7 +73,7 @@ def test_fasttext_train_nodocuments(tmpdir, datadir, project):
fasttext_type = annif.backend.get_backend("fasttext")
fasttext = fasttext_type(
backend_id='fasttext',
params={
config_params={
'limit': 50,
'dim': 100,
'lr': 0.25,
Expand All @@ -73,7 +93,7 @@ def test_fasttext_suggest(datadir, project):
fasttext_type = annif.backend.get_backend("fasttext")
fasttext = fasttext_type(
backend_id='fasttext',
params={
config_params={
'limit': 50,
'chunksize': 1,
'dim': 100,
Expand Down
12 changes: 6 additions & 6 deletions tests/test_backend_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def test_http_suggest(app, project):
http_type = annif.backend.get_backend("http")
http = http_type(
backend_id='http',
params={
config_params={
'endpoint': 'http://api.example.org/analyze',
'project': 'dummy'},
datadir=app.config['DATADIR'])
Expand All @@ -40,7 +40,7 @@ def test_http_suggest_with_results(app, project):
http_type = annif.backend.get_backend("http")
http = http_type(
backend_id='http',
params={
config_params={
'endpoint': 'http://api.example.org/dummy/analyze',
},
datadir=app.config['DATADIR'])
Expand All @@ -63,7 +63,7 @@ def test_http_suggest_zero_score(app, project):
http_type = annif.backend.get_backend("http")
http = http_type(
backend_id='http',
params={
config_params={
'endpoint': 'http://api.example.org/analyze',
'project': 'dummy'},
datadir=app.config['DATADIR'])
Expand All @@ -79,7 +79,7 @@ def test_http_suggest_error(app, project):
http_type = annif.backend.get_backend("http")
http = http_type(
backend_id='http',
params={
config_params={
'endpoint': 'http://api.example.org/analyze',
'project': 'dummy'},
datadir=app.config['DATADIR'])
Expand All @@ -98,7 +98,7 @@ def test_http_suggest_json_fails(app, project):
http_type = annif.backend.get_backend("http")
http = http_type(
backend_id='http',
params={
config_params={
'endpoint': 'http://api.example.org/analyze',
'project': 'dummy'},
datadir=app.config['DATADIR'])
Expand All @@ -117,7 +117,7 @@ def test_http_suggest_unexpected_json(app, project):
http_type = annif.backend.get_backend("http")
http = http_type(
backend_id='http',
params={
config_params={
'endpoint': 'http://api.example.org/analyze',
'project': 'dummy'},
datadir=app.config['DATADIR'])
Expand Down
Loading

0 comments on commit 244db95

Please sign in to comment.