Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Issue273 default values for configuration settings #324

Merged
merged 22 commits into from
Sep 30, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
aa592d8
Default project name to project_id
juhoinkinen Aug 28, 2019
6480bd8
Fill missing parameters with defaults when creating backend objects
juhoinkinen Aug 29, 2019
7146977
Switch info about using default params to debug level
juhoinkinen Aug 29, 2019
4ef0b5a
Get all parent/ancestor classes instead of only direct ones
juhoinkinen Aug 29, 2019
f52a8bc
Use type to get class and thus avoid the magic method
juhoinkinen Aug 30, 2019
f8825ed
Refine debug message and docstring
juhoinkinen Aug 30, 2019
bf58247
Simpler & more controllable parameter defaulting
juhoinkinen Sep 4, 2019
ff7969b
Add default params for fasttext and PAV
juhoinkinen Sep 5, 2019
089ac38
Debug msg for using default params to one line
juhoinkinen Sep 6, 2019
0104fbc
Use default params for VW multi and ensemble
juhoinkinen Sep 6, 2019
2781087
Make fill_params_with_defaults testable; use it to set params in the …
juhoinkinen Sep 6, 2019
2d9a891
Default params for dummy BE and a test for params filling with it
juhoinkinen Sep 11, 2019
a215af9
Tests for backend params defaulting
juhoinkinen Sep 18, 2019
0fb125c
Skip test when fasttext not available
juhoinkinen Sep 18, 2019
c037815
Make params a property with defaults setting
juhoinkinen Sep 24, 2019
1843332
From SectionProxy obj to dict to be able to use .copy()
juhoinkinen Sep 24, 2019
63a199c
Rename params originating from projects.cfg for clarity
juhoinkinen Sep 24, 2019
420d3d1
Defaulting simply by updating default params with config params
juhoinkinen Sep 25, 2019
0df9735
Merge branch 'master' into issue273-default-values-for-configuration-…
juhoinkinen Sep 26, 2019
91136a6
Adapt to PR #322
juhoinkinen Sep 26, 2019
157ecab
Make config_params a field and params a property
juhoinkinen Sep 30, 2019
d22cd2d
Drop copying of default_params
juhoinkinen Sep 30, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions annif/backend/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,25 @@ class AnnifBackend(metaclass=abc.ABCMeta):
needs_subject_index = False
needs_subject_vectorizer = False

def __init__(self, backend_id, params, datadir):
DEFAULT_PARAMS = {'limit': 100}

def __init__(self, backend_id, config_params, datadir):
"""Initialize backend with specific parameters. The
parameters are a dict. Keys and values depend on the specific
backend type."""
self.backend_id = backend_id
self.params = params
self.datadir = datadir
self.config_params = config_params

def default_params(self):
return self.DEFAULT_PARAMS

@property
def params(self):
params = {}
params.update(self.default_params())
params.update(self.config_params)
return params

def train(self, corpus, project):
"""train the model on the given document or subject corpus"""
Expand Down
3 changes: 3 additions & 0 deletions annif/backend/dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ class DummyBackend(backend.AnnifLearningBackend):
uri = 'http://example.org/dummy'
label = 'dummy'

def default_params(self):
return backend.AnnifBackend.DEFAULT_PARAMS

def initialize(self):
self.initialized = True

Expand Down
13 changes: 13 additions & 0 deletions annif/backend/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,25 @@ class FastTextBackend(mixins.ChunkingBackend, backend.AnnifBackend):
't': float
}

DEFAULT_PARAMS = {
'dim': 100,
'lr': 0.25,
'epoch': 5,
'loss': 'hs',
}

MODEL_FILE = 'fasttext-model'
TRAIN_FILE = 'fasttext-train.txt'

# defaults for uninitialized instances
_model = None

def default_params(self):
params = backend.AnnifBackend.DEFAULT_PARAMS.copy()
params.update(mixins.ChunkingBackend.DEFAULT_PARAMS)
params.update(self.DEFAULT_PARAMS)
return params

def initialize(self):
if self._model is None:
path = os.path.join(self.datadir, self.MODEL_FILE)
Expand Down
5 changes: 5 additions & 0 deletions annif/backend/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@
class ChunkingBackend(metaclass=abc.ABCMeta):
"""Annif backend mixin that implements chunking of input"""

DEFAULT_PARAMS = {'chunksize': 1}

def default_params(self):
return self.DEFAULT_PARAMS

@abc.abstractmethod
def _suggest_chunks(self, chunktexts, project):
"""Suggest subjects for the chunked text; should be implemented by
Expand Down
2 changes: 2 additions & 0 deletions annif/backend/pav.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ class PAVBackend(ensemble.EnsembleBackend):
# defaults for uninitialized instances
_models = None

DEFAULT_PARAMS = {'min-docs': 10}

def initialize(self):
if self._models is not None:
return # already initialized
Expand Down
15 changes: 12 additions & 3 deletions annif/backend/vw_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import numpy as np
from annif.exception import NotInitializedException
from annif.suggestion import VectorSuggestionResult
from . import backend
from . import vw_base
from . import ensemble

Expand Down Expand Up @@ -42,7 +43,16 @@ class VWEnsembleBackend(
# a simple mean of scores. A higher value will mean that the model
# adapts quicker (and possibly makes more errors) while a lower value
# will make it more careful so that it will require more training data.
DEFAULT_DISCOUNT_RATE = 0.01

DEFAULT_PARAMS = {'discount_rate': 0.01}

def default_params(self):
params = backend.AnnifBackend.DEFAULT_PARAMS.copy()
params.update(self.DEFAULT_PARAMS)
params.update({param: default_val
for param, (_, default_val) in self.VW_PARAMS.items()
if default_val is not None})
return params

def _load_subject_freq(self):
path = os.path.join(self.datadir, self.FREQ_FILE)
Expand Down Expand Up @@ -75,8 +85,7 @@ def _calculate_scores(self, subj_id, subj_score_vector):
def _merge_hits_from_sources(self, hits_from_sources, project, params):
score_vector = np.array([hits.vector
for hits, _ in hits_from_sources])
discount_rate = float(self.params.get('discount_rate',
self.DEFAULT_DISCOUNT_RATE))
discount_rate = float(self.params['discount_rate'])
result = np.zeros(score_vector.shape[1])
for subj_id in range(score_vector.shape[1]):
subj_score_vector = score_vector[:, subj_id]
Expand Down
15 changes: 13 additions & 2 deletions annif/backend/vw_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from annif.suggestion import ListSuggestionResult, VectorSuggestionResult
from annif.exception import ConfigurationException
from . import vw_base
from . import backend
from . import mixins


Expand All @@ -27,14 +28,24 @@ class VWMultiBackend(mixins.ChunkingBackend, vw_base.VWBaseBackend):
'probabilities': (bool, None)
}

DEFAULT_ALGORITHM = 'oaa'
SUPPORTED_ALGORITHMS = ('oaa', 'ect', 'log_multi', 'multilabel_oaa')

DEFAULT_INPUTS = '_text_'

DEFAULT_PARAMS = {'algorithm': 'oaa'}

def default_params(self):
params = backend.AnnifBackend.DEFAULT_PARAMS.copy()
params.update(mixins.ChunkingBackend.DEFAULT_PARAMS)
params.update(self.DEFAULT_PARAMS)
params.update({param: default_val
for param, (_, default_val) in self.VW_PARAMS.items()
if default_val is not None})
return params

@property
def algorithm(self):
algorithm = self.params.get('algorithm', self.DEFAULT_ALGORITHM)
algorithm = self.params['algorithm']
if algorithm not in self.SUPPORTED_ALGORITHMS:
raise ConfigurationException(
"{} is not a valid algorithm (allowed: {})".format(
Expand Down
5 changes: 3 additions & 2 deletions annif/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class AnnifProject(DatadirMixin):
def __init__(self, project_id, config, datadir):
DatadirMixin.__init__(self, datadir, 'projects', project_id)
self.project_id = project_id
self.name = config['name']
self.name = config.get('name', project_id)
self.language = config['language']
self.analyzer_spec = config.get('analyzer', None)
self.vocab_id = config.get('vocab', None)
Expand Down Expand Up @@ -144,7 +144,8 @@ def backend(self):
try:
backend_class = annif.backend.get_backend(backend_id)
self._backend = backend_class(
backend_id, params=self.config, datadir=self.datadir)
backend_id, config_params=self.config,
datadir=self.datadir)
except ValueError:
logger.warning(
"Could not create backend %s, "
Expand Down
20 changes: 20 additions & 0 deletions tests/projects.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,26 @@ language=en
vocab=dummy
analyzer=snowball(english)

[noname]
language=en
backend=tfidf
vocab=dummy
analyzer=snowball(english)

[noparams-tfidf-fi]
name=TF-IDF Finnish using default params
language=fi
backend=tfidf
analyzer=snowball(finnish)
vocab=yso-fi

[noparams-fasttext-fi]
name=fastText Finnish using default params
language=fi
backend=fasttext
analyzer=snowball(finnish)
vocab=yso-fi

[pav]
name=PAV Ensemble Finnish
language=fi
Expand Down
13 changes: 11 additions & 2 deletions tests/test_backend.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Unit tests for backends in Annif"""

import pytest
import logging
import annif
import annif.backend
import annif.corpus
Expand All @@ -13,7 +14,7 @@ def test_get_backend_nonexistent():

def test_get_backend_dummy(app, project):
dummy_type = annif.backend.get_backend("dummy")
dummy = dummy_type(backend_id='dummy', params={},
dummy = dummy_type(backend_id='dummy', config_params={},
datadir=app.config['DATADIR'])
result = dummy.suggest(text='this is some text', project=project)
assert len(result) == 1
Expand All @@ -24,7 +25,7 @@ def test_get_backend_dummy(app, project):

def test_learn_dummy(app, project, tmpdir):
dummy_type = annif.backend.get_backend("dummy")
dummy = dummy_type(backend_id='dummy', params={},
dummy = dummy_type(backend_id='dummy', config_params={},
datadir=app.config['DATADIR'])

tmpdir.join('doc1.txt').write('doc1')
Expand All @@ -40,3 +41,11 @@ def test_learn_dummy(app, project, tmpdir):
assert result[0].uri == 'http://example.org/key1'
assert result[0].label == 'key1'
assert result[0].score == 1.0


def test_fill_params_with_defaults(app):
dummy_type = annif.backend.get_backend('dummy')
dummy = dummy_type(backend_id='dummy', config_params={},
datadir=app.config['DATADIR'])
expected_default_params = {'limit': 100} # From AnnifBackend class
assert expected_default_params == dummy.params
28 changes: 24 additions & 4 deletions tests/test_backend_fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,31 @@
fasttext = pytest.importorskip("annif.backend.fasttext")


def test_fasttext_default_params(datadir, project):
fasttext_type = annif.backend.get_backend("fasttext")
fasttext = fasttext_type(
backend_id='fasttext',
config_params={},
datadir=str(datadir))

expected_default_params = {
'limit': 100,
'chunksize': 1,
'dim': 100,
'lr': 0.25,
'epoch': 5,
'loss': 'hs',
}
actual_params = fasttext.params
for param, val in expected_default_params.items():
assert param in actual_params and actual_params[param] == val


def test_fasttext_train(datadir, document_corpus, project):
fasttext_type = annif.backend.get_backend("fasttext")
fasttext = fasttext_type(
backend_id='fasttext',
params={
config_params={
'limit': 50,
'dim': 100,
'lr': 0.25,
Expand All @@ -30,7 +50,7 @@ def test_fasttext_train_unknown_subject(tmpdir, datadir, project):
fasttext_type = annif.backend.get_backend("fasttext")
fasttext = fasttext_type(
backend_id='fasttext',
params={
config_params={
'limit': 50,
'dim': 100,
'lr': 0.25,
Expand All @@ -53,7 +73,7 @@ def test_fasttext_train_nodocuments(tmpdir, datadir, project):
fasttext_type = annif.backend.get_backend("fasttext")
fasttext = fasttext_type(
backend_id='fasttext',
params={
config_params={
'limit': 50,
'dim': 100,
'lr': 0.25,
Expand All @@ -73,7 +93,7 @@ def test_fasttext_suggest(datadir, project):
fasttext_type = annif.backend.get_backend("fasttext")
fasttext = fasttext_type(
backend_id='fasttext',
params={
config_params={
'limit': 50,
'chunksize': 1,
'dim': 100,
Expand Down
12 changes: 6 additions & 6 deletions tests/test_backend_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def test_http_suggest(app, project):
http_type = annif.backend.get_backend("http")
http = http_type(
backend_id='http',
params={
config_params={
'endpoint': 'http://api.example.org/analyze',
'project': 'dummy'},
datadir=app.config['DATADIR'])
Expand All @@ -40,7 +40,7 @@ def test_http_suggest_with_results(app, project):
http_type = annif.backend.get_backend("http")
http = http_type(
backend_id='http',
params={
config_params={
'endpoint': 'http://api.example.org/dummy/analyze',
},
datadir=app.config['DATADIR'])
Expand All @@ -63,7 +63,7 @@ def test_http_suggest_zero_score(app, project):
http_type = annif.backend.get_backend("http")
http = http_type(
backend_id='http',
params={
config_params={
'endpoint': 'http://api.example.org/analyze',
'project': 'dummy'},
datadir=app.config['DATADIR'])
Expand All @@ -79,7 +79,7 @@ def test_http_suggest_error(app, project):
http_type = annif.backend.get_backend("http")
http = http_type(
backend_id='http',
params={
config_params={
'endpoint': 'http://api.example.org/analyze',
'project': 'dummy'},
datadir=app.config['DATADIR'])
Expand All @@ -98,7 +98,7 @@ def test_http_suggest_json_fails(app, project):
http_type = annif.backend.get_backend("http")
http = http_type(
backend_id='http',
params={
config_params={
'endpoint': 'http://api.example.org/analyze',
'project': 'dummy'},
datadir=app.config['DATADIR'])
Expand All @@ -117,7 +117,7 @@ def test_http_suggest_unexpected_json(app, project):
http_type = annif.backend.get_backend("http")
http = http_type(
backend_id='http',
params={
config_params={
'endpoint': 'http://api.example.org/analyze',
'project': 'dummy'},
datadir=app.config['DATADIR'])
Expand Down
Loading