Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue273 default values for configuration settings #324

Merged
merged 22 commits into from
Sep 30, 2019
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
aa592d8
Default project name to project_id
juhoinkinen Aug 28, 2019
6480bd8
Fill missing parameters with defaults when creating backend objects
juhoinkinen Aug 29, 2019
7146977
Switch info about using default params to debug level
juhoinkinen Aug 29, 2019
4ef0b5a
Get all parent/ancestor classes instead of only direct ones
juhoinkinen Aug 29, 2019
f52a8bc
Use type to get class and thus avoid the magic method
juhoinkinen Aug 30, 2019
f8825ed
Refine debug message and docstring
juhoinkinen Aug 30, 2019
bf58247
Simpler & more controllable parameter defaulting
juhoinkinen Sep 4, 2019
ff7969b
Add default params for fasttext and PAV
juhoinkinen Sep 5, 2019
089ac38
Debug msg for using default params to one line
juhoinkinen Sep 6, 2019
0104fbc
Use default params for VW multi and ensemble
juhoinkinen Sep 6, 2019
2781087
Make fill_params_with_defaults testable; use it to set params in the …
juhoinkinen Sep 6, 2019
2d9a891
Default params for dummy BE and a test for params filling with it
juhoinkinen Sep 11, 2019
a215af9
Tests for backend params defaulting
juhoinkinen Sep 18, 2019
0fb125c
Skip test when fasttext not available
juhoinkinen Sep 18, 2019
c037815
Make params a property with defaults setting
juhoinkinen Sep 24, 2019
1843332
From SectionProxy obj to dict to be able to use .copy()
juhoinkinen Sep 24, 2019
63a199c
Rename params originating from projects.cfg for clarity
juhoinkinen Sep 24, 2019
420d3d1
Defaulting simply by updating default params with config params
juhoinkinen Sep 25, 2019
0df9735
Merge branch 'master' into issue273-default-values-for-configuration-…
juhoinkinen Sep 26, 2019
91136a6
Adapt to PR #322
juhoinkinen Sep 26, 2019
157ecab
Make config_params a field and params a property
juhoinkinen Sep 30, 2019
d22cd2d
Drop copying of default_params
juhoinkinen Sep 30, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion annif/backend/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,30 @@ class AnnifBackend(metaclass=abc.ABCMeta):
needs_subject_index = False
needs_subject_vectorizer = False

DEFAULT_PARAMS = {'limit': 100}

def __init__(self, backend_id, params, datadir):
"""Initialize backend with specific parameters. The
parameters are a dict. Keys and values depend on the specific
backend type."""
self.backend_id = backend_id
self.params = params
self.datadir = datadir
self.params = self.fill_params_with_defaults(params)

def default_params(self):
    """Return the default parameters for this backend type. Subclasses
    override this to merge their own DEFAULT_PARAMS into the result."""
    return self.DEFAULT_PARAMS

def fill_params_with_defaults(self, params):
Copy link
Member Author

@juhoinkinen juhoinkinen Sep 18, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would the params field actually be better to be implemented as a property, and then this would be decorated with @params.setter?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like the idea of params being a property. It could be implemented in such a way that it transparently fills in default parameters before returning a dict. And it could have a setter used to set manual parameters (which would be stored in an internal field, say _params). Actually, the filling in could also be done within the setter - then it would only happen once instead of on every access. Accessing the params property would then just return self._params or perhaps a copy() to be safe.

"""Set the parameters that are not provided in the projects config file
with default values set in the backend class."""
params_to_set = {param: str(val)
for param, val in self.default_params().items()
if param not in params}
if params_to_set:
self.debug('all parameters not set, using following defaults: {}'
.format(params_to_set))
params.update(params_to_set)
return params

def train(self, corpus, project):
"""train the model on the given document or subject corpus"""
Expand Down
4 changes: 4 additions & 0 deletions annif/backend/dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ class DummyBackend(backend.AnnifLearningBackend):
uri = 'http://example.org/dummy'
label = 'dummy'

def default_params(self):
    """Return a copy of the generic AnnifBackend defaults; the dummy
    backend defines no extra parameters of its own."""
    return dict(backend.AnnifBackend.DEFAULT_PARAMS)

def initialize(self):
self.initialized = True

Expand Down
13 changes: 13 additions & 0 deletions annif/backend/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,25 @@ class FastTextBackend(mixins.ChunkingBackend, backend.AnnifBackend):
't': float
}

DEFAULT_PARAMS = {
'dim': 100,
'lr': 0.25,
'epoch': 5,
'loss': 'hs',
}

MODEL_FILE = 'fasttext-model'
TRAIN_FILE = 'fasttext-train.txt'

# defaults for uninitialized instances
_model = None

def default_params(self):
    """Combine generic backend defaults, chunking-mixin defaults and the
    fastText-specific defaults; later sources take precedence."""
    defaults = dict(backend.AnnifBackend.DEFAULT_PARAMS)
    defaults.update(mixins.ChunkingBackend.DEFAULT_PARAMS)
    defaults.update(self.DEFAULT_PARAMS)
    return defaults

def initialize(self):
if self._model is None:
path = os.path.join(self.datadir, self.MODEL_FILE)
Expand Down
5 changes: 5 additions & 0 deletions annif/backend/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@
class ChunkingBackend(metaclass=abc.ABCMeta):
"""Annif backend mixin that implements chunking of input"""

DEFAULT_PARAMS = {'chunksize': 1}

def default_params(self):
    """Return the chunking-related default parameters."""
    return self.DEFAULT_PARAMS

@abc.abstractmethod
def _suggest_chunks(self, chunktexts, project):
"""Suggest subjects for the chunked text; should be implemented by
Expand Down
2 changes: 2 additions & 0 deletions annif/backend/pav.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ class PAVBackend(ensemble.EnsembleBackend):
# defaults for uninitialized instances
_models = None

DEFAULT_PARAMS = {'min-docs': 10}

def initialize(self):
if self._models is not None:
return # already initialized
Expand Down
15 changes: 12 additions & 3 deletions annif/backend/vw_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import numpy as np
from annif.exception import NotInitializedException
from annif.suggestion import VectorSuggestionResult
from . import backend
from . import vw_base
from . import ensemble

Expand Down Expand Up @@ -42,7 +43,16 @@ class VWEnsembleBackend(
# a simple mean of scores. A higher value will mean that the model
# adapts quicker (and possibly makes more errors) while a lower value
# will make it more careful so that it will require more training data.
DEFAULT_DISCOUNT_RATE = 0.01

DEFAULT_PARAMS = {'discount_rate': 0.01}

def default_params(self):
    """Merge the generic backend defaults with this backend's own
    defaults and the declared VW parameter defaults (later wins)."""
    defaults = dict(backend.AnnifBackend.DEFAULT_PARAMS)
    defaults.update(self.DEFAULT_PARAMS)
    # VW_PARAMS maps name -> (type, default); only declared defaults apply
    for name, (_, default) in self.VW_PARAMS.items():
        if default is not None:
            defaults[name] = default
    return defaults

def _load_subject_freq(self):
path = os.path.join(self.datadir, self.FREQ_FILE)
Expand Down Expand Up @@ -75,8 +85,7 @@ def _calculate_scores(self, subj_id, subj_score_vector):
def _merge_hits_from_sources(self, hits_from_sources, project, params):
score_vector = np.array([hits.vector
for hits, _ in hits_from_sources])
discount_rate = float(self.params.get('discount_rate',
self.DEFAULT_DISCOUNT_RATE))
discount_rate = float(self.params['discount_rate'])
result = np.zeros(score_vector.shape[1])
for subj_id in range(score_vector.shape[1]):
subj_score_vector = score_vector[:, subj_id]
Expand Down
15 changes: 13 additions & 2 deletions annif/backend/vw_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from annif.suggestion import ListSuggestionResult, VectorSuggestionResult
from annif.exception import ConfigurationException
from . import vw_base
from . import backend
from . import mixins


Expand All @@ -27,14 +28,24 @@ class VWMultiBackend(mixins.ChunkingBackend, vw_base.VWBaseBackend):
'probabilities': (bool, None)
}

DEFAULT_ALGORITHM = 'oaa'
SUPPORTED_ALGORITHMS = ('oaa', 'ect', 'log_multi', 'multilabel_oaa')

DEFAULT_INPUTS = '_text_'

DEFAULT_PARAMS = {'algorithm': 'oaa'}

def default_params(self):
    """Collect defaults from the generic backend, the chunking mixin,
    this class and the VW parameter table; later sources win."""
    vw_defaults = {name: default
                   for name, (_, default) in self.VW_PARAMS.items()
                   if default is not None}
    defaults = dict(backend.AnnifBackend.DEFAULT_PARAMS)
    for source in (mixins.ChunkingBackend.DEFAULT_PARAMS,
                   self.DEFAULT_PARAMS,
                   vw_defaults):
        defaults.update(source)
    return defaults

@property
def algorithm(self):
algorithm = self.params.get('algorithm', self.DEFAULT_ALGORITHM)
algorithm = self.params['algorithm']
if algorithm not in self.SUPPORTED_ALGORITHMS:
raise ConfigurationException(
"{} is not a valid algorithm (allowed: {})".format(
Expand Down
2 changes: 1 addition & 1 deletion annif/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class AnnifProject(DatadirMixin):
def __init__(self, project_id, config, datadir):
DatadirMixin.__init__(self, datadir, 'projects', project_id)
self.project_id = project_id
self.name = config['name']
self.name = config.get('name', project_id)
self.language = config['language']
self.analyzer_spec = config.get('analyzer', None)
self.vocab_id = config.get('vocab', None)
Expand Down
20 changes: 20 additions & 0 deletions tests/projects.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,26 @@ language=en
vocab=dummy
analyzer=snowball(english)

[noname]
language=en
backend=tfidf
vocab=dummy
analyzer=snowball(english)

[noparams-tfidf-fi]
name=TF-IDF Finnish using default params
language=fi
backend=tfidf
analyzer=snowball(finnish)
vocab=yso-fi

[noparams-fasttext-fi]
name=fastText Finnish using default params
language=fi
backend=fasttext
analyzer=snowball(finnish)
vocab=yso-fi

[pav]
name=PAV Ensemble Finnish
language=fi
Expand Down
14 changes: 14 additions & 0 deletions tests/test_backend.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Unit tests for backends in Annif"""

import pytest
import logging
import annif
import annif.backend
import annif.corpus
Expand Down Expand Up @@ -40,3 +41,16 @@ def test_learn_dummy(app, project, tmpdir):
assert result[0].uri == 'http://example.org/key1'
assert result[0].label == 'key1'
assert result[0].score == 1.0


def test_fill_params_with_defaults(app, caplog):
    """An empty params dict is filled with AnnifBackend defaults and the
    defaulting is reported at DEBUG level."""
    annif.logger.propagate = True
    backend_type = annif.backend.get_backend('dummy')
    backend_obj = backend_type(backend_id='dummy', params={},
                               datadir=app.config['DATADIR'])
    caplog.set_level(logging.DEBUG)
    # 'limit' comes from the AnnifBackend base class
    assert backend_obj.fill_params_with_defaults({}) == {'limit': '100'}
    assert ('all parameters not set, using following defaults:'
            in caplog.records[0].message)
27 changes: 27 additions & 0 deletions tests/test_backend_fasttext.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,39 @@
"""Unit tests for the fastText backend in Annif"""

import logging
import pytest
import annif.backend
import annif.corpus

fasttext = pytest.importorskip("annif.backend.fasttext")


def test_fasttext_default_params(datadir, project, caplog):
    """Creating a fastText backend without params fills in the defaults
    from AnnifBackend, the ChunkingBackend mixin and FastTextBackend,
    and logs the defaulting at DEBUG level."""
    logger = annif.logger
    logger.propagate = True
    caplog.set_level(logging.DEBUG)

    fasttext_type = annif.backend.get_backend("fasttext")
    # NOTE: do not name this local 'fasttext' -- that would shadow the
    # module-level alias created by pytest.importorskip at the top of
    # this file.
    fasttext_backend = fasttext_type(
        backend_id='fasttext',
        params={},
        datadir=str(datadir))

    expected_default_params = {
        'limit': 100,      # from AnnifBackend
        'chunksize': 1,    # from ChunkingBackend mixin
        'dim': 100,
        'lr': 0.25,
        'epoch': 5,
        'loss': 'hs',
    }
    expected_msg = "all parameters not set, using following defaults:"
    assert expected_msg in caplog.records[0].message
    actual_params = fasttext_backend.params
    # params are stored as strings, so compare against str(val)
    for param, val in expected_default_params.items():
        assert param in actual_params and actual_params[param] == str(val)


def test_fasttext_train(datadir, document_corpus, project):
fasttext_type = annif.backend.get_backend("fasttext")
fasttext = fasttext_type(
Expand Down
22 changes: 22 additions & 0 deletions tests/test_backend_pav.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,31 @@
"""Unit tests for the PAV backend in Annif"""

import logging
import annif.backend
import annif.corpus


def test_pav_default_params(datadir, document_corpus, project, caplog):
    """A PAV backend created without params falls back to its defaults
    and reports the defaulting at DEBUG level."""
    annif.logger.propagate = True
    caplog.set_level(logging.DEBUG)

    backend_type = annif.backend.get_backend("pav")
    backend_obj = backend_type(
        backend_id='pav',
        params={},
        datadir=str(datadir))

    expected_defaults = {
        'min-docs': 10,
    }
    assert ("all parameters not set, using following defaults:"
            in caplog.records[0].message)
    actual = backend_obj.params
    # stored params are strings, so compare against str(value)
    for name, value in expected_defaults.items():
        assert name in actual and actual[name] == str(value)


def test_pav_train(app, datadir, tmpdir, project):
pav_type = annif.backend.get_backend("pav")
pav = pav_type(
Expand Down
22 changes: 22 additions & 0 deletions tests/test_backend_tfidf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import annif.backend
import annif.corpus
from sklearn.feature_extraction.text import TfidfVectorizer
import logging
import pytest
import unittest.mock

Expand All @@ -18,6 +19,27 @@ def project(document_corpus):
return proj


def test_tfidf_default_params(datadir, project, caplog):
    """A tfidf backend created without params inherits the generic
    AnnifBackend defaults and logs the defaulting at DEBUG level."""
    annif.logger.propagate = True
    caplog.set_level(logging.DEBUG)

    backend_type = annif.backend.get_backend("tfidf")
    backend_obj = backend_type(
        backend_id='tfidf',
        params={},
        datadir=str(datadir))

    expected_defaults = {
        'limit': 100,  # from the AnnifBackend base class
    }
    assert ("all parameters not set, using following defaults:"
            in caplog.records[0].message)
    actual = backend_obj.params
    # stored params are strings, so compare against str(value)
    for name, value in expected_defaults.items():
        assert name in actual and actual[name] == str(value)


def test_tfidf_train(datadir, document_corpus, project):
tfidf_type = annif.backend.get_backend("tfidf")
tfidf = tfidf_type(
Expand Down
24 changes: 24 additions & 0 deletions tests/test_backend_vw_ensemble.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Unit tests for the vw_ensemble backend in Annif"""

import logging
import json
import time
import pytest
Expand All @@ -11,6 +12,29 @@
pytest.importorskip("annif.backend.vw_ensemble")


def test_vw_ensemble_default_params(datadir, project, caplog):
    """A vw_ensemble backend created without params gets defaults from
    the base class, its own DEFAULT_PARAMS and the VW parameter table."""
    annif.logger.propagate = True
    caplog.set_level(logging.DEBUG)

    backend_type = annif.backend.get_backend("vw_ensemble")
    backend_obj = backend_type(
        backend_id='vw_ensemble',
        params={},
        datadir=str(datadir))

    expected_defaults = {
        'limit': 100,              # from AnnifBackend
        'discount_rate': 0.01,     # from VWEnsembleBackend
        'loss_function': 'squared',  # from the VW parameter table
    }
    assert ("all parameters not set, using following defaults:"
            in caplog.records[0].message)
    actual = backend_obj.params
    # stored params are strings, so compare against str(value)
    for name, value in expected_defaults.items():
        assert name in actual and actual[name] == str(value)


def test_vw_ensemble_suggest_no_model(datadir, project):
vw_ensemble_type = annif.backend.get_backend('vw_ensemble')
vw_ensemble = vw_ensemble_type(
Expand Down
Loading