From e48a2fc532491039c6eaafec41e27e28b4222634 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 17 Aug 2022 15:20:16 +0300 Subject: [PATCH] Store vocabs in AnnifRegistry so they are shared between projects. Fixes #603 --- annif/project.py | 6 ++---- annif/registry.py | 47 +++++++++++++++++++++++++++++++++------------ annif/vocab.py | 13 ------------- tests/test_vocab.py | 4 ++-- 4 files changed, 39 insertions(+), 31 deletions(-) diff --git a/annif/project.py b/annif/project.py index 13f470fcb..05fa7353f 100644 --- a/annif/project.py +++ b/annif/project.py @@ -9,7 +9,6 @@ import annif.corpus import annif.suggestion import annif.backend -import annif.vocab from annif.datadir import DatadirMixin from annif.exception import AnnifException, ConfigurationException, \ NotSupportedException, NotInitializedException @@ -155,9 +154,8 @@ def vocab(self): if self.vocab_spec is None: raise ConfigurationException("vocab setting is missing", project_id=self.project_id) - self._vocab = annif.vocab.get_vocab(self.vocab_spec, - self._base_datadir, - self.language) + self._vocab = self.registry.get_vocab(self.vocab_spec, + self.language) return self._vocab diff --git a/annif/registry.py b/annif/registry.py index 878ccfbdb..a56340ab0 100644 --- a/annif/registry.py +++ b/annif/registry.py @@ -1,35 +1,41 @@ """Registry that keeps track of Annif projects""" import collections +import re from flask import current_app import annif from annif.config import parse_config from annif.project import Access, AnnifProject +from annif.vocab import AnnifVocabulary +from annif.util import parse_args logger = annif.logger class AnnifRegistry: - """Class that keeps track of the Annif projects""" - - # Note: The individual projects are stored in a shared static variable, - # keyed by the "registry ID" which is unique to the registry instance. - # This is done to make it possible to serialize AnnifRegistry instances - # without including the potentially huge project objects (which contain - # backends with large models, vocabularies with lots of concepts etc). - # Serialized AnnifRegistry instances can then be passed between - # processes when using the multiprocessing module. + """Class that keeps track of the Annif projects and vocabularies""" + + # Note: The individual projects and vocabularies are stored in shared + # static variables, keyed by the "registry ID" which is unique to the + # registry instance. This is done to make it possible to serialize + # AnnifRegistry instances without including the potentially huge objects + # (which contain backends with large models, vocabularies with lots of + # concepts etc). Serialized AnnifRegistry instances can then be passed + # between processes when using the multiprocessing module. _projects = {} + _vocabs = {} def __init__(self, projects_config_path, datadir, init_projects): self._rid = id(self) + self._datadir = datadir self._projects[self._rid] = \ - self._create_projects(projects_config_path, datadir) + self._create_projects(projects_config_path) + self._vocabs[self._rid] = {} if init_projects: for project in self._projects[self._rid].values(): project.initialize() - def _create_projects(self, projects_config_path, datadir): + def _create_projects(self, projects_config_path): # parse the configuration config = parse_config(projects_config_path) @@ -42,7 +48,7 @@ def _create_projects(self, projects_config_path, datadir): for project_id in config.project_ids: projects[project_id] = AnnifProject(project_id, config[project_id], - datadir, + self._datadir, self) return projects @@ -64,6 +70,23 @@ def get_project(self, project_id, min_access=Access.private): except KeyError: raise ValueError("No such project {}".format(project_id)) + def get_vocab(self, vocab_spec, default_language): + """Return an AnnifVocabulary corresponding to the vocab_spec. If no + language information is specified, use the given default language.""" + match = re.match(r'(\w+)(\((.*)\))?', vocab_spec) + if match is None: + raise ValueError( + f"Invalid vocabulary specification: {vocab_spec}") + vocab_id = match.group(1) + posargs, kwargs = parse_args(match.group(3)) + language = posargs[0] if posargs else default_language + vocab_key = (vocab_id, language) + + if vocab_key not in self._vocabs[self._rid]: + self._vocabs[self._rid][vocab_key] = AnnifVocabulary( + vocab_id, self._datadir, language) + return self._vocabs[self._rid][vocab_key] + def initialize_projects(app): projects_config_path = app.config['PROJECTS_CONFIG_PATH'] diff --git a/annif/vocab.py b/annif/vocab.py index da57ec6cb..9605bdf9b 100644 --- a/annif/vocab.py +++ b/annif/vocab.py @@ -1,28 +1,15 @@ """Vocabulary management functionality for Annif""" import os.path -import re import annif import annif.corpus import annif.util from annif.datadir import DatadirMixin from annif.exception import NotInitializedException -from annif.util import parse_args logger = annif.logger -def get_vocab(vocab_spec, datadir, default_language): - match = re.match(r'(\w+)(\((.*)\))?', vocab_spec) - if match is None: - raise ValueError(f"Invalid vocabulary specification: {vocab_spec}") - vocab_id = match.group(1) - posargs, kwargs = parse_args(match.group(3)) - language = posargs[0] if posargs else default_language - - return AnnifVocabulary(vocab_id, datadir, language) - - class AnnifVocabulary(DatadirMixin): """Class representing a subject vocabulary which can be used by multiple Annif projects.""" diff --git a/tests/test_vocab.py b/tests/test_vocab.py index 68f0cb983..02225ce16 100644 --- a/tests/test_vocab.py +++ b/tests/test_vocab.py @@ -19,9 +19,9 @@ def load_dummy_vocab(tmpdir): return vocab -def test_get_vocab_invalid(): +def test_get_vocab_invalid(registry): with pytest.raises(ValueError) as excinfo: - annif.vocab.get_vocab('', None, None) + registry.get_vocab('', None) assert 'Invalid vocabulary specification' in str(excinfo.value)