From 33895a259258c33156568415770a0486ba49268e Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 12 Dec 2024 10:11:58 +0100 Subject: [PATCH 1/2] Add regex to filter out suspicious ontology releases. --- src/hpotk/store/_api.py | 12 ++++++------ src/hpotk/store/_config.py | 8 ++++---- src/hpotk/store/_github.py | 34 +++++++++++++++++++++++----------- tests/test_store.py | 9 +++++++++ 4 files changed, 42 insertions(+), 21 deletions(-) diff --git a/src/hpotk/store/_api.py b/src/hpotk/store/_api.py index 5918cb9..37b21c6 100644 --- a/src/hpotk/store/_api.py +++ b/src/hpotk/store/_api.py @@ -82,8 +82,8 @@ class OntologyReleaseService(metaclass=abc.ABCMeta): @abc.abstractmethod def fetch_tags( - self, - ontology_type: OntologyType, + self, + ontology_type: OntologyType, ) -> typing.Iterable[str]: """ Fetch sequence of tags for an ontology. @@ -100,10 +100,10 @@ class OntologyStore: """ def __init__( - self, - store_dir: str, - ontology_release_service: OntologyReleaseService, - remote_ontology_service: RemoteOntologyService, + self, + store_dir: str, + ontology_release_service: OntologyReleaseService, + remote_ontology_service: RemoteOntologyService, ): self._logger = logging.getLogger(__name__) self._store_dir = store_dir diff --git a/src/hpotk/store/_config.py b/src/hpotk/store/_config.py index 104a8b4..94b7f61 100644 --- a/src/hpotk/store/_config.py +++ b/src/hpotk/store/_config.py @@ -9,9 +9,9 @@ def configure_ontology_store( - store_dir: typing.Optional[str] = None, - ontology_release_service: OntologyReleaseService = GitHubOntologyReleaseService(), - remote_ontology_service: RemoteOntologyService = GitHubRemoteOntologyService(), + store_dir: typing.Optional[str] = None, + ontology_release_service: OntologyReleaseService = GitHubOntologyReleaseService(), + remote_ontology_service: RemoteOntologyService = GitHubRemoteOntologyService(), ) -> OntologyStore: """ Configure and create the default ontology store. 
@@ -28,7 +28,7 @@ def configure_ontology_store( store_dir = get_default_ontology_store_dir() else: if not os.path.isdir(store_dir): - raise ValueError(f'`store_dir` must point to an existing directory') + raise ValueError('`store_dir` must point to an existing directory') return OntologyStore( store_dir=store_dir, ontology_release_service=ontology_release_service, diff --git a/src/hpotk/store/_github.py b/src/hpotk/store/_github.py index 51f221e..cdebb75 100644 --- a/src/hpotk/store/_github.py +++ b/src/hpotk/store/_github.py @@ -1,6 +1,7 @@ import io import json import logging +import re import ssl import typing from urllib.request import urlopen @@ -14,18 +15,23 @@ OntologyType.HPO: { 'owner': 'obophenotype', 'repo': 'human-phenotype-ontology', + 'tag_pt': r'^v(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})$', }, OntologyType.MAxO: { 'owner': 'monarch-initiative', 'repo': 'MAxO', + 'tag_pt': r'^v(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})$', }, OntologyType.MONDO: { 'owner': 'monarch-initiative', 'repo': 'mondo', + 'tag_pt': r'^v(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})$', }, } """ -The default ontology credentials that only include HPO at the time. +The default ontology credentials that only include HPO, MAxO, and MONDO at this time. + +The tag pattern ensures we only include the "production" tags (e.g. not `2024-12-12X`). 
""" @@ -35,9 +41,9 @@ class GitHubOntologyReleaseService(OntologyReleaseService): """ def __init__( - self, - timeout: int = 10, - ontology_credentials: typing.Mapping[OntologyType, typing.Mapping[str, str]] = ONTOLOGY_CREDENTIALS, + self, + timeout: int = 10, + ontology_credentials: typing.Mapping[OntologyType, typing.Mapping[str, str]] = ONTOLOGY_CREDENTIALS, ): self._logger = logging.getLogger(__name__) self._timeout = timeout @@ -56,20 +62,22 @@ def fetch_tags(self, ontology_type: OntologyType) -> typing.Iterable[str]: return self._get_tag_names( owner=credentials['owner'], repo=credentials['repo'], + tag_pt=credentials['tag_pt'], ) def _get_tag_names( - self, - owner: str, - repo: str, + self, + owner: str, + repo: str, + tag_pt: str, ) -> typing.Iterable[str]: tag_url = self._tag_api_url.format(owner=owner, repo=repo) self._logger.debug('Pulling tag from %s', tag_url) with urlopen( - tag_url, - timeout=self._timeout, - context=self._ctx, + tag_url, + timeout=self._timeout, + context=self._ctx, ) as fh: tags = json.load(fh) @@ -78,7 +86,11 @@ def _get_tag_names( else: self._logger.debug('Fetched %d tags', len(tags)) - return (tag['name'] for tag in tags) + pattern = re.compile(tag_pt) + return filter( + lambda tag: pattern.match(tag), + (tag['name'] for tag in tags), + ) class GitHubRemoteOntologyService(RemoteOntologyService): diff --git a/tests/test_store.py b/tests/test_store.py index 20be2fb..7c64d51 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -173,6 +173,15 @@ def initialize_store_dir(store_dir: Path): mondo_path.joinpath("x.txt").touch() mondo_path.joinpath("y.txt").touch() + @pytest.mark.skip("Just for manual debugging") + def test_resolve_store_path__latest( + self, + ontology_store: hpotk.OntologyStore, + ): + latest = ontology_store.resolve_store_path(hpotk.store.OntologyType.HPO) + print(latest) + + @pytest.mark.online class TestGitHubOntologyStoreOnline: """ From f72d19dac0b2f7d1d1b0b20b002b348d87ccced7 Mon Sep 17 00:00:00 2001 
From: Daniel Danis Date: Thu, 12 Dec 2024 10:16:03 +0100 Subject: [PATCH 2/2] Extract the pattern to a variable. --- src/hpotk/store/_github.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/hpotk/store/_github.py b/src/hpotk/store/_github.py index cdebb75..6d05277 100644 --- a/src/hpotk/store/_github.py +++ b/src/hpotk/store/_github.py @@ -11,27 +11,31 @@ from ._api import OntologyType, OntologyReleaseService, RemoteOntologyService +production_tag_pt = r'^v(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})$' +""" +A tag pattern to ensure we only include the "production" tags (e.g. not `v2024-12-12X`). +""" + + ONTOLOGY_CREDENTIALS = { OntologyType.HPO: { 'owner': 'obophenotype', 'repo': 'human-phenotype-ontology', - 'tag_pt': r'^v(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})$', + 'tag_pt': production_tag_pt, }, OntologyType.MAxO: { 'owner': 'monarch-initiative', 'repo': 'MAxO', - 'tag_pt': r'^v(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})$', + 'tag_pt': production_tag_pt, }, OntologyType.MONDO: { 'owner': 'monarch-initiative', 'repo': 'mondo', - 'tag_pt': r'^v(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})$', + 'tag_pt': production_tag_pt, }, } """ The default ontology credentials that only include HPO, MAxO, and MONDO at this time. - -The tag pattern ensures we only include the "production" tags (e.g. not `2024-12-12X`). """