From ecd23093afd235691d42416cf6c1e1e440e7c747 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 13 Mar 2024 11:03:37 -0400 Subject: [PATCH 01/13] Next development iteration `0.5.1dev0`. --- recipe/meta.yaml | 2 +- src/hpotk/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/recipe/meta.yaml b/recipe/meta.yaml index 3d47f9d..353d12b 100644 --- a/recipe/meta.yaml +++ b/recipe/meta.yaml @@ -1,5 +1,5 @@ {% set name = "hpo-toolkit" %} -{% set version = "0.5.0" %} +{% set version = "0.5.1dev0" %} package: name: {{ name|lower }} diff --git a/src/hpotk/__init__.py b/src/hpotk/__init__.py index a91f2cf..cfae680 100644 --- a/src/hpotk/__init__.py +++ b/src/hpotk/__init__.py @@ -2,7 +2,7 @@ HPO toolkit is a library for working with Human Phenotype Ontology and the HPO annotation data. """ -__version__ = "0.5.0" +__version__ = "0.5.1dev0" from . import algorithm from . import annotations From 2bacfc157bc450fec2bf8403d8308e983bb00324 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 13 Mar 2024 11:17:04 -0400 Subject: [PATCH 02/13] Add release badge to `README.md`. 
--- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ffc293f..3621bb9 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,9 @@ # hpo-toolkit -![Build status](https://img.shields.io/github/actions/workflow/status/TheJacksonLaboratory/hpo-toolkit/python_ci.yml) -![PyPi downloads](https://img.shields.io/pypi/dm/hpo-toolkit.svg?label=Pypi%20downloads) ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/hpo-toolkit) +![PyPi downloads](https://img.shields.io/pypi/dm/hpo-toolkit.svg?label=Pypi%20downloads) +![Build status](https://img.shields.io/github/actions/workflow/status/TheJacksonLaboratory/hpo-toolkit/python_ci.yml) +[![GitHub release](https://img.shields.io/github/release/TheJacksonLaboratory/hpo-toolkit.svg)](https://github.com/TheJacksonLaboratory/hpo-toolkit/releases) A toolkit for working with Human Phenotype Ontology (HPO) and HPO disease annotations in Python. From c6b77589d784a3b7c368646731fbb47a819f8a06 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 21 Mar 2024 15:41:33 -0400 Subject: [PATCH 03/13] Support providing custom stream to the logger. --- src/hpotk/util/_log.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/hpotk/util/_log.py b/src/hpotk/util/_log.py index ad2eb86..b12067b 100644 --- a/src/hpotk/util/_log.py +++ b/src/hpotk/util/_log.py @@ -1,10 +1,14 @@ import logging +import typing DEFAULT_LOG_FMT = '%(asctime)s %(name)-20s %(levelname)-3s : %(message)s' -def setup_logging(level: int = logging.INFO, - log_fmt: str = DEFAULT_LOG_FMT): +def setup_logging( + level: int = logging.INFO, + log_fmt: str = DEFAULT_LOG_FMT, + stream: typing.Optional[typing.TextIO] = None, +): """ Create a basic configuration for the logging library. Set up console and file handler using provided `log_fmt`. @@ -19,12 +23,13 @@ def setup_logging(level: int = logging.INFO, :param level: the verbosity to use, `INFO` by default. 
:param log_fmt: format string for logging. + :param stream: stream to write to. Will default to `sys.stderr` if `None`. """ # create logger logger = logging.getLogger() logger.setLevel(level) # create console handler and set level to debug - ch = logging.StreamHandler() + ch = logging.StreamHandler(stream=stream) ch.setLevel(level) # create formatter formatter = logging.Formatter(log_fmt) From 855fa50818555f00a062106e6f27e3e178b14394 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Mon, 25 Mar 2024 09:33:38 -0400 Subject: [PATCH 04/13] Use pytest runner to run all tests: unit tests, integration tests, documentation tests, and tutorials. --- docs/user-guide/sort-term-ids.rst | 48 ++++---- docs/user-guide/use-hierarchy.rst | 39 +++--- .../validate-phenotypic-features.rst | 111 ++++++++---------- pytest.ini | 9 +- src/hpotk/ontology/_api.py | 100 +++++++++------- 5 files changed, 153 insertions(+), 154 deletions(-) diff --git a/docs/user-guide/sort-term-ids.rst b/docs/user-guide/sort-term-ids.rst index 4bd5108..4f33ce1 100644 --- a/docs/user-guide/sort-term-ids.rst +++ b/docs/user-guide/sort-term-ids.rst @@ -10,16 +10,15 @@ HPO toolkit provides logic for sorting HPO terms such that the similar terms are Let's illustrate this on example. Suppose having a subject annotated with the following terms: -.. doctest:: sort-term-ids - - >>> subject = ( - ... 'HP:0001744', # Splenomegaly - ... 'HP:0020221', # Clonic seizure - ... 'HP:0001238', # Slender finger - ... 'HP:0011153', # Focal motor seizure - ... 'HP:0002240' # Hepatomegaly - ... ) - >>> term_ids = tuple(TermId.from_curie(curie) for curie in subject) +>>> import hpotk +>>> subject = ( +... 'HP:0001744', # Splenomegaly +... 'HP:0020221', # Clonic seizure +... 'HP:0001238', # Slender finger +... 'HP:0011153', # Focal motor seizure +... 'HP:0002240' # Hepatomegaly +... 
) +>>> term_ids = tuple(hpotk.TermId.from_curie(curie) for curie in subject) The order of HPO annotations does not reflect that *Splenomegaly* is more "similar" to *Hepatomegaly* than @@ -39,18 +38,18 @@ The algorithm iteratively chooses the most similar term ID pairs and places them We'll use a toy HPO with several terms to present the functionality: -.. doctest:: sort-term-ids +>>> import os +>>> fpath_hpo = os.path.join('docs', 'data', 'hp.toy.json') +>>> hpo = hpotk.load_minimal_ontology(fpath_hpo) - >>> from hpotk.util.sort import HierarchicalEdgeTermIdSorting - >>> hpo = hpotk.load_minimal_ontology('data/hp.toy.json') - >>> sorting = HierarchicalEdgeTermIdSorting(hpo) +>>> from hpotk.util.sort import HierarchicalEdgeTermIdSorting +>>> sorting = HierarchicalEdgeTermIdSorting(hpo) We can obtain the indices that will sort the HPO terms and prepare a `tuple` with sorted terms: -.. doctest:: sort-term-ids - >>> indices = sorting.argsort(term_ids) - >>> ordered = tuple(term_ids[idx] for idx in indices) +>>> indices = sorting.argsort(term_ids) +>>> ordered = tuple(term_ids[idx] for idx in indices) Now let's look at the order. Originally, the HPO terms were ordered as follows:: @@ -62,14 +61,13 @@ Now let's look at the order. Originally, the HPO terms were ordered as follows:: After the sorting, we get this order: -.. doctest:: sort-term-ids - >>> for term_id in ordered: - ... print(hpo.get_term(term_id).name) - Focal motor seizure - Clonic seizure - Hepatomegaly - Splenomegaly - Slender finger +>>> for term_id in ordered: +... print(hpo.get_term(term_id).name) +Focal motor seizure +Clonic seizure +Hepatomegaly +Splenomegaly +Slender finger which is much better, right? 
diff --git a/docs/user-guide/use-hierarchy.rst b/docs/user-guide/use-hierarchy.rst index 0426912..6f3c671 100644 --- a/docs/user-guide/use-hierarchy.rst +++ b/docs/user-guide/use-hierarchy.rst @@ -14,12 +14,13 @@ HPO toolkit enables accessing the ontology hierarchy through the :class:`hpotk.g in turn available through :class:`hpotk.ontology.MinimalOntology`. In other words, each ontology has the ontology graph as a property: -.. doctest:: traverse-hierarchy +>>> import os +>>> import hpotk - >>> import hpotk - >>> hpo = hpotk.load_minimal_ontology('data/hp.toy.json') - >>> hpo.graph - CsrIndexedOntologyGraph(root=HP:0000001, n_nodes=393) +>>> fpath_hpo = os.path.join('docs', 'data', 'hp.toy.json') +>>> hpo = hpotk.load_minimal_ontology(fpath_hpo) +>>> hpo.graph +CsrIndexedOntologyGraph(root=HP:0000001, n_nodes=393) We can leverage the hierarchy to infer a lot of extra information about the concepts, and, for instance, @@ -37,23 +38,19 @@ a term instead of successor/predecessors of a node. Let's illustrate this on a c We can get term IDs of the *parents* of a term, such as `Seizure `_ [`HP:0001250`] by calling: -.. doctest:: traverse-hierarchy - - >>> for parent in hpo.graph.get_parents('HP:0001250'): - ... print(parent) - HP:0012638 +>>> for parent in hpo.graph.get_parents('HP:0001250'): +... print(parent) +HP:0012638 `HP:0012638` corresponds to `Abnormal nervous system physiology `_. *Children* are accessed in a similar fashion: -.. doctest:: traverse-hierarchy - - >>> for child in hpo.graph.get_children('HP:0001250'): - ... print(child) - HP:0020219 - HP:0007359 +>>> for child in hpo.graph.get_children('HP:0001250'): +... print(child) +HP:0020219 +HP:0007359 We will leave finding the ancestors or descendants of a term as an exercise for the interested reader. @@ -66,13 +63,11 @@ ancestors/descendants of each other. We can test if Seizure [`HP:0001250`] is a parent or an ancestor of Clonic seizure [`HP:0020221`]: -.. 
doctest:: traverse-hierarchy - - >>> hpo.graph.is_parent_of('HP:0001250', 'HP:0020221') - False +>>> hpo.graph.is_parent_of('HP:0001250', 'HP:0020221') +False - >>> hpo.graph.is_ancestor_of('HP:0001250', 'HP:0020221') - True +>>> hpo.graph.is_ancestor_of('HP:0001250', 'HP:0020221') +True Similar methods exist for checking if a term is a child or a descendant of another term. diff --git a/docs/user-guide/validate-phenotypic-features.rst b/docs/user-guide/validate-phenotypic-features.rst index 05bbc16..bc81822 100644 --- a/docs/user-guide/validate-phenotypic-features.rst +++ b/docs/user-guide/validate-phenotypic-features.rst @@ -20,27 +20,24 @@ For the sake of this guide, let's assume we have an individual annotated with th * *Focal clonic seizure* * *Enuresis nocturna* -.. doctest:: check-consistency - - >>> curies = [ - ... 'HP:0001505', # Arachnodactyly - ... 'HP:0001250', # Seizure - ... 'HP:0002266', # Focal clonic seizure - ... 'HP:0010677' # Enuresis nocturna - ... ] +>>> curies = [ +... 'HP:0001505', # Arachnodactyly +... 'HP:0001250', # Seizure +... 'HP:0002266', # Focal clonic seizure +... 'HP:0010677' # Enuresis nocturna +... ] Let's convert the CURIEs into term ids: -.. doctest:: check-consistency - - >>> import hpotk - >>> term_ids = [hpotk.TermId.from_curie(curie) for curie in curies] +>>> import hpotk +>>> term_ids = [hpotk.TermId.from_curie(curie) for curie in curies] and let's finish the setup by loading the toy HPO shipped with the documentation. -.. doctest:: check-consistency - >>> hpo = hpotk.load_minimal_ontology('data/hp.toy.json') +>>> import os +>>> fpath_hpo = os.path.join('docs', 'data', 'hp.toy.json') +>>> hpo = hpotk.load_minimal_ontology(fpath_hpo) Do not use obsolete term ids @@ -55,28 +52,22 @@ and suggests the replacement. Let's create the validator and check if the phenotypic features are OK: -.. 
doctest:: check-consistency - - >>> from hpotk.validate import ObsoleteTermIdsValidator - >>> obs_val = ObsoleteTermIdsValidator(hpo) +>>> from hpotk.validate import ObsoleteTermIdsValidator +>>> obs_val = ObsoleteTermIdsValidator(hpo) - >>> vr = obs_val.validate(term_ids) +>>> vr = obs_val.validate(term_ids) The validator returns back an instance of :class:`hpotk.validate.ValidationResults` with the validation output. We can check for presence of issues in the input: -.. doctest:: check-consistency - - >>> vr.is_ok() - False +>>> vr.is_ok() +False The input is *not* OK, so we should look at the issues in greater detail: -.. doctest:: check-consistency - - >>> for validation_result in vr.results: - ... print(validation_result) - ValidationResult(level=, category='obsolete_term_id_is_used', message='Using the obsolete HP:0001505 instead of HP:0001166 for Arachnodactyly') +>>> for validation_result in vr.results: +... print(validation_result) +ValidationResult(level=, category='obsolete_term_id_is_used', message='Using the obsolete HP:0001505 instead of HP:0001166 for Arachnodactyly') We see that the `HP:0001505` is obsolete and `HP:0001166` should be used as the new *Arachnodactyly* identifier. @@ -93,14 +84,12 @@ of *Phenotypic abnormality*: Let's test that this is valid for the patient features: -.. doctest:: check-consistency +>>> from hpotk.validate import PhenotypicAbnormalityValidator +>>> pa_val = PhenotypicAbnormalityValidator(hpo) - >>> from hpotk.validate import PhenotypicAbnormalityValidator - >>> pa_val = PhenotypicAbnormalityValidator(hpo) - - >>> vr = pa_val.validate(term_ids) - >>> vr.is_ok() - True +>>> vr = pa_val.validate(term_ids) +>>> vr.is_ok() +True Yes, the all term ids represent the descendants of *Phenotypic abnormality*. @@ -123,26 +112,22 @@ of a similar kind. 
:class:`hpotk.validate.AnnotationPropagationValidator` checks if a set of terms violate the annotation propagation rule - if a collection of concepts contains a term and its ancestor. -.. doctest:: check-consistency - - >>> from hpotk.validate import AnnotationPropagationValidator - >>> ap_val = AnnotationPropagationValidator(hpo) +>>> from hpotk.validate import AnnotationPropagationValidator +>>> ap_val = AnnotationPropagationValidator(hpo) - >>> vr = ap_val.validate(term_ids) - >>> vr.is_ok() - False +>>> vr = ap_val.validate(term_ids) +>>> vr.is_ok() +False There seems to an issue. Let's break it down: -.. doctest:: check-consistency - - >>> for validation_result in vr.results: - ... print(validation_result.level) - ... print(validation_result.category) - ... print(validation_result.message) - ValidationLevel.ERROR - annotation_propagation - Terms should not contain both present Focal clonic seizure [HP:0002266] and its present or excluded ancestor Seizure [HP:0001250] +>>> for validation_result in vr.results: +... print(validation_result.level) +... print(validation_result.category) +... print(validation_result.message) +ValidationLevel.ERROR +annotation_propagation +Terms should not contain both present Focal clonic seizure [HP:0002266] and its present or excluded ancestor Seizure [HP:0001250] The validator points out that *Seizure* is an ancestor of *Focal clonic seizure* and should, therefore, not be used as an annotation of the individual. @@ -152,22 +137,20 @@ Validation pipeline For greater convenience, the validators can be integrated and run on the input at the same time: -.. 
doctest:: check-consistency - - >>> from hpotk.validate import ValidationRunner +>>> from hpotk.validate import ValidationRunner - >>> # Create a validation runner - >>> runner = ValidationRunner(validators=(obs_val, pa_val, ap_val)) +>>> # Create a validation runner +>>> runner = ValidationRunner(validators=(obs_val, pa_val, ap_val)) - >>> # Validate the input features - >>> vr = runner.validate_all(term_ids) - >>> vr.is_ok() - False +>>> # Validate the input features +>>> vr = runner.validate_all(term_ids) +>>> vr.is_ok() +False - >>> for validation_result in vr.results: - ... print(validation_result) - ValidationResult(level=, category='obsolete_term_id_is_used', message='Using the obsolete HP:0001505 instead of HP:0001166 for Arachnodactyly') - ValidationResult(level=, category='annotation_propagation', message='Terms should not contain both present Focal clonic seizure [HP:0002266] and its present or excluded ancestor Seizure [HP:0001250]') +>>> for validation_result in vr.results: +... print(validation_result) +ValidationResult(level=, category='obsolete_term_id_is_used', message='Using the obsolete HP:0001505 instead of HP:0001166 for Arachnodactyly') +ValidationResult(level=, category='annotation_propagation', message='Terms should not contain both present Focal clonic seizure [HP:0002266] and its present or excluded ancestor Seizure [HP:0001250]') :class:`hpotk.validate.ValidationRunner` applies several validators and aggregates the issues into :class:`hpotk.validate.ValidationResults`. We can check if the input passed the validation and if not, we can go through diff --git a/pytest.ini b/pytest.ini index 8c11252..9a5462e 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,5 +1,8 @@ -# We want pytest to look for both test files prefixed with underscore `_` and "regular* test files. -# The unittest files, the test files located in the source code, are prepended with underscore -# to preclude their inclusion into API reference. 
[pytest] +; We want pytest to look for both test files prefixed with underscore `_` and "regular" test files. +; The unittest files, the test files located in the source code, are prepended with underscore +; to preclude their inclusion into API reference. python_files = _test*.py test*.py + +; Also test the documentation tests and the tutorial scripts. +addopts = --doctest-modules --doctest-glob docs/user-guide/*.rst diff --git a/src/hpotk/ontology/_api.py b/src/hpotk/ontology/_api.py index 56490fe..41c6e5d 100644 --- a/src/hpotk/ontology/_api.py +++ b/src/hpotk/ontology/_api.py @@ -7,47 +7,49 @@ class MinimalOntology(typing.Generic[ID, MINIMAL_TERM], GraphAware[ID], Versioned, metaclass=abc.ABCMeta): """ - `MinimalOntology` is a value object that mostly holds the ontology data. + `MinimalOntology` is a data structure for representing the ontology terms + and the ontology hierarchy. - The typical way to load the ontology is by parsing Obographs JSON file: + The typical way to load the ontology is by parsing Obographs JSON file + using :class:`hpotk.util.store.OntologyStore`, see :ref:`rstload-ontology` + section for more info. - .. doctest:: minimal-ontology-api + Here we will load a toy HPO shipped with the documentation: - >>> import hpotk - >>> hpo = hpotk.ontology.load.obographs.load_minimal_ontology('data/hp.toy.json') + >>> import os + >>> import hpotk + >>> fpath_hpo = os.path.join('docs', 'data', 'hp.toy.json') + >>> hpo = hpotk.load_minimal_ontology(fpath_hpo) - The ontology data comprises: + The ontology includes the following: - * ontology graph as :class:`hpotk.graph.OntologyGraph`, - * ontology concepts as :class:`hpotk.model.MinimalTerm`, and - * the version of the data. 
+ * ontology hierarchy as :class:`hpotk.graph.OntologyGraph` + * ontology terms as :class:`hpotk.model.MinimalTerm` + * the metadata, such as the ontology version - The ontology acts as a Python container of term IDs, we can check if a term is in the ontology as: + The ontology acts as a Python container of term IDs, + we can check if a term is in the ontology as: - .. doctest:: minimal-ontology-api - - >>> seizure_curie = 'HP:0001250' - >>> seizure_curie in hpo - True + >>> seizure_curie = 'HP:0001250' + >>> seizure_curie in hpo + True This works for term IDs too: - .. doctest:: minimal-ontology-api - >>> seizure_id = hpotk.TermId.from_curie(seizure_curie) - >>> seizure_id in hpo - True - The ontology has length - the number of *primary* terms: + >>> seizure_id = hpotk.TermId.from_curie(seizure_curie) + >>> seizure_id in hpo + True - .. doctest:: minimal-ontology-api + The ontology has length - the number of *primary* terms: - >>> len(hpo) - 393 + >>> len(hpo) + 393 .. note:: - The toy HPO has only 393 terms. Real-life HPO has much more terms... + The toy HPO has only 393 terms. Real-life HPO has many more terms. The terms of `MinimalOntology` are instances of :class:`hpotk.model.MinimalTerm`. """ @@ -73,11 +75,14 @@ def get_term(self, term_id: CURIE_OR_TERM_ID_OR_IDENTIFIED) -> typing.Optional[M """ Get the current term for a `term_id`. - .. doctest:: minimal-ontology-api + .. testsetup:: - >>> seizure = hpo.get_term('HP:0001250') - >>> seizure.name - 'Seizure' + >>> import os, hpotk + >>> hpo = hpotk.load_minimal_ontology(os.path.join('docs', 'data', 'hp.toy.json')) + + >>> seizure = hpo.get_term('HP:0001250') + >>> seizure.name + 'Seizure' :param term_id: a CURIE `str` (e.g. 'HP:1234567'), a :class:`hpotk.model.TermId` or an :class:`hpotk.model.Identified` entity that represents a *current* or an *obsolete* term. @@ -89,11 +94,14 @@ def get_term_name(self, term_id: CURIE_OR_TERM_ID_OR_IDENTIFIED) -> typing.Optio """ Get the name of the term with a `term_id`. 
- .. doctest:: minimal-ontology-api + .. testsetup:: + + >>> import os, hpotk + >>> hpo = hpotk.load_minimal_ontology(os.path.join('docs', 'data', 'hp.toy.json')) - >>> seizure_name = hpo.get_term_name('HP:0001250') - >>> seizure_name - 'Seizure' + >>> seizure_name = hpo.get_term_name('HP:0001250') + >>> seizure_name + 'Seizure' :param term_id: a CURIE `str` (e.g. 'HP:1234567'), a :class:`hpotk.model.TermId` or an :class:`hpotk.model.Identified` entity that represents a *current* or an *obsolete* term. @@ -102,18 +110,27 @@ def get_term_name(self, term_id: CURIE_OR_TERM_ID_OR_IDENTIFIED) -> typing.Optio term = self.get_term(term_id) return term.name if term else None - def __contains__(self, term_id: CURIE_OR_TERM_ID_OR_IDENTIFIED) -> bool: """ Test if the ontology contains a `term_id`. Use :func:`get_term` if you want to use the corresponding term apart from knowing that it is there. - .. doctest:: minimal-ontology-api + .. testsetup:: + + >>> import os, hpotk + >>> hpo = hpotk.load_minimal_ontology(os.path.join('docs', 'data', 'hp.toy.json')) + + >>> 'HP:0001250' in hpo # CURIE + True - >>> assert 'HP:0001250' in hpo - >>> assert hpotk.TermId.from_curie('HP:0001250') in hpo - >>> assert seizure in hpo + >>> term_id = hpotk.TermId.from_curie('HP:0001250') + >>> term_id in hpo + True + + >>> seizure = hpo.get_term('HP:0001250') + >>> seizure in hpo + True :param term_id: a CURIE `str` (e.g. HP:1234567'), a :class:`hpotk.model.TermId` or an :class:`hpotk.model.Identified` entity that represents a *current* or an *obsolete* term. @@ -126,10 +143,13 @@ def __len__(self): """ Get the number of the primary (non-obsolete) terms in the ontology. - .. doctest:: minimal-ontology-api + .. 
testsetup:: + + >>> import os, hpotk + >>> hpo = hpotk.load_minimal_ontology(os.path.join('docs', 'data', 'hp.toy.json')) - >>> len(hpo) - 393 + >>> len(hpo) + 393 :return: the number of primary terms """ From 0609a26b1fa533528a40b3b8e4353a7f7f0567c9 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Mon, 25 Mar 2024 09:44:53 -0400 Subject: [PATCH 05/13] Update docs regarding running tests. --- .github/workflows/python_ci.yml | 7 +------ docs/setup.rst | 15 +++------------ 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/.github/workflows/python_ci.yml b/.github/workflows/python_ci.yml index 0ecbd2b..af337d7 100644 --- a/.github/workflows/python_ci.yml +++ b/.github/workflows/python_ci.yml @@ -25,11 +25,6 @@ jobs: - name: Install hpo-toolkit run: | python3 -m pip install --editable .[test,docs] - name: Run tests + - name: Run all tests run: | pytest - - name: Run documentation tests - run: | - cd docs - sphinx-apidoc --separate --module-first -d 2 -H "API reference" -o apidocs ../src/hpotk - make doctest diff --git a/docs/setup.rst b/docs/setup.rst index 4eb74f9..ef965fa 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -33,23 +33,14 @@ Run tests The contributors may want to run the unit tests and the integration tests to ensure all features work as expected. -Before running tests, make sure you install HPO toolkit with `test` and `docs` dependencies:: +Before running tests, make sure you install HPO toolkit with `test` dependencies:: - python3 -m pip install .[test,docs] + python3 -m pip install .[test] -The unit tests and the integration tests can the be running by invoking the `pytest` runner:: +The unit tests, integration tests, doctests, and the tutorial scripts can then be run by invoking the `pytest` runner:: pytest -We go extra mile to ensure the documentation is always up-to-date, and, therefore, we also run the documentation tests. 
-The documentation tests are run by:: - - cd docs - sphinx-apidoc --separate --module-first -d 2 -H "API reference" -o apidocs ../src/hpotk - make doctest - -.. note:: - The library *must* be installed in the environment before running all tests. Otherwise, the test discovery will fail. Run benches From b954bdecd3557014dcf525051df2f61c74a2aafa Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Mon, 25 Mar 2024 09:49:20 -0400 Subject: [PATCH 06/13] Collect tests before running. --- .github/workflows/python_ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python_ci.yml b/.github/workflows/python_ci.yml index af337d7..785dd4a 100644 --- a/.github/workflows/python_ci.yml +++ b/.github/workflows/python_ci.yml @@ -27,4 +27,5 @@ jobs: python3 -m pip install --editable .[test,docs] - name: Run all tests run: | + pytest --co pytest From f9daff00fe63d814b29688530cb4c55e2901a663 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Mon, 25 Mar 2024 09:51:50 -0400 Subject: [PATCH 07/13] Specify test roots. --- .github/workflows/python_ci.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python_ci.yml b/.github/workflows/python_ci.yml index 785dd4a..c9f6aac 100644 --- a/.github/workflows/python_ci.yml +++ b/.github/workflows/python_ci.yml @@ -24,8 +24,7 @@ jobs: python-version: ${{ matrix.python }} - name: Install hpo-toolkit run: | - python3 -m pip install --editable .[test,docs] + python3 -m pip install --editable .[test] - name: Run all tests run: | - pytest --co - pytest + pytest src docs tests From 31512a77c40fd5e9ba8a84b2dafcd1a2b381968a Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Fri, 12 Apr 2024 15:54:59 -0400 Subject: [PATCH 08/13] Pull out `RemoteOntologyService`. 
--- src/hpotk/__init__.py | 3 +- src/hpotk/store/__init__.py | 24 +++++ src/hpotk/{util => }/store/_api.py | 65 ++++++++++++- src/hpotk/{util => }/store/_config.py | 14 ++- src/hpotk/store/_github.py | 80 ++++++++++++++++ src/hpotk/util/__init__.py | 2 +- src/hpotk/util/store/__init__.py | 11 --- src/hpotk/util/store/_github.py | 130 -------------------------- tests/test_store.py | 68 +++++++++++--- 9 files changed, 233 insertions(+), 164 deletions(-) create mode 100644 src/hpotk/store/__init__.py rename src/hpotk/{util => }/store/_api.py (59%) rename src/hpotk/{util => }/store/_config.py (69%) create mode 100644 src/hpotk/store/_github.py delete mode 100644 src/hpotk/util/store/__init__.py delete mode 100644 src/hpotk/util/store/_github.py diff --git a/src/hpotk/__init__.py b/src/hpotk/__init__.py index cfae680..6239ea3 100644 --- a/src/hpotk/__init__.py +++ b/src/hpotk/__init__.py @@ -10,6 +10,7 @@ from . import graph from . import model from . import ontology +from . import store from . import util from . import validate @@ -18,4 +19,4 @@ from .ontology import Ontology, MinimalOntology from .ontology.load.obographs import load_minimal_ontology, load_ontology -from .util.store import OntologyType, OntologyStore, configure_ontology_store +from .store import OntologyType, OntologyStore, configure_ontology_store diff --git a/src/hpotk/store/__init__.py b/src/hpotk/store/__init__.py new file mode 100644 index 0000000..b5a833e --- /dev/null +++ b/src/hpotk/store/__init__.py @@ -0,0 +1,24 @@ +""" +The `hpotk.store` package provides :class:`OntologyStore` - a class for local caching of ontology data. + +The ontology store should be configured using :func:`hpotk.configure_ontology_store` function: + +>>> import hpotk +>>> store = hpotk.configure_ontology_store() + +The store can then be used to fetch an ontology with a given release, e.g. 
`v2023-10-09`: + +>>> hpo = store.load_minimal_hpo(release='v2023-10-09') +>>> hpo.version +'2023-10-09' +""" + +from ._api import OntologyType, OntologyStore, RemoteOntologyService +from ._github import GitHubRemoteOntologyService +from ._config import configure_ontology_store + +__all__ = [ + 'configure_ontology_store', + 'OntologyType', 'OntologyStore', 'RemoteOntologyService', + 'GitHubRemoteOntologyService', +] diff --git a/src/hpotk/util/store/_api.py b/src/hpotk/store/_api.py similarity index 59% rename from src/hpotk/util/store/_api.py rename to src/hpotk/store/_api.py index c05b3ff..9c8c428 100644 --- a/src/hpotk/util/store/_api.py +++ b/src/hpotk/store/_api.py @@ -1,9 +1,13 @@ import abc import enum +import io import logging +import os import typing from hpotk.ontology import MinimalOntology, Ontology +from hpotk.ontology.load.obographs import load_minimal_ontology, load_ontology +from hpotk.util import validate_instance class OntologyType(enum.Enum): @@ -32,7 +36,29 @@ def identifier(self) -> str: return self._id_ -class OntologyStore(metaclass=abc.ABCMeta): +class RemoteOntologyService(metaclass=abc.ABCMeta): + """ + `RemoteOntologyService` knows how to open a :class:`typing.BinaryIO` + for reading content of an `ontology_type` of a particular `release`. + """ + + @abc.abstractmethod + def fetch_ontology( + self, + ontology_type: OntologyType, + release: typing.Optional[str] = None, + ) -> io.BufferedIOBase: + """ + Open a connection for reading bytes of the `ontology_type` from a remote resource. + + :param ontology_type: the desired ontology kind, e.g. :class:`OntologyType.HPO`. + :param release: a `str` with the desired ontology release or `None` if the latest release should be fetched. + :return: a binary IO for reading the ontology data. + """ + pass + + +class OntologyStore: """ `OntologyStore` stores versions of the supported ontologies. 
""" @@ -40,11 +66,13 @@ class OntologyStore(metaclass=abc.ABCMeta): def __init__( self, store_dir: str, + remote_ontology_service: RemoteOntologyService, ): self._logger = logging.getLogger(__name__) self._store_dir = store_dir + self._remote_ontology_service = validate_instance( + remote_ontology_service, RemoteOntologyService, 'remote_ontology_service') - @abc.abstractmethod def load_minimal_ontology( self, ontology_type: OntologyType, @@ -57,7 +85,11 @@ def load_minimal_ontology( :param release: a `str` with the ontology release tag or `None` if the latest ontology should be fetched. :return: a minimal ontology. """ - pass + return self._impl_load_ontology( + load_minimal_ontology, + ontology_type, + release, + ) @abc.abstractmethod def load_ontology( @@ -72,7 +104,11 @@ def load_ontology( :param release: a `str` with the ontology release tag or `None` if the latest ontology should be fetched. :return: an ontology. """ - pass + return self._impl_load_ontology( + load_ontology, + ontology_type, + release, + ) @property def store_dir(self) -> str: @@ -107,3 +143,24 @@ def load_hpo( :return: a :class:`hpotk.Ontology` with the HPO data. """ return self.load_ontology(OntologyType.HPO, release=release) + + def _impl_load_ontology( + self, + loader_func, + ontology_type: OntologyType, + release: typing.Optional[str] = None, + ): + fdir_ontology = os.path.join(self.store_dir, ontology_type.identifier) + fpath_ontology = os.path.join(fdir_ontology, f'{ontology_type.identifier.lower()}.{release}.json') + + # Download ontology if missing. 
+ if not os.path.isfile(fpath_ontology): + os.makedirs(fdir_ontology, exist_ok=True) + with (self._remote_ontology_service.fetch_ontology(ontology_type, release) as response, + open(fpath_ontology, 'wb') as fh_ontology): + fh_ontology.write(response.read()) + + self._logger.info('Stored the ontology at %s', fpath_ontology) + + # Load the ontology + return loader_func(fpath_ontology) diff --git a/src/hpotk/util/store/_config.py b/src/hpotk/store/_config.py similarity index 69% rename from src/hpotk/util/store/_config.py rename to src/hpotk/store/_config.py index 2025d00..7324f4f 100644 --- a/src/hpotk/util/store/_config.py +++ b/src/hpotk/store/_config.py @@ -4,18 +4,21 @@ import typing from pathlib import Path -from ._api import OntologyStore -from ._github import GitHubOntologyStore +from ._api import OntologyStore, RemoteOntologyService +from ._github import GitHubRemoteOntologyService def configure_ontology_store( store_dir: typing.Optional[str] = None, + remote_ontology_service: RemoteOntologyService = GitHubRemoteOntologyService(), ) -> OntologyStore: """ Configure and create the default ontology store. - :param: a `str` pointing to an existing directory for caching the ontology files + :param store_dir: a `str` pointing to an existing directory for caching the ontology files or `None` if the platform-specific default folder should be used. + :param remote_ontology_service: a :class:`RemoteOntologyService` responsible for fetching + the ontology data from a remote location if we do not have the data locally. :returns: an :class:`OntologyStore`. :raises: `ValueError` if something goes wrong. 
""" @@ -24,7 +27,10 @@ def configure_ontology_store( else: if not os.path.isdir(store_dir): raise ValueError(f'`store_dir` must point to an existing directory') - return GitHubOntologyStore(store_dir=store_dir) + return OntologyStore( + store_dir=store_dir, + remote_ontology_service=remote_ontology_service, + ) def get_default_ontology_store_dir() -> str: diff --git a/src/hpotk/store/_github.py b/src/hpotk/store/_github.py new file mode 100644 index 0000000..c03211e --- /dev/null +++ b/src/hpotk/store/_github.py @@ -0,0 +1,80 @@ +import io +import json +import logging +import typing +from urllib.request import urlopen + +from ._api import OntologyType, RemoteOntologyService + + +class GitHubRemoteOntologyService(RemoteOntologyService): + """ + `GitHubRemoteOntologyService` knows how to fetch ontology data from GitHub. + + The Obographs JSON files are fetched and only HPO is supported as of now. + """ + + ONTOLOGY_CREDENTIALS = { + OntologyType.HPO: { + 'owner': 'obophenotype', + 'repo': 'human-phenotype-ontology', + } + } + + def __init__( + self, + timeout: int = 10, + ): + self._logger = logging.getLogger(__name__) + self._timeout = timeout + self._tag_api_url = 'https://api.github.com/repos/{owner}/{repo}/tags' + self._release_url = 'https://github.com/{owner}/{repo}/releases/download/{release}/{ontology_id}.json' + + def fetch_ontology( + self, + ontology_type: OntologyType, + release: typing.Optional[str] = None, + ) -> io.BufferedIOBase: + if ontology_type not in self.ONTOLOGY_CREDENTIALS: + raise ValueError(f'Ontology {ontology_type} not among the known ontology credentials') + credentials = self.ONTOLOGY_CREDENTIALS[ontology_type] + + # Figure out the desired release + if release is None: + release = self._fetch_latest_tag_from_github(credentials) + self._logger.debug('Using %s as the ontology release', release) + + owner = credentials['owner'] + repo = credentials['repo'] + url = self._release_url.format( + owner=owner, + repo=repo, + release=release, + 
ontology_id=ontology_type.identifier.lower(), + ) + self._logger.info('Downloading ontology from %s', url) + + return urlopen(url, timeout=self._timeout) + + def _fetch_latest_tag_from_github(self, credentials: typing.Mapping[str, str]): + self._logger.debug('Release unset, getting the latest') + tag_names = self._get_tag_names( + owner=credentials['owner'], + repo=credentials['repo'], + ) + # We assume lexicographic sorting of the tags + return max(tag_names) + + def _get_tag_names(self, owner: str, repo: str) -> typing.Iterable[str]: + tag_url = self._tag_api_url.format(owner=owner, repo=repo) + self._logger.debug('Pulling tag from %s', tag_url) + + with urlopen(tag_url, timeout=self._timeout) as r: + tags = json.load(r) + + if len(tags) == 0: + raise ValueError('No tags could be fetched from GitHub tag API') + else: + self._logger.debug('Fetched %d tags', len(tags)) + + return (tag['name'] for tag in tags) diff --git a/src/hpotk/util/__init__.py b/src/hpotk/util/__init__.py index 80a4503..7e9352e 100644 --- a/src/hpotk/util/__init__.py +++ b/src/hpotk/util/__init__.py @@ -1,4 +1,4 @@ -from . import sort # TODO: probably not necessary +from . 
import sort from ._io import looks_like_url, looks_gzipped from ._io import open_text_io_handle, open_text_io_handle_for_reading, open_text_io_handle_for_writing diff --git a/src/hpotk/util/store/__init__.py b/src/hpotk/util/store/__init__.py deleted file mode 100644 index bdc7eaf..0000000 --- a/src/hpotk/util/store/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -""" -The `hpotk.util.store` package provides -""" - -from ._api import OntologyType, OntologyStore -from ._config import configure_ontology_store - -__all__ = [ - 'OntologyType', 'OntologyStore', - 'configure_ontology_store', -] diff --git a/src/hpotk/util/store/_github.py b/src/hpotk/util/store/_github.py deleted file mode 100644 index 303c75b..0000000 --- a/src/hpotk/util/store/_github.py +++ /dev/null @@ -1,130 +0,0 @@ -import json -import os -import typing -import urllib -from urllib.request import urlopen, HTTPError - -from hpotk.ontology import Ontology, MinimalOntology -from hpotk.ontology.load.obographs import load_minimal_ontology, load_ontology -from ._api import OntologyType, OntologyStore - - -class GitHubOntologyStore(OntologyStore): - """ - `GitHubOntologyStore` fetches an Obographs ontology JSON from GitHub. 
- """ - - ONTOLOGY_CREDENTIALS = { - OntologyType.HPO: { - 'owner': 'obophenotype', - 'repo': 'human-phenotype-ontology', - } - } - - def __init__( - self, - store_dir: str, - timeout: int = 10, - ): - super().__init__(store_dir=store_dir) - self._timeout = timeout - self._tag_api_url = 'https://api.github.com/repos/{owner}/{repo}/tags' - self._release_url = 'https://github.com/{owner}/{repo}/releases/download/{release}/{ontology_id}.json' - - def load_minimal_ontology(self, ontology_type: OntologyType, - release: typing.Optional[str] = None) -> MinimalOntology: - return self._impl_load_ontology( - load_minimal_ontology, - ontology_type, - release, - ) - - def load_ontology( - self, - ontology_type: OntologyType, - release: typing.Optional[str] = None, - ) -> Ontology: - return self._impl_load_ontology( - load_ontology, - ontology_type, - release, - ) - - def _impl_load_ontology( - self, - loader_func, - ontology_type: OntologyType, - release: typing.Optional[str] = None, - ): - if ontology_type not in self.ONTOLOGY_CREDENTIALS: - raise ValueError(f'Ontology {ontology_type} not among the known ontology credentials') - credentials = self.ONTOLOGY_CREDENTIALS[ontology_type] - - # Figure out the desired release - if release is None: - release = self._fetch_latest_tag_from_github(credentials) - self._logger.debug('Using %s as the ontology release', release) - - # Check if we have the release in the local storage - # and download the JSON file if not. 
- fpath_ontology = self._download_ontology_if_missing(credentials, ontology_type.identifier, release) - - # Load the ontology - return loader_func(fpath_ontology) - - def _download_ontology_if_missing( - self, - credentials: typing.Mapping[str, str], - ontology_id: str, - release: str, - ) -> str: - fdir_ontology = os.path.join(self.store_dir, ontology_id) - fpath_ontology = os.path.join(fdir_ontology, f'{ontology_id.lower()}.{release}.json') - if not os.path.isfile(fpath_ontology): - os.makedirs(fdir_ontology, exist_ok=True) - owner = credentials['owner'] - repo = credentials['repo'] - url = self._release_url.format( - owner=owner, - repo=repo, - release=release, - ontology_id=ontology_id.lower(), - ) - self._logger.info('Downloading ontology from %s', url) - self._logger.info('Storing the ontology at %s', fpath_ontology) - try: - with urlopen(url, timeout=self._timeout) as response, open(fpath_ontology, 'wb') as fh_ontology: - fh_ontology.write(response.read()) - except HTTPError as he: - if he.code == 404: - # Most likely a non-existing release. - raise ValueError(f'Could not find {release} on GitHub') - else: - # Another error. 
- raise he - self._logger.info('Download complete') - - return fpath_ontology - - def _fetch_latest_tag_from_github(self, credentials: typing.Mapping[str, str]): - self._logger.debug('Release unset, getting the latest') - tag_names = self._get_tag_names( - owner=credentials['owner'], - repo=credentials['repo'], - ) - # We assume lexicographic sorting of the tags - return max(tag_names) - - def _get_tag_names(self, owner: str, repo: str) -> typing.Iterable[str]: - tag_url = self._tag_api_url.format(owner=owner, repo=repo) - self._logger.debug('Pulling tag from %s', tag_url) - - with urllib.request.urlopen(tag_url, timeout=self._timeout) as r: - tags = json.load(r) - - if len(tags) == 0: - raise ValueError('No tags could be fetched from GitHub tag API') - else: - self._logger.debug('Fetched %d tags', len(tags)) - - return [tag['name'] for tag in tags] diff --git a/tests/test_store.py b/tests/test_store.py index 787515c..3d7b816 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -1,4 +1,6 @@ +import io import os +import typing from pathlib import Path import pytest @@ -6,35 +8,75 @@ import hpotk -@pytest.mark.skip('Needs an internet connection') +class MockRemoteOntologyService(hpotk.store.RemoteOntologyService): + + def __init__( + self, + release: str, + payload: bytes, + ): + self._release = release + self._payload = payload + + def fetch_ontology( + self, + ontology_type: hpotk.OntologyType, + release: typing.Optional[str] = None, + ) -> io.BufferedIOBase: + if release == self._release: + return io.BytesIO(self._payload) + else: + raise ValueError(f'Unsupported release {release}') + + class TestGitHubOntologyStore: - def test_load_minimal_hpo( + @pytest.fixture(scope='class') + def remote_ontology_service( + self, + fpath_toy_hpo: str, + ) -> hpotk.store.RemoteOntologyService: + with open(fpath_toy_hpo, 'rb') as fh: + return MockRemoteOntologyService( + release='v2022-10-05', + payload=fh.read(), + ) + + @pytest.fixture + def ontology_store( self, 
tmp_path: Path, - ): - assert len(os.listdir(tmp_path)) == 0 # We start with the clean slate. + remote_ontology_service: hpotk.store.RemoteOntologyService, + ) -> hpotk.OntologyStore: + return hpotk.configure_ontology_store( + store_dir=str(tmp_path), + remote_ontology_service=remote_ontology_service, + ) - store = hpotk.configure_ontology_store(store_dir=str(tmp_path)) - assert len(os.listdir(tmp_path)) == 0 # Creating the store does nothing. + def test_load_minimal_hpo( + self, + ontology_store: hpotk.OntologyStore, + ): + # We start with a clean slate. + assert len(os.listdir(ontology_store.store_dir)) == 0 - release = 'v2024-03-06' - hpo = store.load_minimal_hpo(release=release) + release = 'v2022-10-05' + hpo = ontology_store.load_minimal_hpo(release=release) assert isinstance(hpo, hpotk.MinimalOntology) assert hpo.version == release[1:] - fpath_expected = os.path.join(tmp_path, 'HP', f'hp.{release}.json') + fpath_expected = os.path.join(ontology_store.store_dir, 'HP', f'hp.{release}.json') assert os.path.isfile(fpath_expected) def test_load_minimal_hpo__invalid_release( self, - tmp_path: Path, + ontology_store: hpotk.OntologyStore, ): - store = hpotk.configure_ontology_store(store_dir=str(tmp_path)) release = 'v3400-12-31' with pytest.raises(ValueError) as e: - store.load_minimal_hpo(release=release) + ontology_store.load_minimal_hpo(release=release) - assert e.value.args[0] == f'Could not find {release} on GitHub' + # We test that we get whatever exception was raised by the `RemoteOntologyService`. + assert e.value.args[0] == f'Unsupported release {release}' From 13f276e93b8783f7ed63d8659b2b25d3317d9636 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Fri, 12 Apr 2024 16:30:04 -0400 Subject: [PATCH 09/13] Create `ssl` context before downloading a file over `https`. 
--- pyproject.toml | 3 ++- src/hpotk/store/_github.py | 18 +++++++++++++++--- src/hpotk/util/_io.py | 18 ++++++++++++++---- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a3c4e23..fa48260 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,8 @@ classifiers = [ dynamic = ["version"] dependencies = [ - "numpy >= 1.23" + "numpy >= 1.23", + "certifi", # The latest version is the best. ] [project.optional-dependencies] diff --git a/src/hpotk/store/_github.py b/src/hpotk/store/_github.py index c03211e..d175c0d 100644 --- a/src/hpotk/store/_github.py +++ b/src/hpotk/store/_github.py @@ -1,9 +1,12 @@ import io import json import logging +import ssl import typing from urllib.request import urlopen +import certifi + from ._api import OntologyType, RemoteOntologyService @@ -27,6 +30,7 @@ def __init__( ): self._logger = logging.getLogger(__name__) self._timeout = timeout + self._ctx = ssl.create_default_context(cafile=certifi.where()) self._tag_api_url = 'https://api.github.com/repos/{owner}/{repo}/tags' self._release_url = 'https://github.com/{owner}/{repo}/releases/download/{release}/{ontology_id}.json' @@ -54,7 +58,11 @@ def fetch_ontology( ) self._logger.info('Downloading ontology from %s', url) - return urlopen(url, timeout=self._timeout) + return urlopen( + url, + timeout=self._timeout, + context=self._ctx, + ) def _fetch_latest_tag_from_github(self, credentials: typing.Mapping[str, str]): self._logger.debug('Release unset, getting the latest') @@ -69,8 +77,12 @@ def _get_tag_names(self, owner: str, repo: str) -> typing.Iterable[str]: tag_url = self._tag_api_url.format(owner=owner, repo=repo) self._logger.debug('Pulling tag from %s', tag_url) - with urlopen(tag_url, timeout=self._timeout) as r: - tags = json.load(r) + with urlopen( + tag_url, + timeout=self._timeout, + context=self._ctx, + ) as fh: + tags = json.load(fh) if len(tags) == 0: raise ValueError('No tags could be fetched from GitHub tag API') diff 
--git a/src/hpotk/util/_io.py b/src/hpotk/util/_io.py index d186b79..303b9a5 100644 --- a/src/hpotk/util/_io.py +++ b/src/hpotk/util/_io.py @@ -1,11 +1,14 @@ import gzip import io import logging +import ssl import sys import typing import warnings from urllib.request import urlopen +import certifi + def looks_like_url(file: str) -> bool: """ @@ -36,9 +39,11 @@ def _parse_encoding(encoding, logger) -> str: return encoding -def open_text_io_handle_for_reading(fh: typing.Union[typing.IO, str], - timeout: int = 30, - encoding: str = None) -> typing.TextIO: +def open_text_io_handle_for_reading( + fh: typing.Union[typing.IO, str], + timeout: int = 30, + encoding: str = None, +) -> typing.TextIO: """ Open a `io.TextIO` file handle based on `fh`. @@ -57,11 +62,16 @@ def open_text_io_handle_for_reading(fh: typing.Union[typing.IO, str], if isinstance(fh, str): # Can be a path to local file or URL if looks_like_url(fh): + ctx = ssl.create_default_context(cafile=certifi.where()) logger.debug(f'Looks like a URL: {fh}') if not isinstance(timeout, int) or timeout <= 0: raise ValueError(f'If {fh} looks like URL then timeout {timeout} must be a positive `int`') logger.debug(f'Downloading with timeout={timeout}s') - handle = urlopen(fh, timeout=timeout) + handle = urlopen( + fh, + timeout=timeout, + context=ctx, + ) else: logger.debug(f'Looks like a local file: {fh}') handle = open(fh, 'rb') From 9a0a7f21d2802c0e8ba8caaeea22ee9adfbb51c0 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Fri, 12 Apr 2024 16:41:02 -0400 Subject: [PATCH 10/13] Reformat. --- src/hpotk/store/_api.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/hpotk/store/_api.py b/src/hpotk/store/_api.py index 9c8c428..8082993 100644 --- a/src/hpotk/store/_api.py +++ b/src/hpotk/store/_api.py @@ -156,8 +156,7 @@ def _impl_load_ontology( # Download ontology if missing. 
if not os.path.isfile(fpath_ontology): os.makedirs(fdir_ontology, exist_ok=True) - with (self._remote_ontology_service.fetch_ontology(ontology_type, release) as response, - open(fpath_ontology, 'wb') as fh_ontology): + with self._remote_ontology_service.fetch_ontology(ontology_type, release) as response, open(fpath_ontology, 'wb') as fh_ontology: fh_ontology.write(response.read()) self._logger.info('Stored the ontology at %s', fpath_ontology) From 737f975b0e257a279ae021457a897a6d40e2c86d Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 8 May 2024 21:09:48 -0400 Subject: [PATCH 11/13] Update urls after the move. --- README.md | 8 ++++---- docs/index.rst | 2 +- docs/setup.rst | 4 ++-- pyproject.toml | 5 ++++- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 3621bb9..340ee06 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/hpo-toolkit) ![PyPi downloads](https://img.shields.io/pypi/dm/hpo-toolkit.svg?label=Pypi%20downloads) -![Build status](https://img.shields.io/github/actions/workflow/status/TheJacksonLaboratory/hpo-toolkit/python_ci.yml) -[![GitHub release](https://img.shields.io/github/release/TheJacksonLaboratory/hpo-toolkit.svg)](https://github.com/TheJacksonLaboratory/hpo-toolkit/releases) +![Build status](https://img.shields.io/github/actions/workflow/status/ielis/hpo-toolkit/python_ci.yml) +[![GitHub release](https://img.shields.io/github/release/ielis/hpo-toolkit.svg)](https://github.com/ielis/hpo-toolkit/releases) A toolkit for working with Human Phenotype Ontology (HPO) and HPO disease annotations in Python. @@ -39,5 +39,5 @@ You got yourself phenotype annotations of 12,468 rare diseases. 
Find more info in our detailed documentation: -- [Stable documentation](https://thejacksonlaboratory.github.io/hpo-toolkit/stable) (last release on `main` branch) -- [Latest documentation](https://thejacksonlaboratory.github.io/hpo-toolkit/latest) (bleeding edge, latest commit on `development` branch) +- [Stable documentation](https://ielis.github.io/hpo-toolkit/stable) (last release on `main` branch) +- [Latest documentation](https://ielis.github.io/hpo-toolkit/latest) (bleeding edge, latest commit on `development` branch) diff --git a/docs/index.rst b/docs/index.rst index bd907f9..35757a2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -28,5 +28,5 @@ Feedback -------- The best place to leave feedback, ask questions, and report bugs is the -`HPO Toolkit's Issue Tracker `_. +`HPO Toolkit's Issue Tracker `_. diff --git a/docs/setup.rst b/docs/setup.rst index ef965fa..775d8ca 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -20,7 +20,7 @@ The bleeding edge code To access the bleeding edge features, the development version can be installed by:: - git clone https://github.com/TheJacksonLaboratory/hpo-toolkit.git + git clone https://github.com/ielis/hpo-toolkit.git cd hpo-toolkit git checkout development && git pull python3 -m pip install . @@ -49,7 +49,7 @@ Run benches Bench suites provide an idea about the performance of the library. 
Running a bench requires checking out the GitHub repository and installing HPO toolkit with `bench` dependencies:: - git clone https://github.com/TheJacksonLaboratory/hpo-toolkit.git + git clone https://github.com/ielis/hpo-toolkit.git cd hpo-toolkit python3 -m pip install .[bench] diff --git a/pyproject.toml b/pyproject.toml index fa48260..5519b9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,10 @@ bench = [ ] [project.urls] -"Repository" = "https://github.com/TheJacksonLaboratory/hpo-toolkit" +homepage = "https://github.com/ielis/hpo-toolkit" +repository = "https://github.com/ielis/hpo-toolkit.git" +documentation = "https://ielis.github.io/hpo-toolkit/stable" +bugtracker = "https://github.com/ielis/hpo-toolkit/issues" [tool.setuptools] package-dir = { "" = "src" } From a0709fe221781f65adfba8f9a2dcefeb3c262c12 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Mon, 20 May 2024 19:02:24 +0200 Subject: [PATCH 12/13] Create ontology file using the latest tag if the release is not provided. 
--- src/hpotk/store/__init__.py | 8 +-- src/hpotk/store/_api.py | 37 +++++++++++- src/hpotk/store/_config.py | 7 ++- src/hpotk/store/_github.py | 111 ++++++++++++++++++++++-------------- tests/conftest.py | 26 +++++++++ tests/test_store.py | 27 +++++++++ 6 files changed, 165 insertions(+), 51 deletions(-) diff --git a/src/hpotk/store/__init__.py b/src/hpotk/store/__init__.py index b5a833e..0d2398a 100644 --- a/src/hpotk/store/__init__.py +++ b/src/hpotk/store/__init__.py @@ -13,12 +13,12 @@ '2023-10-09' """ -from ._api import OntologyType, OntologyStore, RemoteOntologyService -from ._github import GitHubRemoteOntologyService +from ._api import OntologyType, OntologyStore, RemoteOntologyService, OntologyReleaseService +from ._github import GitHubRemoteOntologyService, GitHubOntologyReleaseService from ._config import configure_ontology_store __all__ = [ 'configure_ontology_store', - 'OntologyType', 'OntologyStore', 'RemoteOntologyService', - 'GitHubRemoteOntologyService', + 'OntologyType', 'OntologyStore', 'RemoteOntologyService', 'OntologyReleaseService', + 'GitHubRemoteOntologyService', 'GitHubOntologyReleaseService', ] diff --git a/src/hpotk/store/_api.py b/src/hpotk/store/_api.py index 8082993..d8928a6 100644 --- a/src/hpotk/store/_api.py +++ b/src/hpotk/store/_api.py @@ -46,18 +46,37 @@ class RemoteOntologyService(metaclass=abc.ABCMeta): def fetch_ontology( self, ontology_type: OntologyType, - release: typing.Optional[str] = None, + release: str, ) -> io.BufferedIOBase: """ Open a connection for reading bytes of the `ontology_type` from a remote resource. :param ontology_type: the desired ontology kind, e.g. :class:`OntologyType.HPO`. - :param release: a `str` with the desired ontology release or `None` if the latest release should be fetched. + :param release: a `str` with the desired ontology release. :return: a binary IO for reading the ontology data. 
""" pass +class OntologyReleaseService(metaclass=abc.ABCMeta): + """ + `OntologyReleaseService` knows how to fetch ontology release tags, such as `v2023-10-09` for HPO. + """ + + @abc.abstractmethod + def fetch_tags( + self, + ontology_type: OntologyType, + ) -> typing.Iterable[str]: + """ + Fetch sequence of tags for an ontology. + + :param ontology_type: the target ontology type. + :return: + """ + pass + + class OntologyStore: """ `OntologyStore` stores versions of the supported ontologies. @@ -66,12 +85,17 @@ class OntologyStore: def __init__( self, store_dir: str, + ontology_release_service: OntologyReleaseService, remote_ontology_service: RemoteOntologyService, ): self._logger = logging.getLogger(__name__) self._store_dir = store_dir + self._ontology_release_service = validate_instance( + ontology_release_service, OntologyReleaseService, 'ontology_release_service', + ) self._remote_ontology_service = validate_instance( - remote_ontology_service, RemoteOntologyService, 'remote_ontology_service') + remote_ontology_service, RemoteOntologyService, 'remote_ontology_service', + ) def load_minimal_ontology( self, @@ -151,6 +175,13 @@ def _impl_load_ontology( release: typing.Optional[str] = None, ): fdir_ontology = os.path.join(self.store_dir, ontology_type.identifier) + if release is None: + # Fetch the latest release tag, assuming the lexicographic tag sort order. + latest_tag = max(self._ontology_release_service.fetch_tags(ontology_type), default=None) + if latest_tag is None: + raise ValueError(f'Unable to retrieve the latest tag for {ontology_type}') + release = latest_tag + fpath_ontology = os.path.join(fdir_ontology, f'{ontology_type.identifier.lower()}.{release}.json') # Download ontology if missing. 
diff --git a/src/hpotk/store/_config.py b/src/hpotk/store/_config.py index 7324f4f..104a8b4 100644 --- a/src/hpotk/store/_config.py +++ b/src/hpotk/store/_config.py @@ -4,12 +4,13 @@ import typing from pathlib import Path -from ._api import OntologyStore, RemoteOntologyService -from ._github import GitHubRemoteOntologyService +from ._api import OntologyStore, RemoteOntologyService, OntologyReleaseService +from ._github import GitHubRemoteOntologyService, GitHubOntologyReleaseService def configure_ontology_store( store_dir: typing.Optional[str] = None, + ontology_release_service: OntologyReleaseService = GitHubOntologyReleaseService(), remote_ontology_service: RemoteOntologyService = GitHubRemoteOntologyService(), ) -> OntologyStore: """ @@ -17,6 +18,7 @@ def configure_ontology_store( :param store_dir: a `str` pointing to an existing directory for caching the ontology files or `None` if the platform-specific default folder should be used. + :param ontology_release_service: an :class:`OntologyReleaseService` for fetching the ontology releases. :param remote_ontology_service: a :class:`RemoteOntologyService` responsible for fetching the ontology data from a remote location if we do not have the data locally. :returns: an :class:`OntologyStore`. 
@@ -29,6 +31,7 @@ def configure_ontology_store( raise ValueError(f'`store_dir` must point to an existing directory') return OntologyStore( store_dir=store_dir, + ontology_release_service=ontology_release_service, remote_ontology_service=remote_ontology_service, ) diff --git a/src/hpotk/store/_github.py b/src/hpotk/store/_github.py index d175c0d..486c2b1 100644 --- a/src/hpotk/store/_github.py +++ b/src/hpotk/store/_github.py @@ -7,7 +7,70 @@ import certifi -from ._api import OntologyType, RemoteOntologyService +from ._api import OntologyType, OntologyReleaseService, RemoteOntologyService + + +ONTOLOGY_CREDENTIALS = { + OntologyType.HPO: { + 'owner': 'obophenotype', + 'repo': 'human-phenotype-ontology', + } + } +""" +The default ontology credentials that only include HPO at the time. +""" + + +class GitHubOntologyReleaseService(OntologyReleaseService): + """ + `GitHubOntologyReleaseService` can fetch the ontology tags from GitHub. + """ + + def __init__( + self, + timeout: int = 10, + ontology_credentials: typing.Mapping[OntologyType, typing.Mapping[str, str]] = ONTOLOGY_CREDENTIALS, + ): + self._logger = logging.getLogger(__name__) + self._timeout = timeout + self._tag_api_url = 'https://api.github.com/repos/{owner}/{repo}/tags' + self._ctx = ssl.create_default_context(cafile=certifi.where()) + self._ontology_credentials = ontology_credentials + + def fetch_tags(self, ontology_type: OntologyType) -> typing.Iterable[str]: + if ontology_type not in self._ontology_credentials: + raise ValueError( + f'Ontology {ontology_type} not among ' + f'the known ontology credentials {set(self._ontology_credentials.keys())}' + ) + credentials = self._ontology_credentials[ontology_type] + + return self._get_tag_names( + owner=credentials['owner'], + repo=credentials['repo'], + ) + + def _get_tag_names( + self, + owner: str, + repo: str, + ) -> typing.Iterable[str]: + tag_url = self._tag_api_url.format(owner=owner, repo=repo) + self._logger.debug('Pulling tag from %s', tag_url) + + 
with urlopen( + tag_url, + timeout=self._timeout, + context=self._ctx, + ) as fh: + tags = json.load(fh) + + if len(tags) == 0: + raise ValueError('No tags could be fetched from GitHub tag API') + else: + self._logger.debug('Fetched %d tags', len(tags)) + + return (tag['name'] for tag in tags) class GitHubRemoteOntologyService(RemoteOntologyService): @@ -17,35 +80,26 @@ class GitHubRemoteOntologyService(RemoteOntologyService): The Obographs JSON files are fetched and only HPO is supported as of now. """ - ONTOLOGY_CREDENTIALS = { - OntologyType.HPO: { - 'owner': 'obophenotype', - 'repo': 'human-phenotype-ontology', - } - } - def __init__( self, timeout: int = 10, + ontology_credentials: typing.Mapping[OntologyType, typing.Mapping[str, str]] = ONTOLOGY_CREDENTIALS, ): self._logger = logging.getLogger(__name__) self._timeout = timeout self._ctx = ssl.create_default_context(cafile=certifi.where()) - self._tag_api_url = 'https://api.github.com/repos/{owner}/{repo}/tags' self._release_url = 'https://github.com/{owner}/{repo}/releases/download/{release}/{ontology_id}.json' + self._ontology_credentials = ontology_credentials def fetch_ontology( self, ontology_type: OntologyType, - release: typing.Optional[str] = None, + release: str, ) -> io.BufferedIOBase: - if ontology_type not in self.ONTOLOGY_CREDENTIALS: + if ontology_type not in self._ontology_credentials: raise ValueError(f'Ontology {ontology_type} not among the known ontology credentials') - credentials = self.ONTOLOGY_CREDENTIALS[ontology_type] + credentials = self._ontology_credentials[ontology_type] - # Figure out the desired release - if release is None: - release = self._fetch_latest_tag_from_github(credentials) self._logger.debug('Using %s as the ontology release', release) owner = credentials['owner'] @@ -63,30 +117,3 @@ def fetch_ontology( timeout=self._timeout, context=self._ctx, ) - - def _fetch_latest_tag_from_github(self, credentials: typing.Mapping[str, str]): - self._logger.debug('Release unset, 
getting the latest') - tag_names = self._get_tag_names( - owner=credentials['owner'], - repo=credentials['repo'], - ) - # We assume lexicographic sorting of the tags - return max(tag_names) - - def _get_tag_names(self, owner: str, repo: str) -> typing.Iterable[str]: - tag_url = self._tag_api_url.format(owner=owner, repo=repo) - self._logger.debug('Pulling tag from %s', tag_url) - - with urlopen( - tag_url, - timeout=self._timeout, - context=self._ctx, - ) as fh: - tags = json.load(fh) - - if len(tags) == 0: - raise ValueError('No tags could be fetched from GitHub tag API') - else: - self._logger.debug('Fetched %d tags', len(tags)) - - return (tag['name'] for tag in tags) diff --git a/tests/conftest.py b/tests/conftest.py index 70909f8..3481355 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,9 +1,35 @@ import os import pytest + import hpotk +# ####################################### Pytest options ############################################################# # + +def pytest_addoption(parser): + parser.addoption( + "--runonline", action="store_true", default=False, help="run online tests" + ) + + +def pytest_configure(config): + config.addinivalue_line("markers", "online: mark test that require internet access to run") + + +def pytest_collection_modifyitems(config, items): + if config.getoption("--runonline"): + # --runonline given in cli: do not skip online tests + return + skip_online = pytest.mark.skip(reason="need --runonline option to run") + for item in items: + if "online" in item.keywords: + item.add_marker(skip_online) + + +# ####################################### Fixtures ################################################################### # + + @pytest.fixture(scope='session') def fpath_data() -> str: parent = os.path.dirname(__file__) diff --git a/tests/test_store.py b/tests/test_store.py index 3d7b816..3e77138 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -80,3 +80,30 @@ def test_load_minimal_hpo__invalid_release( # We test 
that we get whatever exception was raised by the `RemoteOntologyService`. assert e.value.args[0] == f'Unsupported release {release}' + + +@pytest.mark.online +class TestGitHubOntologyReleaseService: + + @pytest.fixture + def ontology_release_service(self) -> hpotk.store.OntologyReleaseService: + return hpotk.store.GitHubOntologyReleaseService() + + def test_ontology_release_service( + self, + ontology_release_service: hpotk.store.OntologyReleaseService, + ): + tag_iter = ontology_release_service.fetch_tags(hpotk.store.OntologyType.HPO) + + assert tag_iter is not None + + tags = set(tag_iter) + + expected = { # As of May 20th, 2024 + 'v2020-08-11', 'v2020-10-12', 'v2020-12-07', 'v2021-02-08', 'v2021-04-13', 'v2021-06-08', 'v2021-06-13', + 'v2021-08-02', 'v2021-10-10', 'v2022-01-27', 'v2022-02-14', 'v2022-04-14', 'v2022-06-11', 'v2022-10-05', + 'v2022-12-15', 'v2023-01-27', 'v2023-04-05', 'v2023-06-06', 'v2023-06-17', 'v2023-07-21', 'v2023-09-01', + 'v2023-10-09', 'v2024-01-11', 'v2024-01-16', 'v2024-02-08', 'v2024-03-06', 'v2024-04-03', 'v2024-04-04', + 'v2024-04-19', 'v2024-04-26', + } + assert all(tag in tags for tag in expected) From e53ca091d5644db4c6dbd7c0de5008bfc61f8518 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Mon, 20 May 2024 19:12:14 +0200 Subject: [PATCH 13/13] Bump release to `v0.5.1`. --- docs/conf.py | 2 +- recipe/meta.yaml | 2 +- src/hpotk/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index be9cfdc..092ba1c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -3,7 +3,7 @@ hpotk_src = os.path.abspath(os.path.join('..', 'src')) sys.path.insert(0, hpotk_src) -# The import order is crucial to prevent having to install the library before generating documetation. +# The import order is crucial to prevent having to install the library before generating documentation. 
import hpotk diff --git a/recipe/meta.yaml b/recipe/meta.yaml index 353d12b..af1dbf1 100644 --- a/recipe/meta.yaml +++ b/recipe/meta.yaml @@ -1,5 +1,5 @@ {% set name = "hpo-toolkit" %} -{% set version = "0.5.1dev0" %} +{% set version = "0.5.1" %} package: name: {{ name|lower }} diff --git a/src/hpotk/__init__.py b/src/hpotk/__init__.py index 6239ea3..620b6ee 100644 --- a/src/hpotk/__init__.py +++ b/src/hpotk/__init__.py @@ -2,7 +2,7 @@ HPO toolkit is a library for working with Human Phenotype Ontology and the HPO annotation data. """ -__version__ = "0.5.1dev0" +__version__ = '0.5.1' from . import algorithm from . import annotations