diff --git a/.gitignore b/.gitignore index 2840779..b4d9d5d 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,8 @@ .idea/ *.iml +.vscode/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/docs/user-guide/load-ontology.rst b/docs/user-guide/load-ontology.rst index ece52e1..b78b376 100644 --- a/docs/user-guide/load-ontology.rst +++ b/docs/user-guide/load-ontology.rst @@ -41,6 +41,24 @@ and to load the data into :class:`hpotk.ontology.MinimalOntology`. A similar loader function :func:`hpotk.ontology.load.obographs.load_ontology` exists to load an :class:`hpotk.ontology.Ontology`. +Loading other ontologies +************************ + +HPO toolkit primarily supports HPO, but a few other ontologies were tested as an experimental feature. + +HPO toolkit should load Medical Action Ontology (MAxO) and Mondo Disease Ontology (MONDO). +For instance: + +.. doctest:: load-minimal-ontology + + >>> url = 'https://github.com/monarch-initiative/MAxO/releases/download/v2024-05-24/maxo.json' + >>> maxo = hpotk.load_minimal_ontology(url, prefixes_of_interest={'MAXO'}) + >>> maxo.version + '2024-05-24' + +We provided the `prefixes_of_interest` option to limit the terms to those with the `MAXO` prefix, +effectively discarding all terms of other ontologies from the loading process. As a result, +the ontology includes only the `MAXO` terms along with the corresponding ontology graph. Ontology store ^^^^^^^^^^^^^^ @@ -73,6 +91,60 @@ Moreover, `OntologyStore` will load the *latest* release, if the `release` optio As of the time of this writing, ``2024-03-06`` is the latest HPO release. +The store exposes the path of the ontology resources. For HPO, this is the path to the JSON file: + +.. 
doctest:: load-minimal-ontology + + >>> fpath_hpo = store.resolve_store_path(ontology_type=hpotk.OntologyType.HPO, release='v2023-10-09') + >>> fpath_hpo # doctest: +SKIP + '/path/to/.hpo-toolkit/HP/hp.v2023-10-09.json' + +The store can be cleared to remove the ontology resources from the local directory: + +.. doctest:: load-minimal-ontology + + >>> store.clear() # doctest: +SKIP + + +Support for other ontologies +**************************** + +HPO toolkit was developed to work best with HPO, since it is the flagship ontology of the toolkit's developers. +However, support for loading several other ontologies was tested as an experimental feature: + +* Medical Action Ontology (MAxO) + * `Manuscript `_ + * `GitHub <https://github.com/monarch-initiative/MAxO>`_ +* Mondo Disease Ontology (MONDO) + * `Website `_ + * `GitHub <https://github.com/monarch-initiative/mondo>`_ + +Let's start with loading MAxO: + +.. doctest:: load-minimal-ontology + + >>> maxo = store.load_minimal_ontology( + ... hpotk.store.OntologyType.MAxO, release="v2024-05-24", + ... prefixes_of_interest={'MAXO'}, + ... ) + +Note that we added the `prefixes_of_interest` option - a `set` of term prefixes that should be kept +when loading the ontology file. + +Mondo is loaded in a very similar fashion: + +.. doctest:: load-minimal-ontology + + >>> mondo = store.load_minimal_ontology( + ... hpotk.OntologyType.MONDO, release='v2024-06-04', + ... prefixes_of_interest={'MONDO',}, + ... ) + +Note, this guide shows loading with fixed versions. +However, other versions are supported as long as an existing release tag is used. +Check the ontology releases for the available tags. 
+ + Next steps ********** diff --git a/recipe/meta.yaml b/recipe/meta.yaml index af1dbf1..7d42c9d 100644 --- a/recipe/meta.yaml +++ b/recipe/meta.yaml @@ -1,5 +1,5 @@ {% set name = "hpo-toolkit" %} -{% set version = "0.5.1" %} +{% set version = "0.5.2" %} package: name: {{ name|lower }} diff --git a/src/hpotk/__init__.py b/src/hpotk/__init__.py index 620b6ee..8b5c48c 100644 --- a/src/hpotk/__init__.py +++ b/src/hpotk/__init__.py @@ -2,7 +2,7 @@ HPO toolkit is a library for working with Human Phenotype Ontology and the HPO annotation data. """ -__version__ = '0.5.1' +__version__ = '0.5.2' from . import algorithm from . import annotations diff --git a/src/hpotk/ontology/load/obographs/_load.py b/src/hpotk/ontology/load/obographs/_load.py index 492d2f6..5789b10 100644 --- a/src/hpotk/ontology/load/obographs/_load.py +++ b/src/hpotk/ontology/load/obographs/_load.py @@ -22,51 +22,75 @@ DATE_PATTERN = re.compile(r'.*/(?P\d{4}-\d{2}-\d{2})/.*') -def load_minimal_ontology(file: typing.Union[typing.IO, str], - term_factory: ObographsTermFactory[MinimalTerm] = MinimalTermFactory(), - graph_factory: GraphFactory = CsrIndexedGraphFactory()) -> MinimalOntology: - return _load_impl(file, term_factory, graph_factory, create_minimal_ontology) - - -def load_ontology(file: typing.Union[typing.IO, str], - term_factory: ObographsTermFactory[Term] = TermFactory(), - graph_factory: GraphFactory = CsrIndexedGraphFactory()) -> Ontology: - return _load_impl(file, term_factory, graph_factory, create_ontology) - - -def _load_impl(file: typing.Union[typing.IO, str], - term_factory: ObographsTermFactory[MinimalTerm], - graph_factory: GraphFactory, - ontology_creator): - hpo = get_hpo_graph(file) +def load_minimal_ontology( + file: typing.Union[typing.IO, str], + term_factory: ObographsTermFactory[MinimalTerm] = MinimalTermFactory(), + graph_factory: GraphFactory = CsrIndexedGraphFactory(), + prefixes_of_interest: typing.Set[str] = {'HP'}, +) -> MinimalOntology: + return _load_impl( + file, + 
term_factory, + graph_factory, + prefixes_of_interest, + create_minimal_ontology, + ) + + +def load_ontology( + file: typing.Union[typing.IO, str], + term_factory: ObographsTermFactory[Term] = TermFactory(), + graph_factory: GraphFactory = CsrIndexedGraphFactory(), + prefixes_of_interest: typing.Set[str] = {'HP'}, +) -> Ontology: + return _load_impl( + file, + term_factory, + graph_factory, + prefixes_of_interest, + create_ontology, + ) + + +def _load_impl( + file: typing.Union[typing.IO, str], + term_factory: ObographsTermFactory[MINIMAL_TERM], + graph_factory: GraphFactory, + prefixes_of_interest: typing.Set[str], + ontology_creator, +): + obograph = get_obographs_graph(file) logger.debug("Extracting ontology terms") - id_to_term_id, terms = extract_terms(hpo['nodes'], term_factory) + id_to_term_id, terms = extract_terms( + obograph['nodes'], term_factory, + prefixes_of_interest=prefixes_of_interest, + ) logger.debug("Creating the edge list") - edge_list = create_edge_list(hpo['edges'], id_to_term_id) + edge_list = create_edge_list(obograph['edges'], id_to_term_id) logger.debug("Building ontology graph") - graph: OntologyGraph = graph_factory.create_graph(edge_list) - if graph.root == OWL_THING: - # TODO - consider adding Owl thing into terms list + ontology_graph: OntologyGraph = graph_factory.create_graph(edge_list) + if ontology_graph.root == OWL_THING: + # TODO: - consider adding Owl thing into terms list pass - version = extract_ontology_version(hpo['meta']) + version = extract_ontology_version(obograph['meta']) logger.debug("Assembling the ontology") - ontology = ontology_creator(graph, terms, version) + ontology = ontology_creator(ontology_graph, terms, version) logger.debug("Done") return ontology -def get_hpo_graph(file: typing.Union[typing.IO, str]): +def get_obographs_graph(file: typing.Union[typing.IO, str]): with open_text_io_handle_for_reading(file) as fh: document = json.load(fh) if not isinstance(document, dict): raise ValueError(f'The JSON 
document should have been a dict but was {type(document)}') if 'graphs' not in document: - raise ValueError(f'Did not find the `graphs` attribute in the JSON document') + raise ValueError('Did not find the `graphs` attribute in the JSON document') graphs = document['graphs'] if not isinstance(graphs, typing.Sequence): - raise ValueError(f'`graphs` JSON attribute is not a sequence') + raise ValueError('`graphs` JSON attribute is not a sequence') if len(graphs) < 1: - raise ValueError(f'`graphs` JSON attribute is empty') + raise ValueError('`graphs` JSON attribute is empty') elif len(graphs) == 1: # The happy path return graphs[0] @@ -74,14 +98,16 @@ def get_hpo_graph(file: typing.Union[typing.IO, str]): raise ValueError(f'We expect exactly 1 graph but there are {len(graphs)} graphs in the JSON document') -def extract_terms(nodes: typing.Iterable[dict], - term_factory: ObographsTermFactory[MINIMAL_TERM]) \ - -> typing.Tuple[typing.Mapping[str, TermId], typing.Sequence[MINIMAL_TERM]]: +def extract_terms( + nodes: typing.Iterable[dict], + term_factory: ObographsTermFactory[MINIMAL_TERM], + prefixes_of_interest: typing.Set[str], +) -> typing.Tuple[typing.Mapping[str, TermId], typing.Sequence[MINIMAL_TERM]]: curie_to_term: typing.Dict[str, TermId] = {} - terms: typing.List[Term] = [] + terms: typing.List[MINIMAL_TERM] = [] for data in nodes: # 1) map data to `Node` - node: Node = create_node(data) + node: typing.Optional[Node] = create_node(data) # 2) we only work with class Nodes if not node or node.type != NodeType.CLASS: @@ -90,11 +116,11 @@ def extract_terms(nodes: typing.Iterable[dict], # 3) check if PURL is OK curie = extract_curie_from_purl(node.id) if not curie: - logger.debug(f'Unable to parse PURL {node.id} into CURIE') + logger.debug('Unable to extract CURIE from PURL %s', node.id) continue term_id = TermId.from_curie(curie) - if term_id.prefix != 'HP': - logger.debug(f'Skipping non-HPO term {term_id.value}') + if term_id.prefix not in prefixes_of_interest: + 
logger.debug('Skipping not a term of interest %s', term_id.value) continue curie_to_term[curie] = term_id @@ -107,32 +133,44 @@ def extract_terms(nodes: typing.Iterable[dict], return curie_to_term, terms -def create_edge_list(edges: typing.Iterable[typing.Dict[str, str]], - curie_to_termid: typing.Mapping[str, TermId]) -> typing.List[typing.Tuple[TermId, TermId]]: +def create_edge_list( + edges: typing.Iterable[typing.Dict[str, str]], + curie_to_termid: typing.Mapping[str, TermId], +) -> typing.List[typing.Tuple[TermId, TermId]]: edge_list: typing.List[typing.Tuple[TermId, TermId]] = [] for data in edges: edge: Edge = create_edge(data) # We only care about `is_a` relationships. if edge.pred != 'is_a': - logger.debug(f'Skipping edge with pred {edge.pred}!=\'is_a\'') + logger.debug('Skipping edge with pred %s!=\'is_a\'', edge.pred) continue # Get source and destination. + src_curie = extract_curie_from_purl(edge.sub) + if src_curie is None: + logger.warning('Unable to extract CURIE from sub PURL %s', edge.sub) + continue try: - curie = extract_curie_from_purl(edge.sub) - src: TermId = curie_to_termid[curie] + src: TermId = curie_to_termid[src_curie] except KeyError: - logger.warning(f'Source edge {edge.sub} was not found in terms') - # TODO - maybe we should even abort? + logger.debug( + 'Skipping edge %s %s %s because subject %s was was not found in terms', + edge.sub, edge.pred, edge.obj, edge.sub, + ) continue + dest_curie = extract_curie_from_purl(edge.obj) + if dest_curie is None: + logger.warning('Unable to extract CURIE from obj PURL %s', edge.obj) + continue try: - curie: str = extract_curie_from_purl(edge.obj) - dest: TermId = curie_to_termid[curie] + dest: TermId = curie_to_termid[dest_curie] except KeyError: - logger.warning(f'Destination edge {edge.obj} was not found in terms') - # TODO - maybe we should even abort? 
+ logger.debug( + 'Skipping edge %s %s %s because object %s was was not found in terms', + edge.sub, edge.pred, edge.obj, edge.obj, + ) continue edge_list.append((src, dest)) @@ -158,7 +196,7 @@ def extract_ontology_version(meta: dict) -> typing.Optional[str]: if match: return match.group('date') else: - logger.debug(f'Could not find a date pattern in version {meta["version"]}') + logger.debug('Could not find a date pattern in version %s', meta["version"]) return None elif 'basicPropertyValues' in meta: for bpv in meta['basicPropertyValues']: @@ -171,8 +209,8 @@ def extract_ontology_version(meta: dict) -> typing.Optional[str]: # } return bpv['val'] - logger.debug(f'Could not find basic property value with the version info') + logger.debug('Could not find basic property value with the version info') return None else: - logger.debug(f'Could not determine the ontology version') + logger.debug('Could not determine the ontology version') return None diff --git a/src/hpotk/py.typed b/src/hpotk/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/src/hpotk/store/__init__.py b/src/hpotk/store/__init__.py index 0d2398a..bc45b33 100644 --- a/src/hpotk/store/__init__.py +++ b/src/hpotk/store/__init__.py @@ -11,6 +11,16 @@ >>> hpo = store.load_minimal_hpo(release='v2023-10-09') >>> hpo.version '2023-10-09' + +or fetch the *latest* release by omitting the `release` argument: + +>>> latest_hpo = store.load_minimal_hpo() # doctest: +SKIP +>>> latest_hpo.version # doctest: +SKIP +'2024-04-26' + +.. note:: + + The release `2024-04-26` is the latest release as of June 2024 when this documentation was written. 
""" from ._api import OntologyType, OntologyStore, RemoteOntologyService, OntologyReleaseService diff --git a/src/hpotk/store/_api.py b/src/hpotk/store/_api.py index d8928a6..5918cb9 100644 --- a/src/hpotk/store/_api.py +++ b/src/hpotk/store/_api.py @@ -3,6 +3,7 @@ import io import logging import os +import shutil import typing from hpotk.ontology import MinimalOntology, Ontology @@ -19,6 +20,16 @@ class OntologyType(enum.Enum): """ Human Phenotype Ontology. """ + + MAxO = 'MAxO', 'MAXO' + """ + Medical Action Ontology. + """ + + MONDO = 'MONDO', 'MONDO' + """ + Mondo Disease Ontology. + """ def __new__(cls, *args, **kwargs): obj = object.__new__(cls) @@ -32,6 +43,12 @@ def __init__(self, _: str, identifier: str): def identifier(self) -> str: """ Get a `str` with the ontology identifier (e.g. ``HP`` for HPO). + + >>> from hpotk.store import OntologyType + >>> OntologyType.HPO.identifier + 'HP' + >>> OntologyType.MAxO.identifier + 'MAXO' """ return self._id_ @@ -101,37 +118,43 @@ def load_minimal_ontology( self, ontology_type: OntologyType, release: typing.Optional[str] = None, + **kwargs, ) -> MinimalOntology: """ Load a `release` of a given `ontology_type` as a minimal ontology. :param ontology_type: the desired ontology type, see :class:`OntologyType` for a list of supported ontologies. :param release: a `str` with the ontology release tag or `None` if the latest ontology should be fetched. + :param kwargs: key-value arguments passed to the low-level loader function (currently :func:`load_minimal_ontology`). :return: a minimal ontology. """ return self._impl_load_ontology( load_minimal_ontology, ontology_type, release, + **kwargs, ) - @abc.abstractmethod def load_ontology( self, ontology_type: OntologyType, release: typing.Optional[str] = None, + **kwargs, ) -> Ontology: """ Load a `release` of a given `ontology_type` as an ontology. :param ontology_type: the desired ontology type, see :class:`OntologyType` for a list of supported ontologies. 
:param release: a `str` with the ontology release tag or `None` if the latest ontology should be fetched. + :param kwargs: key-value arguments passed to the low-level loader function (currently :func:`load_ontology`). :return: an ontology. + :raises ValueError: if the `release` corresponds to a non-existing ontology release. """ return self._impl_load_ontology( load_ontology, ontology_type, release, + **kwargs, ) @property @@ -153,8 +176,13 @@ def load_minimal_hpo( :param release: an optional `str` with the desired HPO release (if `None`, the latest HPO will be provided). :return: a :class:`hpotk.MinimalOntology` with the HPO data. + :raises ValueError: if the `release` corresponds to a non-existing HPO release. """ - return self.load_minimal_ontology(OntologyType.HPO, release=release) + return self.load_minimal_ontology( + OntologyType.HPO, + release=release, + prefixes_of_interest={'HP'}, + ) def load_hpo( self, @@ -165,32 +193,109 @@ def load_hpo( :param release: an optional `str` with the desired HPO release (if `None`, the latest HPO will be provided). :return: a :class:`hpotk.Ontology` with the HPO data. + :raises ValueError: if the `release` corresponds to a non-existing HPO release. """ - return self.load_ontology(OntologyType.HPO, release=release) + return self.load_ontology( + OntologyType.HPO, + release=release, + prefixes_of_interest={'HP'}, + ) - def _impl_load_ontology( + def clear( + self, + ontology_type: typing.Optional[OntologyType] = None, + ): + """ + Clear all ontology resources or resources of selected `ontology_type`. + + :param ontology_type: the ontology to be cleared or `None` if resources of *all* ontologies should be cleared. 
+ """ + to_delete = [] + if ontology_type is None: + to_delete.extend(os.listdir(self._store_dir)) + else: + to_delete.append(os.path.join(self._store_dir, ontology_type.identifier)) + + for item in to_delete: + full_path = os.path.join(self._store_dir, item) + if os.path.isdir(full_path): + shutil.rmtree(full_path) + else: + os.remove(full_path) + + def resolve_store_path( self, - loader_func, ontology_type: OntologyType, release: typing.Optional[str] = None, - ): - fdir_ontology = os.path.join(self.store_dir, ontology_type.identifier) + ) -> str: + """ + Resolve the path of the ontology resource (e.g. HPO `hp.json` file) within the ontology store. + + Note, the path points to the location of the ontology resource in the local filesystem. + The path may point to a non-existing file, if the load function has not been run yet. + + **Example** + + >>> import hpotk + >>> store = hpotk.configure_ontology_store() + >>> store.resolve_store_path(hpotk.store.OntologyType.HPO, release='v2023-10-09') # doctest: +SKIP + '/home/user/.hpo-toolkit/HP/hp.v2023-10-09.json' + + :param ontology_type: the desired ontology type, see :class:`OntologyType` for a list of supported ontologies. + :param release: an optional `str` with the desired ontology release (if `None`, the latest ontology will be provided). + :return: a `str` with path to the ontology resource. + """ + fdir_ontology = os.path.join(self._store_dir, ontology_type.identifier) if release is None: # Fetch the latest release tag, assuming the lexicographic tag sort order. 
- latest_tag = max(self._ontology_release_service.fetch_tags(ontology_type), default=None) - if latest_tag is None: - raise ValueError(f'Unable to retrieve the latest tag for {ontology_type}') - release = latest_tag + release = self._fetch_latest_release_if_missing(ontology_type) + + return os.path.join( + fdir_ontology, f"{ontology_type.identifier.lower()}.{release}.json" + ) + + def _fetch_latest_release_if_missing( + self, + ontology_type: OntologyType, + ) -> str: + """ + Retrieve the latest release tag of the given `ontology_type`. + + :param ontology_type: the ontology resource of interest + :return: a `str` with the latest ontology tag + :raises ValueError` if unable to retrieve the latest release tag from the ontology release service + """ + + # Fetch the latest release tag, assuming the lexicographic tag sort order. + latest_tag = max( + self._ontology_release_service.fetch_tags(ontology_type), default=None + ) + if latest_tag is None: + raise ValueError(f"Unable to retrieve the latest tag for {ontology_type}") + return latest_tag + + def _impl_load_ontology( + self, + loader_func, + ontology_type: OntologyType, + release: typing.Optional[str] = None, + **kwargs, + ): + if release is None: + release = self._fetch_latest_release_if_missing(ontology_type) - fpath_ontology = os.path.join(fdir_ontology, f'{ontology_type.identifier.lower()}.{release}.json') + fpath_ontology = self.resolve_store_path(ontology_type, release) # Download ontology if missing. 
if not os.path.isfile(fpath_ontology): + fdir_ontology = os.path.dirname(fpath_ontology) os.makedirs(fdir_ontology, exist_ok=True) - with self._remote_ontology_service.fetch_ontology(ontology_type, release) as response, open(fpath_ontology, 'wb') as fh_ontology: + with self._remote_ontology_service.fetch_ontology( + ontology_type, release + ) as response, open(fpath_ontology, "wb") as fh_ontology: fh_ontology.write(response.read()) - self._logger.info('Stored the ontology at %s', fpath_ontology) + self._logger.debug("Stored the ontology at %s", fpath_ontology) # Load the ontology - return loader_func(fpath_ontology) + return loader_func(fpath_ontology, **kwargs) diff --git a/src/hpotk/store/_github.py b/src/hpotk/store/_github.py index 486c2b1..51f221e 100644 --- a/src/hpotk/store/_github.py +++ b/src/hpotk/store/_github.py @@ -14,7 +14,15 @@ OntologyType.HPO: { 'owner': 'obophenotype', 'repo': 'human-phenotype-ontology', - } + }, + OntologyType.MAxO: { + 'owner': 'monarch-initiative', + 'repo': 'MAxO', + }, + OntologyType.MONDO: { + 'owner': 'monarch-initiative', + 'repo': 'mondo', + }, } """ The default ontology credentials that only include HPO at the time. 
diff --git a/tests/conftest.py b/tests/conftest.py index 3481355..a7d7524 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -69,3 +69,8 @@ def small_hpo(fpath_small_hpo: str) -> hpotk.Ontology: @pytest.fixture(scope='session') def fpath_real_shortlist_hpoa(fpath_data: str) -> str: return os.path.join(fpath_data, 'phenotype.real-shortlist.hpoa') + + +@pytest.fixture(scope='session') +def fpath_real_maxo(fpath_data: str) -> str: + return os.path.join(fpath_data, 'maxo.2024-05-24.json.gz') diff --git a/tests/data/maxo.2024-05-24.json.gz b/tests/data/maxo.2024-05-24.json.gz new file mode 100644 index 0000000..bfefc39 Binary files /dev/null and b/tests/data/maxo.2024-05-24.json.gz differ diff --git a/tests/ontology/load/test_obographs.py b/tests/ontology/load/test_obographs.py index d1b76af..ee3b2b0 100644 --- a/tests/ontology/load/test_obographs.py +++ b/tests/ontology/load/test_obographs.py @@ -1,78 +1,79 @@ +import typing import pytest import hpotk -import hpotk as hp -from hpotk.model import TermId +from hpotk.model import TermId, MinimalTerm from hpotk.ontology.load.obographs import * -hp.util.setup_logging() - class TestLoad: def test_load_minimal_ontology(self, fpath_toy_hpo: str): - o: hp.ontology.MinimalOntology = load_minimal_ontology(fpath_toy_hpo) + o: hpotk.ontology.MinimalOntology = load_minimal_ontology(fpath_toy_hpo) assert o is not None, "Ontology must not be None" - assert isinstance(o, hp.ontology.MinimalOntology) + assert isinstance(o, hpotk.ontology.MinimalOntology) assert o.version == '2022-10-05' assert 393 == len(o), "There must be 393 terms in the ontology" assert 557 == len(list(o.term_ids)), "There must be 557 term IDs in the ontology" assert 557 == len(set(o.term_ids)), "There must be 557 unique term IDs in the ontology" - assert all([term_id in o for term_id in o.term_ids]), "The ontology must contain all term IDs" - assert all([o.get_term(k) is not None for k in o.term_ids]), \ + assert all(term_id in o for term_id in o.term_ids), 
"The ontology must contain all term IDs" + assert all(o.get_term(k) is not None for k in o.term_ids), \ "The `get_term` must get primary term for any term ID from ontology" - assert all([o.get_term(k.value) is not None for k in o.term_ids]), \ + assert all(o.get_term(k.value) is not None for k in o.term_ids), \ "The `get_term` must get primary term for any term ID value from ontology" - assert all([o.get_term(k).identifier == k or k in o.get_term(k).alt_term_ids for k in o.term_ids]), \ + assert all(o.get_term(k).identifier == k or k in o.get_term(k).alt_term_ids for k in o.term_ids), \ "Each term ID must be either primary or alternative ID" def test_load_ontology(self, fpath_toy_hpo: str): - o: hp.ontology.Ontology = load_ontology(fpath_toy_hpo) + o: hpotk.ontology.Ontology = load_ontology(fpath_toy_hpo) assert o is not None, "Ontology must not be None" - assert isinstance(o, hp.ontology.Ontology) + assert isinstance(o, hpotk.ontology.Ontology) assert o.version == '2022-10-05' assert 393 == len(o), "There must be 393 terms in the ontology" assert 557 == len(list(o.term_ids)), "There must be 557 term IDs in the ontology" assert 557 == len(set(o.term_ids)), "There must be 557 unique term IDs in the ontology" - assert all([term_id in o for term_id in o.term_ids]), \ + assert all(term_id in o for term_id in o.term_ids), \ "The ontology must contain all term IDs" - assert all([o.get_term(k) is not None for k in o.term_ids]), \ + assert all(o.get_term(k) is not None for k in o.term_ids), \ "The `get_term` must get primary term for any term ID from ontology" - assert all([o.get_term(k).identifier == k or k in o.get_term(k).alt_term_ids for k in o.term_ids]), \ + assert all(o.get_term(k).identifier == k or k in o.get_term(k).alt_term_ids for k in o.term_ids), \ "Each term ID must be either primary or alternative ID" def test_load_minimal_ontology_backed_by_csr(self, fpath_toy_hpo: str): - term_factory = hp.ontology.load.obographs.MinimalTermFactory() - graph_factory = 
hp.graph.CsrGraphFactory() - o: hp.ontology.MinimalOntology = load_minimal_ontology(fpath_toy_hpo, - term_factory=term_factory, - graph_factory=graph_factory) + term_factory = hpotk.ontology.load.obographs.MinimalTermFactory() + graph_factory = hpotk.graph.CsrGraphFactory() + o: hpotk.ontology.MinimalOntology = load_minimal_ontology( + fpath_toy_hpo, + term_factory=term_factory, + graph_factory=graph_factory, + ) assert o is not None, "Ontology must not be None" arachnodactyly = TermId.from_curie("HP:0001166") - assert all([val.value in {"HP:0001238", "HP:0100807"} for val in (o.graph.get_parents(arachnodactyly))]) + assert all( + [ + val.value in {"HP:0001238", "HP:0100807"} + for val in (o.graph.get_parents(arachnodactyly)) + ] + ) assert len(list(o.graph.get_children(arachnodactyly))) == 0 - @pytest.mark.skip - def test_real_life(self): - o: hp.ontology.Ontology = load_ontology('/home/ielis/data/ontologies/hpo/2023-01-27/hp.json') - assert o is not None, "Ontology must not be None" - @pytest.mark.skip def test_print_stats(self, fpath_toy_hpo: str): import json + with open(fpath_toy_hpo) as fh: graphs = json.load(fh) - graph = graphs['graphs'][0] - all_nodes = graph['nodes'] - all_edges = graph['edges'] - print(f'All nodes: {len(all_nodes)}, all edges: {len(all_edges)}') + graph = graphs["graphs"][0] + all_nodes = graph["nodes"] + all_edges = graph["edges"] + print(f"All nodes: {len(all_nodes)}, all edges: {len(all_edges)}") current_nodes = self._get_current_nodes(all_nodes) - print(f'Current nodes: {len(current_nodes)}') + print(f"Current nodes: {len(current_nodes)}") # Getting the number of all term IDs is too complicated to be implemented here at this moment. # The functionality should be implemented later if necessary. 
@@ -80,10 +81,10 @@ def test_print_stats(self, fpath_toy_hpo: str): def _get_current_nodes(nodes): result = [] for node in nodes: - if 'meta' in node: - meta = node['meta'] - if 'deprecated' in meta: - deprecated = meta['deprecated'] + if "meta" in node: + meta = node["meta"] + if "deprecated" in meta: + deprecated = meta["deprecated"] if not deprecated: result.append(node) else: @@ -96,53 +97,111 @@ class TestTerms: We only load the ontology once, and we test the properties of the loaded data. """ - @pytest.fixture(scope='class') + @pytest.fixture(scope="class") def toy_ontology(self, fpath_toy_hpo: str) -> hpotk.Ontology: return load_ontology(fpath_toy_hpo) def test_term_properties(self, toy_ontology: hpotk.Ontology): # Test properties of a Term - term = toy_ontology.get_term('HP:0001626') + term = toy_ontology.get_term("HP:0001626") - assert term.identifier.value == 'HP:0001626' - assert term.name == 'Abnormality of the cardiovascular system' + assert term is not None + assert term.identifier.value == "HP:0001626" + assert term.name == "Abnormality of the cardiovascular system" definition = term.definition - assert definition.definition == 'Any abnormality of the cardiovascular system.' - assert definition.xrefs == ('HPO:probinson',) - assert term.comment == 'The cardiovascular system consists of the heart, vasculature, and the lymphatic system.' + assert definition.definition == "Any abnormality of the cardiovascular system." + assert definition.xrefs == ("HPO:probinson",) + assert ( + term.comment + == "The cardiovascular system consists of the heart, vasculature, and the lymphatic system." 
+ ) assert not term.is_obsolete - assert term.alt_term_ids == (TermId.from_curie('HP:0003116'),) + assert term.alt_term_ids == (TermId.from_curie("HP:0003116"),) synonyms = term.synonyms assert len(synonyms) == 3 one = synonyms[0] - assert one.name == 'Cardiovascular disease' - assert one.category == hp.model.SynonymCategory.RELATED - assert one.synonym_type == hp.model.SynonymType.LAYPERSON_TERM + assert one.name == "Cardiovascular disease" + assert one.category == hpotk.model.SynonymCategory.RELATED + assert one.synonym_type == hpotk.model.SynonymType.LAYPERSON_TERM assert one.xrefs is None two = synonyms[1] - assert two.name == 'Cardiovascular abnormality' - assert two.category == hp.model.SynonymCategory.EXACT - assert two.synonym_type == hp.model.SynonymType.LAYPERSON_TERM + assert two.name == "Cardiovascular abnormality" + assert two.category == hpotk.model.SynonymCategory.EXACT + assert two.synonym_type == hpotk.model.SynonymType.LAYPERSON_TERM assert two.xrefs is None three = synonyms[2] - assert three.name == 'Abnormality of the cardiovascular system' - assert three.category == hp.model.SynonymCategory.EXACT - assert three.synonym_type == hp.model.SynonymType.LAYPERSON_TERM + assert three.name == "Abnormality of the cardiovascular system" + assert three.category == hpotk.model.SynonymCategory.EXACT + assert three.synonym_type == hpotk.model.SynonymType.LAYPERSON_TERM assert three.xrefs is None - assert term.xrefs == tuple(TermId.from_curie(curie) for curie in ('UMLS:C0243050', 'UMLS:C0007222', - 'MSH:D018376', 'SNOMEDCT_US:49601007', - 'MSH:D002318')) + assert term.xrefs == tuple( + TermId.from_curie(curie) + for curie in ( + "UMLS:C0243050", + "UMLS:C0007222", + "MSH:D018376", + "SNOMEDCT_US:49601007", + "MSH:D002318", + ) + ) def test_synonym_properties(self, toy_ontology: hpotk.Ontology): - term = toy_ontology.get_term('HP:0001627') + term = toy_ontology.get_term("HP:0001627") + assert term is not None + synonym = term.synonyms[7] - assert synonym.name 
== 'Abnormally shaped heart' - assert synonym.category == hp.model.SynonymCategory.EXACT - assert synonym.synonym_type == hp.model.SynonymType.LAYPERSON_TERM - assert synonym.xrefs == [TermId.from_curie('ORCID:0000-0001-5208-3432')] + assert synonym.name == "Abnormally shaped heart" + assert synonym.category == hpotk.model.SynonymCategory.EXACT + assert synonym.synonym_type == hpotk.model.SynonymType.LAYPERSON_TERM + assert synonym.xrefs == [TermId.from_curie("ORCID:0000-0001-5208-3432")] + + +class TestLoadMaxo: + + def test_load_minimal_maxo( + self, + fpath_real_maxo: str, + ): + maxo = load_minimal_ontology(fpath_real_maxo, prefixes_of_interest={"MAXO"}) + + assert maxo.version == "2024-05-24" + + # Check root + root = maxo.graph.root + assert root is not None + assert root.value == "MAXO:0000001" + + # Check all MAxO terms are in the graph + for term in maxo.terms: + assert ( + term.identifier in maxo.graph + ), f"{term.identifier.value} should be in the graph" + + # Check we loaded a specific number of terms + assert len(maxo) == 1788 + + # Check we have certain number of terms in the graph + descendants = set(maxo.graph.get_descendants(root)) + assert len(descendants) == len(maxo) - 1 # -1 for the root + + # Check all terms are in the graph + cda_curie = "MAXO:0000043" # communicable disease avoidance + cda_term: typing.Optional[MinimalTerm] = maxo.get_term(cda_curie) + assert cda_term is not None + + assert cda_term.name == "communicable disease avoidance" + + ancestors = list(maxo.graph.get_ancestors(cda_term)) + + assert set(anc.value for anc in ancestors) == { + "MAXO:0000001", # medical action + "MAXO:0000002", # therapeutic procedure + "MAXO:0000151", # therapeutic avoidance intervention + "MAXO:0000042", # preventable disease avoidance recommendation + } diff --git a/tests/test_store.py b/tests/test_store.py index 3e77138..20be2fb 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -11,75 +11,226 @@ class 
MockRemoteOntologyService(hpotk.store.RemoteOntologyService): def __init__( - self, - release: str, - payload: bytes, + self, + release: str, + payload: bytes, ): self._release = release self._payload = payload def fetch_ontology( - self, - ontology_type: hpotk.OntologyType, - release: typing.Optional[str] = None, + self, + ontology_type: hpotk.OntologyType, + release: typing.Optional[str] = None, ) -> io.BufferedIOBase: if release == self._release: return io.BytesIO(self._payload) else: - raise ValueError(f'Unsupported release {release}') + raise ValueError(f"Unsupported release {release}") -class TestGitHubOntologyStore: +class TestGitHubOntologyStoreOffline: - @pytest.fixture(scope='class') + @pytest.fixture(scope="class") def remote_ontology_service( - self, - fpath_toy_hpo: str, + self, + fpath_toy_hpo: str, ) -> hpotk.store.RemoteOntologyService: - with open(fpath_toy_hpo, 'rb') as fh: + with open(fpath_toy_hpo, "rb") as fh: return MockRemoteOntologyService( - release='v2022-10-05', + release="v2022-10-05", payload=fh.read(), ) @pytest.fixture def ontology_store( - self, - tmp_path: Path, - remote_ontology_service: hpotk.store.RemoteOntologyService, + self, + tmp_path: Path, + remote_ontology_service: hpotk.store.RemoteOntologyService, ) -> hpotk.OntologyStore: - return hpotk.configure_ontology_store( + ontology_release_service = hpotk.store.GitHubOntologyReleaseService() + return hpotk.store.OntologyStore( store_dir=str(tmp_path), + ontology_release_service=ontology_release_service, remote_ontology_service=remote_ontology_service, ) def test_load_minimal_hpo( - self, - ontology_store: hpotk.OntologyStore, + self, + ontology_store: hpotk.OntologyStore, ): # We start with a clean slate. 
assert len(os.listdir(ontology_store.store_dir)) == 0 - release = 'v2022-10-05' + release = "v2022-10-05" hpo = ontology_store.load_minimal_hpo(release=release) assert isinstance(hpo, hpotk.MinimalOntology) assert hpo.version == release[1:] - fpath_expected = os.path.join(ontology_store.store_dir, 'HP', f'hp.{release}.json') + fpath_expected = os.path.join( + ontology_store.store_dir, "HP", f"hp.{release}.json" + ) assert os.path.isfile(fpath_expected) def test_load_minimal_hpo__invalid_release( - self, - ontology_store: hpotk.OntologyStore, + self, + ontology_store: hpotk.OntologyStore, ): - release = 'v3400-12-31' + release = "v3400-12-31" with pytest.raises(ValueError) as e: ontology_store.load_minimal_hpo(release=release) # We test that we get whatever exception was raised by the `RemoteOntologyService`. - assert e.value.args[0] == f'Unsupported release {release}' + assert e.value.args[0] == f"Unsupported release {release}" + + @pytest.mark.parametrize( + "ontology_type,release,expected_fname", + [ + (hpotk.store.OntologyType.HPO, "v2024-04-19", "hp.v2024-04-19.json"), + (hpotk.store.OntologyType.HPO, "v2024-04-26", "hp.v2024-04-26.json"), + (hpotk.store.OntologyType.MAxO, "v2024-05-24", "maxo.v2024-05-24.json"), + ], + ) + def test_resolve_store_path( + self, + ontology_store: hpotk.OntologyStore, + ontology_type: hpotk.store.OntologyType, + release: str, + expected_fname: str, + ): + actual = ontology_store.resolve_store_path( + ontology_type=ontology_type, release=release + ) + + expected = os.path.join( + ontology_store.store_dir, ontology_type.identifier, expected_fname + ) + assert actual == expected + + def test_clear__everything( + self, + ontology_store: hpotk.OntologyStore, + ): + store_dir = Path(ontology_store.store_dir) + + stuff = os.listdir(store_dir) + assert len(stuff) == 0, "The store directory is empty upon start" + + TestGitHubOntologyStoreOffline.initialize_store_dir(store_dir) + + stuff = os.listdir(store_dir) + assert ( + len(stuff) == 3 
+ ), "The store directory now includes two folders and one file" + + ontology_store.clear() + + stuff = os.listdir(store_dir) + assert len(stuff) == 0, "The store directory is empty after clearing everything" + + @pytest.mark.parametrize( + "resource", + [ + hpotk.OntologyType.HPO, + ], + ) + def test_clear__ontology_type( + self, + resource: hpotk.OntologyType, + ontology_store: hpotk.OntologyStore, + ): + store_dir = Path(ontology_store.store_dir) + stuff = os.listdir(store_dir) + + assert len(stuff) == 0, "The store directory is empty upon start" + + TestGitHubOntologyStoreOffline.initialize_store_dir(store_dir) + + stuff = os.listdir(store_dir) + assert ( + len(stuff) == 3 + ), "The store directory now includes two folders and one file" + + ontology_store.clear(resource) + + stuff = os.listdir(store_dir) + assert len(stuff) == 2 + + @staticmethod + def initialize_store_dir(store_dir: Path): + # Make a few folders and files + store_dir.joinpath("joe.txt").touch() # a file + + hp_path = store_dir.joinpath("HP") # a folder + os.mkdir(hp_path) + hp_path.joinpath("a.txt").touch() + hp_path.joinpath("b.txt").touch() + + mondo_path = store_dir.joinpath("MONDO") # another folder + os.mkdir(mondo_path) + mondo_path.joinpath("x.txt").touch() + mondo_path.joinpath("y.txt").touch() + +@pytest.mark.online +class TestGitHubOntologyStoreOnline: + """ + Tests of real-life situations. + """ + + @pytest.fixture + def ontology_store(self, tmp_path: Path) -> hpotk.OntologyStore: + return hpotk.OntologyStore( + store_dir=str(tmp_path), + ontology_release_service=hpotk.store.GitHubOntologyReleaseService(), + remote_ontology_service=hpotk.store.GitHubRemoteOntologyService()) + + def test_load_minimal_maxo(self, ontology_store: hpotk.OntologyStore): + """ + Test that we can load MAxO with a little bit of extra TLC. 
+ """ + maxo = ontology_store.load_minimal_ontology( + hpotk.store.OntologyType.MAxO, + release="v2024-05-24", + prefixes_of_interest={'MAXO'}, + ) + assert maxo is not None + + assert isinstance(maxo, hpotk.MinimalOntology) + assert maxo.version == '2024-05-24' + + assert len(maxo) == 1788 + assert maxo.graph.root.value == 'MAXO:0000001' + + def test_load_minimal_mondo(self, ontology_store: hpotk.OntologyStore): + """ + Test that we can load MONDO with a tiny bit of extra TLC. + """ + mondo = ontology_store.load_minimal_ontology( + hpotk.store.OntologyType.MONDO, + release='v2024-06-04', + prefixes_of_interest={'MONDO'}, + ) + + assert isinstance(mondo, hpotk.MinimalOntology) + assert mondo.version == '2024-06-04' + + assert len(mondo) == 24_260 + + children = set(mondo.get_term_name(term_id) for term_id in mondo.graph.get_children(mondo.graph.root)) + assert children == { + 'disease', 'disease characteristic', + 'disease susceptibility', 'injury', + } + + disease_id = 'MONDO:0000001' # `disease` + disease = mondo.get_term(disease_id) + assert disease is not None + assert disease.name == 'disease' + + second_children = set(mondo.get_term_name(term_id) for term_id in mondo.graph.get_children(disease_id)) + assert second_children == {'human disease', 'non-human animal disease'} @pytest.mark.online @@ -90,8 +241,8 @@ def ontology_release_service(self) -> hpotk.store.OntologyReleaseService: return hpotk.store.GitHubOntologyReleaseService() def test_ontology_release_service( - self, - ontology_release_service: hpotk.store.OntologyReleaseService, + self, + ontology_release_service: hpotk.store.OntologyReleaseService, ): tag_iter = ontology_release_service.fetch_tags(hpotk.store.OntologyType.HPO) @@ -100,10 +251,35 @@ def test_ontology_release_service( tags = set(tag_iter) expected = { # As of May 20th, 2024 - 'v2020-08-11', 'v2020-10-12', 'v2020-12-07', 'v2021-02-08', 'v2021-04-13', 'v2021-06-08', 'v2021-06-13', - 'v2021-08-02', 'v2021-10-10', 'v2022-01-27', 
'v2022-02-14', 'v2022-04-14', 'v2022-06-11', 'v2022-10-05', - 'v2022-12-15', 'v2023-01-27', 'v2023-04-05', 'v2023-06-06', 'v2023-06-17', 'v2023-07-21', 'v2023-09-01', - 'v2023-10-09', 'v2024-01-11', 'v2024-01-16', 'v2024-02-08', 'v2024-03-06', 'v2024-04-03', 'v2024-04-04', - 'v2024-04-19', 'v2024-04-26', + "v2020-08-11", + "v2020-10-12", + "v2020-12-07", + "v2021-02-08", + "v2021-04-13", + "v2021-06-08", + "v2021-06-13", + "v2021-08-02", + "v2021-10-10", + "v2022-01-27", + "v2022-02-14", + "v2022-04-14", + "v2022-06-11", + "v2022-10-05", + "v2022-12-15", + "v2023-01-27", + "v2023-04-05", + "v2023-06-06", + "v2023-06-17", + "v2023-07-21", + "v2023-09-01", + "v2023-10-09", + "v2024-01-11", + "v2024-01-16", + "v2024-02-08", + "v2024-03-06", + "v2024-04-03", + "v2024-04-04", + "v2024-04-19", + "v2024-04-26", } assert all(tag in tags for tag in expected)