Merge pull request #73 from ielis:release-v0.5.2

Release v0.5.2
ielis · Jun 17, 2024 · 5bc6eb5 · 5bc6eb5
2 parents 5912542 + 47e5ba5
commit 5bc6eb5
Show file tree

Hide file tree

Showing 13 changed files with 635 additions and 160 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,8 @@
 .idea/
 *.iml
 
+.vscode/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/docs/user-guide/load-ontology.rst b/docs/user-guide/load-ontology.rst
@@ -41,6 +41,24 @@ and to load the data into :class:`hpotk.ontology.MinimalOntology`.
 A similar loader function :func:`hpotk.ontology.load.obographs.load_ontology` exists
 to load an :class:`hpotk.ontology.Ontology`.
 
+Loading other ontologies
+************************
+
+HPO toolkit primarily supports HPO, but few other ontologies were tested as an experimental feature.
+
+HPO toolkit should load Medical Action Ontology (MAxO) and Mondo Disease Ontology (MONDO).
+For instance:
+
+.. doctest:: load-minimal-ontology
+
+  >>> url = 'https://github.com/monarch-initiative/MAxO/releases/download/v2024-05-24/maxo.json'
+  >>> maxo = hpotk.load_minimal_ontology(url, prefixes_of_interest={'MAXO'})
+  >>> maxo.version
+  '2024-05-24'
+
+We provided `prefixes_of_interest` option to limit the terms to those with `MAXO` prefix, 
+effectively discarding all terms of other ontologies from the loading process. In result,
+the ontology includes only the `MAXO` terms along with the corresponding ontology graph.
 
 Ontology store
 ^^^^^^^^^^^^^^
@@ -73,6 +91,60 @@ Moreover, `OntologyStore` will load the *latest* release, if the `release` optio
 As of the time of this writing, ``2024-03-06`` is the latest HPO release.
 
 
+The store exposes the path of the ontology resources. For HPO, this is the path to the JSON file:
+
+.. doctest:: load-minimal-ontology
+
+  >>> fpath_hpo = store.resolve_store_path(ontology_type=hpotk.OntologyType.HPO, release='v2023-10-09')
+  >>> fpath_hpo  # doctest: +SKIP
+  '/path/to/.hpo-toolkit/HP/hp.v2023-10-09.json'
+
+The ontology resources can be cleaned to remove the content of the local directory:
+
+.. doctest:: load-minimal-ontology
+
+  >>> store.clear()  # doctest: +SKIP
+
+
+Support for other ontologies
+****************************
+
+HPO toolkit was developed to work best with HPO, since it is the flagship ontology of the toolkit's developers. 
+However, support for loading of several other ontologies was tested as an experimental feature:
+
+* Medical Action Ontology (MAxO)
+  * `Manuscript <https://pubmed.ncbi.nlm.nih.gov/37963467>`_
+  * `GitHub <https://github.com/monarch-initiative/MAxO>`_
+* Mondo Disease Ontology (MONDO)
+  * `Website <https://mondo.monarchinitiative.org>`_
+  * `GitHub <https://github.com/monarch-initiative/mondo>`_
+
+Let's start with loading MAxO:
+
+.. doctest:: load-minimal-ontology
+
+  >>> maxo = store.load_minimal_ontology(
+  ...     hpotk.store.OntologyType.MAxO, release="v2024-05-24",
+  ...     prefixes_of_interest={'MAXO'},
+  ... )
+
+Note that we added the `prefixes_of_interest` option - a `set` of term prefixes that should be kept 
+when loading the ontology file.
+
+Mondo is loaded in a very similar fashion:
+
+.. doctest:: load-minimal-ontology
+
+  >>> mondo = store.load_minimal_ontology(
+  ...     hpotk.OntologyType.MONDO, release='v2024-06-04',
+  ...     prefixes_of_interest={'MONDO',},
+  ... )
+
+Note, this guide shows the loading with a set versions. 
+However, other versions are supported as long as an existing release tag is used.
+Check the ontology releases for the available tags.
+
+
 Next steps
 **********
 

diff --git a/recipe/meta.yaml b/recipe/meta.yaml
@@ -1,5 +1,5 @@
 {% set name = "hpo-toolkit" %}
-{% set version = "0.5.1" %}
+{% set version = "0.5.2" %}
 
 package:
   name: {{ name|lower }}

diff --git a/src/hpotk/__init__.py b/src/hpotk/__init__.py
@@ -2,7 +2,7 @@
 HPO toolkit is a library for working with Human Phenotype Ontology and the HPO annotation data.
 """
 
-__version__ = '0.5.1'
+__version__ = '0.5.2'
 
 from . import algorithm
 from . import annotations

diff --git a/src/hpotk/ontology/load/obographs/_load.py b/src/hpotk/ontology/load/obographs/_load.py
@@ -22,66 +22,92 @@
 DATE_PATTERN = re.compile(r'.*/(?P<date>\d{4}-\d{2}-\d{2})/.*')
 
 
-def load_minimal_ontology(file: typing.Union[typing.IO, str],
-                          term_factory: ObographsTermFactory[MinimalTerm] = MinimalTermFactory(),
-                          graph_factory: GraphFactory = CsrIndexedGraphFactory()) -> MinimalOntology:
-    return _load_impl(file, term_factory, graph_factory, create_minimal_ontology)
-
-
-def load_ontology(file: typing.Union[typing.IO, str],
-                  term_factory: ObographsTermFactory[Term] = TermFactory(),
-                  graph_factory: GraphFactory = CsrIndexedGraphFactory()) -> Ontology:
-    return _load_impl(file, term_factory, graph_factory, create_ontology)
-
-
-def _load_impl(file: typing.Union[typing.IO, str],
-               term_factory: ObographsTermFactory[MinimalTerm],
-               graph_factory: GraphFactory,
-               ontology_creator):
-    hpo = get_hpo_graph(file)
+def load_minimal_ontology(
+        file: typing.Union[typing.IO, str],
+        term_factory: ObographsTermFactory[MinimalTerm] = MinimalTermFactory(),
+        graph_factory: GraphFactory = CsrIndexedGraphFactory(),
+        prefixes_of_interest: typing.Set[str] = {'HP'},
+) -> MinimalOntology:
+    return _load_impl(
+        file,
+        term_factory, 
+        graph_factory, 
+        prefixes_of_interest, 
+        create_minimal_ontology,
+    )
+
+
+def load_ontology(
+        file: typing.Union[typing.IO, str],
+        term_factory: ObographsTermFactory[Term] = TermFactory(),
+        graph_factory: GraphFactory = CsrIndexedGraphFactory(),
+        prefixes_of_interest: typing.Set[str] = {'HP'},
+) -> Ontology:
+    return _load_impl(
+        file,
+        term_factory, 
+        graph_factory, 
+        prefixes_of_interest, 
+        create_ontology,
+    )
+
+
+def _load_impl(
+        file: typing.Union[typing.IO, str],
+        term_factory: ObographsTermFactory[MINIMAL_TERM],
+        graph_factory: GraphFactory,
+        prefixes_of_interest: typing.Set[str],
+        ontology_creator,        
+):
+    obograph = get_obographs_graph(file)
     logger.debug("Extracting ontology terms")
-    id_to_term_id, terms = extract_terms(hpo['nodes'], term_factory)
+    id_to_term_id, terms = extract_terms(
+        obograph['nodes'], term_factory, 
+        prefixes_of_interest=prefixes_of_interest,
+    )
     logger.debug("Creating the edge list")
-    edge_list = create_edge_list(hpo['edges'], id_to_term_id)
+    edge_list = create_edge_list(obograph['edges'], id_to_term_id)
     logger.debug("Building ontology graph")
-    graph: OntologyGraph = graph_factory.create_graph(edge_list)
-    if graph.root == OWL_THING:
-        # TODO - consider adding Owl thing into terms list
+    ontology_graph: OntologyGraph = graph_factory.create_graph(edge_list)
+    if ontology_graph.root == OWL_THING:
+        # TODO: - consider adding Owl thing into terms list
         pass
-    version = extract_ontology_version(hpo['meta'])
+    version = extract_ontology_version(obograph['meta'])
     logger.debug("Assembling the ontology")
-    ontology = ontology_creator(graph, terms, version)
+    ontology = ontology_creator(ontology_graph, terms, version)
     logger.debug("Done")
     return ontology
 
 
-def get_hpo_graph(file: typing.Union[typing.IO, str]):
+def get_obographs_graph(file: typing.Union[typing.IO, str]):
     with open_text_io_handle_for_reading(file) as fh:
         document = json.load(fh)
     if not isinstance(document, dict):
         raise ValueError(f'The JSON document should have been a dict but was {type(document)}')
     if 'graphs' not in document:
-        raise ValueError(f'Did not find the `graphs` attribute in the JSON document')
+        raise ValueError('Did not find the `graphs` attribute in the JSON document')
     graphs = document['graphs']
     if not isinstance(graphs, typing.Sequence):
-        raise ValueError(f'`graphs` JSON attribute is not a sequence')
+        raise ValueError('`graphs` JSON attribute is not a sequence')
     if len(graphs) < 1:
-        raise ValueError(f'`graphs` JSON attribute is empty')
+        raise ValueError('`graphs` JSON attribute is empty')
     elif len(graphs) == 1:
         # The happy path
         return graphs[0]
     else:
         raise ValueError(f'We expect exactly 1 graph but there are {len(graphs)} graphs in the JSON document')
 
 
-def extract_terms(nodes: typing.Iterable[dict],
-                  term_factory: ObographsTermFactory[MINIMAL_TERM]) \
-        -> typing.Tuple[typing.Mapping[str, TermId], typing.Sequence[MINIMAL_TERM]]:
+def extract_terms(
+    nodes: typing.Iterable[dict],
+    term_factory: ObographsTermFactory[MINIMAL_TERM],
+    prefixes_of_interest: typing.Set[str],
+) -> typing.Tuple[typing.Mapping[str, TermId], typing.Sequence[MINIMAL_TERM]]:
     curie_to_term: typing.Dict[str, TermId] = {}
-    terms: typing.List[Term] = []
+    terms: typing.List[MINIMAL_TERM] = []
     for data in nodes:
         # 1) map data to `Node`
-        node: Node = create_node(data)
+        node: typing.Optional[Node] = create_node(data)
 
         # 2) we only work with class Nodes
         if not node or node.type != NodeType.CLASS:
@@ -90,11 +116,11 @@ def extract_terms(nodes: typing.Iterable[dict],
         # 3) check if PURL is OK
         curie = extract_curie_from_purl(node.id)
         if not curie:
-            logger.debug(f'Unable to parse PURL {node.id} into CURIE')
+            logger.debug('Unable to extract CURIE from PURL %s', node.id)
             continue
         term_id = TermId.from_curie(curie)
-        if term_id.prefix != 'HP':
-            logger.debug(f'Skipping non-HPO term {term_id.value}')
+        if term_id.prefix not in prefixes_of_interest:
+            logger.debug('Skipping not a term of interest %s', term_id.value)
             continue
 
         curie_to_term[curie] = term_id
@@ -107,32 +133,44 @@ def extract_terms(nodes: typing.Iterable[dict],
     return curie_to_term, terms
 
 
-def create_edge_list(edges: typing.Iterable[typing.Dict[str, str]],
-                     curie_to_termid: typing.Mapping[str, TermId]) -> typing.List[typing.Tuple[TermId, TermId]]:
+def create_edge_list(
+        edges: typing.Iterable[typing.Dict[str, str]],
+        curie_to_termid: typing.Mapping[str, TermId],
+) -> typing.List[typing.Tuple[TermId, TermId]]:
     edge_list: typing.List[typing.Tuple[TermId, TermId]] = []
     for data in edges:
         edge: Edge = create_edge(data)
 
         # We only care about `is_a` relationships.
         if edge.pred != 'is_a':
-            logger.debug(f'Skipping edge with pred {edge.pred}!=\'is_a\'')
+            logger.debug('Skipping edge with pred %s!=\'is_a\'', edge.pred)
             continue
 
         # Get source and destination.
+        src_curie = extract_curie_from_purl(edge.sub)
+        if src_curie is None:
+            logger.warning('Unable to extract CURIE from sub PURL %s', edge.sub)
+            continue
         try:
-            curie = extract_curie_from_purl(edge.sub)
-            src: TermId = curie_to_termid[curie]
+            src: TermId = curie_to_termid[src_curie]
         except KeyError:
-            logger.warning(f'Source edge {edge.sub} was not found in terms')
-            # TODO - maybe we should even abort?
+            logger.debug(
+                'Skipping edge %s %s %s because subject %s was was not found in terms', 
+                edge.sub, edge.pred, edge.obj, edge.sub,
+            )
             continue
 
+        dest_curie = extract_curie_from_purl(edge.obj)
+        if dest_curie is None:
+            logger.warning('Unable to extract CURIE from obj PURL %s', edge.obj)
+            continue
         try:
-            curie: str = extract_curie_from_purl(edge.obj)
-            dest: TermId = curie_to_termid[curie]
+            dest: TermId = curie_to_termid[dest_curie]
         except KeyError:
-            logger.warning(f'Destination edge {edge.obj} was not found in terms')
-            # TODO - maybe we should even abort?
+            logger.debug(
+                'Skipping edge %s %s %s because object %s was was not found in terms', 
+                edge.sub, edge.pred, edge.obj, edge.obj,
+            )
             continue
 
         edge_list.append((src, dest))
@@ -158,7 +196,7 @@ def extract_ontology_version(meta: dict) -> typing.Optional[str]:
         if match:
             return match.group('date')
         else:
-            logger.debug(f'Could not find a date pattern in version {meta["version"]}')
+            logger.debug('Could not find a date pattern in version %s', meta["version"])
             return None
     elif 'basicPropertyValues' in meta:
         for bpv in meta['basicPropertyValues']:
@@ -171,8 +209,8 @@ def extract_ontology_version(meta: dict) -> typing.Optional[str]:
                     # }
                     return bpv['val']
 
-        logger.debug(f'Could not find basic property value with the version info')
+        logger.debug('Could not find basic property value with the version info')
         return None
     else:
-        logger.debug(f'Could not determine the ontology version')
+        logger.debug('Could not determine the ontology version')
         return None
diff --git a/src/hpotk/py.typed b/src/hpotk/py.typed
diff --git a/src/hpotk/store/__init__.py b/src/hpotk/store/__init__.py
@@ -11,6 +11,16 @@
 >>> hpo = store.load_minimal_hpo(release='v2023-10-09')
 >>> hpo.version
 '2023-10-09'
+
+or fetch the *latest* release by omitting the `release` argument:
+
+>>> latest_hpo = store.load_minimal_hpo()  # doctest: +SKIP
+>>> latest_hpo.version  # doctest: +SKIP
+'2024-04-26'
+
+.. note::
+
+  The release `2024-04-26` is the latest release as of June 2024 when this documentation was written.
 """
 
 from ._api import OntologyType, OntologyStore, RemoteOntologyService, OntologyReleaseService