Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use default ssl context when fetching from https #67

Merged
merged 3 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ classifiers = [
dynamic = ["version"]

dependencies = [
"numpy >= 1.23"
"numpy >= 1.23",
"certifi", # The latest version is the best.
]

[project.optional-dependencies]
Expand Down
3 changes: 2 additions & 1 deletion src/hpotk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from . import graph
from . import model
from . import ontology
from . import store
from . import util
from . import validate

Expand All @@ -18,4 +19,4 @@
from .ontology import Ontology, MinimalOntology

from .ontology.load.obographs import load_minimal_ontology, load_ontology
from .util.store import OntologyType, OntologyStore, configure_ontology_store
from .store import OntologyType, OntologyStore, configure_ontology_store
24 changes: 24 additions & 0 deletions src/hpotk/store/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""
The `hpotk.store` package provides :class:`OntologyStore` - a class for local caching of ontology data.

The ontology store should be configured using the :func:`hpotk.configure_ontology_store` function:

>>> import hpotk
>>> store = hpotk.configure_ontology_store()

The store can then be used to fetch an ontology with a given release, e.g. `v2023-10-09`:

>>> hpo = store.load_minimal_hpo(release='v2023-10-09')
>>> hpo.version
'2023-10-09'
"""

from ._api import OntologyType, OntologyStore, RemoteOntologyService
from ._github import GitHubRemoteOntologyService
from ._config import configure_ontology_store

__all__ = [
'configure_ontology_store',
'OntologyType', 'OntologyStore', 'RemoteOntologyService',
'GitHubRemoteOntologyService',
]
64 changes: 60 additions & 4 deletions src/hpotk/util/store/_api.py → src/hpotk/store/_api.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
import abc
import enum
import io
import logging
import os
import typing

from hpotk.ontology import MinimalOntology, Ontology
from hpotk.ontology.load.obographs import load_minimal_ontology, load_ontology
from hpotk.util import validate_instance


class OntologyType(enum.Enum):
Expand Down Expand Up @@ -32,19 +36,43 @@ def identifier(self) -> str:
return self._id_


class OntologyStore(metaclass=abc.ABCMeta):
class RemoteOntologyService(metaclass=abc.ABCMeta):
    """
    Abstraction over a remote location that serves ontology data.

    An implementation opens a binary stream (:class:`io.BufferedIOBase`)
    with the content of the requested `ontology_type` at the requested `release`.
    """

    @abc.abstractmethod
    def fetch_ontology(
        self, ontology_type: OntologyType, release: typing.Optional[str] = None
    ) -> io.BufferedIOBase:
        """
        Open a binary stream with the data of the `ontology_type` stored at a remote resource.

        :param ontology_type: the ontology kind to fetch, e.g. :class:`OntologyType.HPO`.
        :param release: a `str` with the ontology release to fetch
          or `None` to ask for the latest release.
        :return: a binary IO positioned for reading the ontology data.
        """


class OntologyStore:
"""
`OntologyStore` stores versions of the supported ontologies.
"""

def __init__(self, store_dir: str, remote_ontology_service: RemoteOntologyService):
    """
    Create the store.

    :param store_dir: a `str` with the path of the directory where the ontology files are kept.
    :param remote_ontology_service: the :class:`RemoteOntologyService` used for getting
      the ontology data; the argument is passed through `validate_instance` before it is stored.
    """
    self._logger = logging.getLogger(__name__)
    self._store_dir = store_dir
    checked_service = validate_instance(
        remote_ontology_service,
        RemoteOntologyService,
        'remote_ontology_service',
    )
    self._remote_ontology_service = checked_service

@abc.abstractmethod
def load_minimal_ontology(
self,
ontology_type: OntologyType,
Expand All @@ -57,7 +85,11 @@ def load_minimal_ontology(
:param release: a `str` with the ontology release tag or `None` if the latest ontology should be fetched.
:return: a minimal ontology.
"""
pass
return self._impl_load_ontology(
load_minimal_ontology,
ontology_type,
release,
)

@abc.abstractmethod
def load_ontology(
Expand All @@ -72,7 +104,11 @@ def load_ontology(
:param release: a `str` with the ontology release tag or `None` if the latest ontology should be fetched.
:return: an ontology.
"""
pass
return self._impl_load_ontology(
load_ontology,
ontology_type,
release,
)

@property
def store_dir(self) -> str:
Expand Down Expand Up @@ -107,3 +143,23 @@ def load_hpo(
:return: a :class:`hpotk.Ontology` with the HPO data.
"""
return self.load_ontology(OntologyType.HPO, release=release)

def _impl_load_ontology(
    self,
    loader_func,
    ontology_type: OntologyType,
    release: typing.Optional[str] = None,
):
    """
    Load an ontology from the local store, downloading the file first if it is not stored yet.

    The file is expected at `<store_dir>/<identifier>/<identifier lowercased>.<release>.json`.
    If the file is missing, the data is read from the remote ontology service into a temporary
    sibling file which is renamed to the final path only after the download has finished.
    An interrupted download therefore never leaves a truncated file at the final path
    where a later call would mistake it for a valid cached ontology.

    :param loader_func: a callable that takes the path of the ontology file and returns the loaded ontology.
    :param ontology_type: the desired ontology kind, e.g. :class:`OntologyType.HPO`.
    :param release: a `str` with the ontology release tag or `None` if the latest ontology should be fetched.
    :return: whatever `loader_func` returns for the stored ontology file.
    """
    fdir_ontology = os.path.join(self.store_dir, ontology_type.identifier)
    # NOTE(review): with `release=None` the file name becomes `<id>.None.json`, so the "latest"
    # ontology stays pinned to whatever was downloaded first. Resolving the actual tag
    # would need support from the remote service - confirm the intended behavior.
    fpath_ontology = os.path.join(fdir_ontology, f'{ontology_type.identifier.lower()}.{release}.json')

    # Download ontology if missing.
    if not os.path.isfile(fpath_ontology):
        os.makedirs(fdir_ontology, exist_ok=True)
        # Write into a sibling file so that the final rename stays on the same file system.
        fpath_partial = fpath_ontology + '.part'
        try:
            with self._remote_ontology_service.fetch_ontology(ontology_type, release) as response:
                with open(fpath_partial, 'wb') as fh_ontology:
                    fh_ontology.write(response.read())
            # Publish the file only after all bytes were written and the handle was closed.
            os.replace(fpath_partial, fpath_ontology)
        except BaseException:
            # Do not leave a partial download behind; the original error is re-raised below.
            try:
                os.remove(fpath_partial)
            except OSError:
                pass
            raise

        self._logger.info('Stored the ontology at %s', fpath_ontology)

    # Load the ontology
    return loader_func(fpath_ontology)
14 changes: 10 additions & 4 deletions src/hpotk/util/store/_config.py → src/hpotk/store/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,21 @@
import typing
from pathlib import Path

from ._api import OntologyStore
from ._github import GitHubOntologyStore
from ._api import OntologyStore, RemoteOntologyService
from ._github import GitHubRemoteOntologyService


def configure_ontology_store(
store_dir: typing.Optional[str] = None,
remote_ontology_service: RemoteOntologyService = GitHubRemoteOntologyService(),
) -> OntologyStore:
"""
Configure and create the default ontology store.

:param: a `str` pointing to an existing directory for caching the ontology files
:param store_dir: a `str` pointing to an existing directory for caching the ontology files
or `None` if the platform-specific default folder should be used.
:param remote_ontology_service: a :class:`RemoteOntologyService` responsible for fetching
the ontology data from a remote location if we do not have the data locally.
:returns: an :class:`OntologyStore`.
:raises: `ValueError` if something goes wrong.
"""
Expand All @@ -24,7 +27,10 @@ def configure_ontology_store(
else:
if not os.path.isdir(store_dir):
raise ValueError(f'`store_dir` must point to an existing directory')
return GitHubOntologyStore(store_dir=store_dir)
return OntologyStore(
store_dir=store_dir,
remote_ontology_service=remote_ontology_service,
)


def get_default_ontology_store_dir() -> str:
Expand Down
92 changes: 92 additions & 0 deletions src/hpotk/store/_github.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import io
import json
import logging
import ssl
import typing
from urllib.request import urlopen

import certifi

from ._api import OntologyType, RemoteOntologyService


class GitHubRemoteOntologyService(RemoteOntologyService):
    """
    A :class:`RemoteOntologyService` that reads ontology data from GitHub.

    The service downloads the Obographs JSON file attached to a repository release.
    HPO is the only ontology with configured repository coordinates as of now.
    """

    # Repository coordinates (`owner` and `repo`) of the supported ontologies.
    ONTOLOGY_CREDENTIALS = {
        OntologyType.HPO: {
            'owner': 'obophenotype',
            'repo': 'human-phenotype-ontology',
        }
    }

    def __init__(self, timeout: int = 10):
        """
        :param timeout: the timeout passed to each `urlopen` call.
        """
        self._logger = logging.getLogger(__name__)
        self._timeout = timeout
        # The SSL context uses the CA bundle shipped with `certifi`.
        self._ctx = ssl.create_default_context(cafile=certifi.where())
        self._tag_api_url = 'https://api.github.com/repos/{owner}/{repo}/tags'
        self._release_url = 'https://github.com/{owner}/{repo}/releases/download/{release}/{ontology_id}.json'

    def fetch_ontology(
        self,
        ontology_type: OntologyType,
        release: typing.Optional[str] = None,
    ) -> io.BufferedIOBase:
        """
        Open a connection for reading the release file of the `ontology_type` from GitHub.

        :param ontology_type: the desired ontology kind; it must be a key of `ONTOLOGY_CREDENTIALS`.
        :param release: a `str` with the release tag or `None` to use the greatest tag name
          reported by the GitHub tag API.
        :return: the response returned by `urlopen`.
        :raises ValueError: if the `ontology_type` has no configured credentials.
        """
        credentials = self.ONTOLOGY_CREDENTIALS.get(ontology_type)
        if credentials is None:
            raise ValueError(f'Ontology {ontology_type} not among the known ontology credentials')

        # Figure out the desired release
        if release is None:
            release = self._fetch_latest_tag_from_github(credentials)
            self._logger.debug('Using %s as the ontology release', release)

        url_fields = {
            'owner': credentials['owner'],
            'repo': credentials['repo'],
            'release': release,
            'ontology_id': ontology_type.identifier.lower(),
        }
        url = self._release_url.format(**url_fields)
        self._logger.info('Downloading ontology from %s', url)

        return urlopen(url, timeout=self._timeout, context=self._ctx)

    def _fetch_latest_tag_from_github(self, credentials: typing.Mapping[str, str]):
        """
        Get the tag names of the repository described by `credentials` and return the greatest one.
        """
        self._logger.debug('Release unset, getting the latest')
        names = self._get_tag_names(owner=credentials['owner'], repo=credentials['repo'])
        # We assume lexicographic sorting of the tags
        latest = max(names)
        return latest

    def _get_tag_names(self, owner: str, repo: str) -> typing.Iterable[str]:
        """
        Query the GitHub tag API and provide the names of the returned tags.

        :param owner: the repository owner.
        :param repo: the repository name.
        :return: a lazy iterable with the `name` field of each returned tag.
        :raises ValueError: if the API response contains no tags.
        """
        tag_url = self._tag_api_url.format(owner=owner, repo=repo)
        self._logger.debug('Pulling tag from %s', tag_url)

        with urlopen(tag_url, timeout=self._timeout, context=self._ctx) as response:
            tags = json.load(response)

        n_tags = len(tags)
        if n_tags == 0:
            raise ValueError('No tags could be fetched from GitHub tag API')
        self._logger.debug('Fetched %d tags', n_tags)

        return (item['name'] for item in tags)
2 changes: 1 addition & 1 deletion src/hpotk/util/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from . import sort # TODO: probably not necessary
from . import sort

from ._io import looks_like_url, looks_gzipped
from ._io import open_text_io_handle, open_text_io_handle_for_reading, open_text_io_handle_for_writing
Expand Down
18 changes: 14 additions & 4 deletions src/hpotk/util/_io.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import gzip
import io
import logging
import ssl
import sys
import typing
import warnings
from urllib.request import urlopen

import certifi


def looks_like_url(file: str) -> bool:
"""
Expand Down Expand Up @@ -36,9 +39,11 @@ def _parse_encoding(encoding, logger) -> str:
return encoding


def open_text_io_handle_for_reading(fh: typing.Union[typing.IO, str],
timeout: int = 30,
encoding: str = None) -> typing.TextIO:
def open_text_io_handle_for_reading(
fh: typing.Union[typing.IO, str],
timeout: int = 30,
encoding: str = None,
) -> typing.TextIO:
"""
Open a `io.TextIO` file handle based on `fh`.

Expand All @@ -57,11 +62,16 @@ def open_text_io_handle_for_reading(fh: typing.Union[typing.IO, str],
if isinstance(fh, str):
# Can be a path to local file or URL
if looks_like_url(fh):
ctx = ssl.create_default_context(cafile=certifi.where())
logger.debug(f'Looks like a URL: {fh}')
if not isinstance(timeout, int) or timeout <= 0:
raise ValueError(f'If {fh} looks like URL then timeout {timeout} must be a positive `int`')
logger.debug(f'Downloading with timeout={timeout}s')
handle = urlopen(fh, timeout=timeout)
handle = urlopen(
fh,
timeout=timeout,
context=ctx,
)
else:
logger.debug(f'Looks like a local file: {fh}')
handle = open(fh, 'rb')
Expand Down
11 changes: 0 additions & 11 deletions src/hpotk/util/store/__init__.py

This file was deleted.

Loading
Loading