Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add therapy-normalizer data sources #13

Merged
merged 23 commits
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -86,14 +86,15 @@ docstring-quotes = "double"

[tool.ruff.per-file-ignores]
# ANN001 - missing-type-function-argument
# ANN2 - missing-return-type
# ANN201 - Missing type annotation
# ANN102 - missing-type-cls
# D103 - Missing docstring in public function
# ANN2 - missing-return-type
# ANN201 - missing-return-type-undocumented-public-function
# D103 - undocumented-public-function
# F821 - undefined-name
# F401 - unused-import
# I001 - Import block unsorted or unformatted
# I001 - unsorted-imports
# N805 - invalid-first-argument-name-for-method
"tests/*" = ["ANN001", "ANN102", "ANN2"]
"*__init__.py" = ["F401"]
"docs/source/conf.py" = ["D100", "I001", "D103", "ANN201", "ANN001"]
"src/wags_tails/base_source.py" = ["ANN102"]
12 changes: 12 additions & 0 deletions src/wags_tails/__init__.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,31 @@
"""Data acquisition tools for Wagnerds."""
from .base_source import DataSource, RemoteDataError
from .chembl import ChemblData
from .chemidplus import ChemIDplusData
from .custom import CustomData
from .do import DoData
from .drugbank import DrugBankData
from .drugsatfda import DrugsAtFdaData
from .guide_to_pharmacology import GToPLigandData
from .hemonc import HemOncData
from .mondo import MondoData
from .ncit import NcitData
from .oncotree import OncoTreeData
from .rxnorm import RxNormData

# Public API of the package: shared base types followed by one data-source
# class per supported upstream database.
__all__ = [
    "DataSource",
    "RemoteDataError",
    "ChemblData",
    "ChemIDplusData",
    "CustomData",
    "DoData",
    "DrugBankData",
    "DrugsAtFdaData",
    "GToPLigandData",
    "HemOncData",
    "MondoData",
    "NcitData",
    "OncoTreeData",
    "RxNormData",
]
52 changes: 52 additions & 0 deletions src/wags_tails/chemidplus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""Provide source fetching for ChemIDplus."""
import re
from datetime import datetime
from pathlib import Path

import requests

from .base_source import DataSource, RemoteDataError
from .utils.downloads import download_http
from .utils.versioning import DATE_VERSION_PATTERN


class ChemIDplusData(DataSource):
    """Provide access to ChemIDplus database."""

    _src_name = "chemidplus"
    _filetype = "xml"

    @staticmethod
    def _get_latest_version() -> str:
        """Retrieve latest version value.

        Performs a ranged GET against the current data file and parses the release
        date out of the first few hundred bytes of the XML header.

        :return: latest release value, formatted per ``DATE_VERSION_PATTERN``
        :raise RemoteDataError: if unable to parse version number from data file
        """
        latest_url = "https://ftp.nlm.nih.gov/projects/chemidlease/CurrentChemID.xml"
        headers = {"Range": "bytes=0-300"}  # leave some slack to capture date
        # timeout guards against an unresponsive server hanging the caller
        r = requests.get(latest_url, headers=headers, timeout=30)
        r.raise_for_status()
        result = re.search(r" date=\"([0-9]{4}-[0-9]{2}-[0-9]{2})\">", r.text)
        if not result:
            raise RemoteDataError(
                "Unable to parse latest ChemIDplus version number from partial access to latest file"
            )
        raw_date = result.groups()[0]
        return datetime.strptime(raw_date, "%Y-%m-%d").strftime(DATE_VERSION_PATTERN)

    def _download_data(self, version: str, outfile: Path) -> None:
        """Download data file to specified location. ChemIDplus data is no longer
        updated, so versioning is irrelevant.

        :param version: version to acquire
        :param outfile: location and filename for final data file
        """
        download_http(
            "https://ftp.nlm.nih.gov/projects/chemidlease/CurrentChemID.xml",
            outfile,
            tqdm_params=self._tqdm_params,
        )
100 changes: 100 additions & 0 deletions src/wags_tails/drugbank.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
"""Provide source fetching for DrugBank."""
import logging
from pathlib import Path
from typing import Tuple

import requests

from .base_source import DataSource, RemoteDataError
from .utils.downloads import download_http, handle_zip
from .utils.versioning import parse_file_version

_logger = logging.getLogger(__name__)


class DrugBankData(DataSource):
    """Provide access to DrugBank database."""

    _src_name = "drugbank"
    _filetype = "csv"

    @staticmethod
    def _get_latest_version() -> Tuple[str, str]:
        """Retrieve latest version value.

        :return: latest release value and base download URL
        :raise RemoteDataError: if unable to parse version number from releases API
        """
        releases_url = "https://go.drugbank.com/releases.json"
        # timeout guards against an unresponsive server hanging the caller
        r = requests.get(releases_url, timeout=30)
        r.raise_for_status()
        try:
            latest = r.json()[0]
            return latest["version"], latest["url"]
        except (KeyError, IndexError) as e:
            # chain the original exception for easier debugging
            raise RemoteDataError(
                "Unable to parse latest DrugBank version number from releases API endpoint"
            ) from e

    def _get_latest_local_file(self, glob: str) -> Tuple[Path, str]:
        """Get most recent locally-available file. DrugBank uses versioning that isn't
        easily sortable by default so we have to use some extra magic.

        :param glob: file pattern to match against
        :return: Path to most recent file, and its version
        :raise FileNotFoundError: if no local data is available
        """
        _logger.debug("Getting local match against pattern %s...", glob)
        file_version_pairs = []
        for file in self.data_dir.glob(glob):
            version = parse_file_version(file, r"drugbank_([\d\.]+)\.csv")
            # compare dotted versions numerically, so e.g. "5.1.10" > "5.1.9"
            version_sort_key = [int(digits) for digits in version.split(".")]
            file_version_pairs.append((file, version, version_sort_key))
        if not file_version_pairs:
            raise FileNotFoundError("No source data found for DrugBank")
        latest = max(file_version_pairs, key=lambda p: p[2])
        _logger.debug("Returning %s as most recent locally-available file.", latest[0])
        return latest[0], latest[1]

    def _download_data(self, url: str, outfile: Path) -> None:
        """Download data file to specified location.

        :param url: location of data to fetch
        :param outfile: location and filename for final data file
        """
        download_http(
            url,
            outfile,
            handler=handle_zip,
            tqdm_params=self._tqdm_params,
        )

    def get_latest(
        self, from_local: bool = False, force_refresh: bool = False
    ) -> Tuple[Path, str]:
        """Get path to latest version of data, and its version value.

        :param from_local: if True, use latest available local file
        :param force_refresh: if True, fetch and return data from remote regardless of
            whether a local copy is present
        :return: Path to location of data, and version value of it
        :raise ValueError: if both ``force_refresh`` and ``from_local`` are True
        """
        if force_refresh and from_local:
            raise ValueError("Cannot set both `force_refresh` and `from_local`")

        if from_local:
            return self._get_latest_local_file("drugbank_*.csv")

        latest_version, latest_url_base = self._get_latest_version()
        latest_url = f"{latest_url_base}/downloads/all-drugbank-vocabulary"
        latest_file = self.data_dir / f"drugbank_{latest_version}.csv"
        if (not force_refresh) and latest_file.exists():
            _logger.debug(
                "Found existing file, %s, matching latest version %s.",
                latest_file.name,
                latest_version,
            )
            return latest_file, latest_version
        self._download_data(latest_url, latest_file)
        return latest_file, latest_version
49 changes: 49 additions & 0 deletions src/wags_tails/drugsatfda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""Provide source fetching for Drugs@FDA."""
import datetime
from pathlib import Path

import requests

from .base_source import DataSource, RemoteDataError
from .utils.downloads import download_http, handle_zip
from .utils.versioning import DATE_VERSION_PATTERN


class DrugsAtFdaData(DataSource):
    """Provide access to Drugs@FDA database."""

    _src_name = "drugsatfda"
    _filetype = "json"

    @staticmethod
    def _get_latest_version() -> str:
        """Retrieve latest version value.

        :return: latest release value, formatted per ``DATE_VERSION_PATTERN``
        :raise RemoteDataError: if unable to parse version number from releases API
        """
        # timeout guards against an unresponsive server hanging the caller
        r = requests.get("https://api.fda.gov/download.json", timeout=30)
        r.raise_for_status()
        r_json = r.json()
        try:
            date = r_json["results"]["drug"]["drugsfda"]["export_date"]
        except KeyError as e:
            # fixed: message previously referenced DrugBank by mistake
            raise RemoteDataError(
                "Unable to parse latest Drugs@FDA version number from releases API endpoint"
            ) from e
        return datetime.datetime.strptime(date, "%Y-%m-%d").strftime(
            DATE_VERSION_PATTERN
        )

    def _download_data(self, version: str, outfile: Path) -> None:
        """Download latest data file to specified location.

        The download URL always points at the current data dump, so ``version`` is
        not used to construct it.

        :param version: version to acquire
        :param outfile: location and filename for final data file
        """
        download_http(
            "https://download.open.fda.gov/drug/drugsfda/drug-drugsfda-0001-of-0001.json.zip",
            outfile,
            handler=handle_zip,
            tqdm_params=self._tqdm_params,
        )
112 changes: 112 additions & 0 deletions src/wags_tails/guide_to_pharmacology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""Provide source fetching for Guide To Pharmacology."""
import logging
import re
from pathlib import Path
from typing import NamedTuple, Tuple

import requests

from .base_source import DataSource, RemoteDataError
from .utils.downloads import download_http
from .utils.storage import get_latest_local_file
from .utils.versioning import parse_file_version

_logger = logging.getLogger(__name__)


class GtoPLigandPaths(NamedTuple):
    """Container for GuideToPharmacology file paths."""

    # path to the ligands TSV file
    ligands: Path
    # path to the ligand ID mapping TSV file
    ligand_id_mapping: Path


class GToPLigandData(DataSource):
    """Provide access to Guide to Pharmacology data."""

    _src_name = "guidetopharmacology"
    _filetype = "tsv"

    @staticmethod
    def _get_latest_version() -> str:
        """Retrieve latest version value.

        :return: latest release value, e.g. ``"2023.2"``
        :raise RemoteDataError: if unable to parse version number from homepage HTML
        """
        # timeout guards against an unresponsive server hanging the caller
        r = requests.get("https://www.guidetopharmacology.org/", timeout=30)
        r.raise_for_status()
        pattern = re.compile(r"Current Release Version (\d{4}\.\d) \(.*\)")
        for line in r.text.split("\n"):
            if "Current Release Version" in line:
                matches = re.findall(pattern, line.strip())
                if matches:
                    return matches[0]
        # fixed: previously the function could fall off the end and implicitly
        # return None when no line mentioned the release version at all
        raise RemoteDataError(
            "Unable to parse latest Guide to Pharmacology version number from homepage HTML."
        )

    def _download_data(self, file_paths: GtoPLigandPaths) -> None:
        """Perform file downloads.

        :param file_paths: locations to save files at
        """
        download_http(
            "https://www.guidetopharmacology.org/DATA/ligands.tsv",
            file_paths.ligands,
            tqdm_params=self._tqdm_params,
        )
        download_http(
            "https://www.guidetopharmacology.org/DATA/ligand_id_mapping.tsv",
            file_paths.ligand_id_mapping,
            tqdm_params=self._tqdm_params,
        )

    def get_latest(
        self, from_local: bool = False, force_refresh: bool = False
    ) -> Tuple[GtoPLigandPaths, str]:
        """Get paths to latest version of data, and its version value.

        :param from_local: if True, use latest available local files
        :param force_refresh: if True, fetch and return data from remote regardless of
            whether a local copy is present
        :return: Paths to data, and version value of it
        :raise ValueError: if both ``force_refresh`` and ``from_local`` are True
        """
        if force_refresh and from_local:
            raise ValueError("Cannot set both `force_refresh` and `from_local`")

        if from_local:
            ligands_path = get_latest_local_file(self.data_dir, "gtop_ligands_*.tsv")
            ligand_id_mapping_path = get_latest_local_file(
                self.data_dir, "gtop_ligand_id_mapping_*.tsv"
            )
            file_paths = GtoPLigandPaths(
                ligands=ligands_path, ligand_id_mapping=ligand_id_mapping_path
            )
            return file_paths, parse_file_version(
                ligands_path, r"gtop_ligands_(\d{4}\.\d+).tsv"
            )

        latest_version = self._get_latest_version()
        ligands_path = self.data_dir / f"gtop_ligands_{latest_version}.tsv"
        ligand_id_mapping_path = (
            self.data_dir / f"gtop_ligand_id_mapping_{latest_version}.tsv"
        )
        file_paths = GtoPLigandPaths(
            ligands=ligands_path, ligand_id_mapping=ligand_id_mapping_path
        )
        if not force_refresh:
            if ligands_path.exists() and ligand_id_mapping_path.exists():
                _logger.debug(
                    "Found existing files, %s, matching latest version %s.",
                    file_paths,
                    latest_version,
                )
                return file_paths, latest_version
            elif ligands_path.exists() or ligand_id_mapping_path.exists():
                _logger.warning(
                    "Existing files, %s, not all available -- attempting full download.",
                    file_paths,
                )
        self._download_data(file_paths)
        return file_paths, latest_version
Loading