Commit

feat: add therapy-normalizer data sources (#13)
jsstevenson authored Nov 16, 2023
1 parent f2c5c1c commit bd3dce2
Showing 29 changed files with 13,629 additions and 9 deletions.
9 changes: 5 additions & 4 deletions pyproject.toml
@@ -86,14 +86,15 @@ docstring-quotes = "double"

 [tool.ruff.per-file-ignores]
 # ANN001 - missing-type-function-argument
-# ANN2 - missing-return-type
-# ANN201 - Missing type annotation
 # ANN102 - missing-type-cls
-# D103 - Missing docstring in public function
+# ANN2 - missing-return-type
+# ANN201 - missing-return-type-undocumented-public-function
+# D103 - undocumented-public-function
 # F821 - undefined-name
 # F401 - unused-import
-# I001 - Import block unsorted or unformatted
+# I001 - unsorted-imports
 # N805 - invalid-first-argument-name-for-method
 "tests/*" = ["ANN001", "ANN102", "ANN2"]
 "*__init__.py" = ["F401"]
 "docs/source/conf.py" = ["D100", "I001", "D103", "ANN201", "ANN001"]
+"src/wags_tails/base_source.py" = ["ANN102"]
12 changes: 12 additions & 0 deletions src/wags_tails/__init__.py
@@ -1,19 +1,31 @@
"""Data acquisition tools for Wagnerds."""
from .base_source import DataSource, RemoteDataError
from .chembl import ChemblData
from .chemidplus import ChemIDplusData
from .custom import CustomData
from .do import DoData
from .drugbank import DrugBankData
from .drugsatfda import DrugsAtFdaData
from .guide_to_pharmacology import GToPLigandData
from .hemonc import HemOncData
from .mondo import MondoData
from .ncit import NcitData
from .oncotree import OncoTreeData
from .rxnorm import RxNormData

__all__ = [
"DataSource",
"RemoteDataError",
"ChemblData",
"ChemIDplusData",
"CustomData",
"DoData",
"DrugBankData",
"DrugsAtFdaData",
"GToPLigandData",
"HemOncData",
"MondoData",
"NcitData",
"OncoTreeData",
"RxNormData",
]
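With the new classes re-exported at the package root, downstream code such as therapy-normalizer can import every source directly from `wags_tails`. A minimal usage sketch, assuming the no-argument constructors and shared `get_latest()` interface provided by the `DataSource` base class:

from wags_tails import ChemIDplusData, DrugBankData

# Each source exposes the common get_latest() interface from DataSource.
chemidplus_file, chemidplus_version = ChemIDplusData().get_latest()
drugbank_file, drugbank_version = DrugBankData().get_latest()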
52 changes: 52 additions & 0 deletions src/wags_tails/chemidplus.py
@@ -0,0 +1,52 @@
"""Provide source fetching for ChemIDplus."""
import re
from datetime import datetime
from pathlib import Path

import requests

from .base_source import DataSource, RemoteDataError
from .utils.downloads import download_http
from .utils.versioning import DATE_VERSION_PATTERN


class ChemIDplusData(DataSource):
"""Provide access to ChemIDplus database."""

_src_name = "chemidplus"
_filetype = "xml"

@staticmethod
def _get_latest_version() -> str:
"""Retrieve latest version value
:return: latest release value
:raise RemoteDataError: if unable to parse version number from data file
"""
latest_url = "https://ftp.nlm.nih.gov/projects/chemidlease/CurrentChemID.xml"
headers = {"Range": "bytes=0-300"} # leave some slack to capture date
r = requests.get(latest_url, headers=headers)
r.raise_for_status()
result = re.search(r" date=\"([0-9]{4}-[0-9]{2}-[0-9]{2})\">", r.text)
if result:
raw_date = result.groups()[0]
return datetime.strptime(raw_date, "%Y-%m-%d").strftime(
DATE_VERSION_PATTERN
)
else:
raise RemoteDataError(
"Unable to parse latest ChemIDplus version number from partial access to latest file"
)

def _download_data(self, version: str, outfile: Path) -> None:
"""Download data file to specified location. ChemIDplus data is no longer
updated, so versioning is irrelevant.
:param version: version to acquire
:param outfile: location and filename for final data file
"""
download_http(
"https://ftp.nlm.nih.gov/projects/chemidlease/CurrentChemID.xml",
outfile,
tqdm_params=self._tqdm_params,
)
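The version check above avoids downloading the full ChemIDplus XML just to learn its release date: an HTTP `Range` header requests only the first ~300 bytes, and the date is scraped from the root element's `date` attribute. The same idea as a standalone sketch (error handling trimmed; the URL is the one used above):

import re

import requests

resp = requests.get(
    "https://ftp.nlm.nih.gov/projects/chemidlease/CurrentChemID.xml",
    headers={"Range": "bytes=0-300"},  # partial fetch: enough to capture the date attribute
    timeout=30,
)
resp.raise_for_status()
match = re.search(r' date="(\d{4}-\d{2}-\d{2})">', resp.text)
print(match.group(1) if match else "date not found")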
100 changes: 100 additions & 0 deletions src/wags_tails/drugbank.py
@@ -0,0 +1,100 @@
"""Provide source fetching for DrugBank."""
import logging
from pathlib import Path
from typing import Tuple

import requests

from .base_source import DataSource, RemoteDataError
from .utils.downloads import download_http, handle_zip
from .utils.versioning import parse_file_version

_logger = logging.getLogger(__name__)


class DrugBankData(DataSource):
"""Provide access to DrugBank database."""

_src_name = "drugbank"
_filetype = "csv"

@staticmethod
def _get_latest_version() -> Tuple[str, str]:
"""Retrieve latest version value
:return: latest release value and base download URL
:raise RemoteDataError: if unable to parse version number from releases API
"""
releases_url = "https://go.drugbank.com/releases.json"
r = requests.get(releases_url)
r.raise_for_status()
try:
latest = r.json()[0]
return latest["version"], latest["url"]
except (KeyError, IndexError):
raise RemoteDataError(
"Unable to parse latest DrugBank version number from releases API endpoint"
)

def _get_latest_local_file(self, glob: str) -> Tuple[Path, str]:
"""Get most recent locally-available file. DrugBank uses versioning that isn't
easily sortable by default so we have to use some extra magic.
:param glob: file pattern to match against
:return: Path to most recent file, and its version
:raise FileNotFoundError: if no local data is available
"""
_logger.debug(f"Getting local match against pattern {glob}...")
file_version_pairs = []
for file in self.data_dir.glob(glob):
version = parse_file_version(file, r"drugbank_([\d\.]+).csv")
formatted_version = [int(digits) for digits in version.split(".")]
file_version_pairs.append((file, version, formatted_version))
files = list(sorted(file_version_pairs, key=lambda p: p[2]))
if not files:
raise FileNotFoundError("No source data found for DrugBank")
latest = files[-1]
_logger.debug(f"Returning {latest[0]} as most recent locally-available file.")
return latest[0], latest[1]

def _download_data(self, url: str, outfile: Path) -> None:
"""Download data file to specified location.
:param url: location of data to fetch
:param outfile: location and filename for final data file
"""
download_http(
url,
outfile,
handler=handle_zip,
tqdm_params=self._tqdm_params,
)

def get_latest(
self, from_local: bool = False, force_refresh: bool = False
) -> Tuple[Path, str]:
"""Get path to latest version of data, and its version value
:param from_local: if True, use latest available local file
:param force_refresh: if True, fetch and return data from remote regardless of
whether a local copy is present
:return: Path to location of data, and version value of it
:raise ValueError: if both ``force_refresh`` and ``from_local`` are True
"""
if force_refresh and from_local:
raise ValueError("Cannot set both `force_refresh` and `from_local`")

if from_local:
file_path, version = self._get_latest_local_file("drugbank_*.csv")
return file_path, version

latest_version, latest_url_base = self._get_latest_version()
latest_url = f"{latest_url_base}/downloads/all-drugbank-vocabulary"
latest_file = self.data_dir / f"drugbank_{latest_version}.csv"
if (not force_refresh) and latest_file.exists():
_logger.debug(
f"Found existing file, {latest_file.name}, matching latest version {latest_version}."
)
return latest_file, latest_version
self._download_data(latest_url, latest_file)
return latest_file, latest_version
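The custom sort in `_get_latest_local_file` exists because dotted DrugBank release numbers compare incorrectly as plain strings; splitting each version into integers restores true release order. A small illustration with made-up version strings:

versions = ["5.1.9", "5.1.10", "5.1.2"]

# Lexicographic string sort puts "5.1.10" first, which is wrong.
print(sorted(versions))  # ['5.1.10', '5.1.2', '5.1.9']

# Sorting on integer components matches how the class orders local files.
print(sorted(versions, key=lambda v: [int(part) for part in v.split(".")]))
# ['5.1.2', '5.1.9', '5.1.10']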
49 changes: 49 additions & 0 deletions src/wags_tails/drugsatfda.py
@@ -0,0 +1,49 @@
"""Provide source fetching for Drugs@FDA."""
import datetime
from pathlib import Path

import requests

from .base_source import DataSource, RemoteDataError
from .utils.downloads import download_http, handle_zip
from .utils.versioning import DATE_VERSION_PATTERN


class DrugsAtFdaData(DataSource):
"""Provide access to Drugs@FDA database."""

_src_name = "drugsatfda"
_filetype = "json"

@staticmethod
def _get_latest_version() -> str:
"""Retrieve latest version value
:return: latest release value
:raise RemoteDataError: if unable to parse version number from releases API
"""
r = requests.get("https://api.fda.gov/download.json")
r.raise_for_status()
r_json = r.json()
try:
date = r_json["results"]["drug"]["drugsfda"]["export_date"]
except KeyError:
raise RemoteDataError(
"Unable to parse latest DrugBank version number from releases API endpoint"
)
return datetime.datetime.strptime(date, "%Y-%m-%d").strftime(
DATE_VERSION_PATTERN
)

def _download_data(self, version: str, outfile: Path) -> None:
"""Download latest data file to specified location.
:param version: version to acquire
:param outfile: location and filename for final data file
"""
download_http(
"https://download.open.fda.gov/drug/drugsfda/drug-drugsfda-0001-of-0001.json.zip",
outfile,
handler=handle_zip,
tqdm_params=self._tqdm_params,
)
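Because the remote export ships as a single zipped JSON file, `_download_data` passes `handle_zip` to `download_http`, which (per its use here) unpacks the archive to the target path. A minimal usage sketch, assuming the default data directory handled by the base class:

from wags_tails import DrugsAtFdaData

drugsatfda = DrugsAtFdaData()

# First call downloads and unpacks the openFDA export; later calls reuse the
# dated local copy unless force_refresh=True is passed.
json_path, version = drugsatfda.get_latest()
print(json_path, version)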
112 changes: 112 additions & 0 deletions src/wags_tails/guide_to_pharmacology.py
@@ -0,0 +1,112 @@
"""Provide source fetching for Guide To Pharmacology."""
import logging
import re
from pathlib import Path
from typing import NamedTuple, Tuple

import requests

from .base_source import DataSource, RemoteDataError
from .utils.downloads import download_http
from .utils.storage import get_latest_local_file
from .utils.versioning import parse_file_version

_logger = logging.getLogger(__name__)


class GtoPLigandPaths(NamedTuple):
"""Container for GuideToPharmacology file paths."""

ligands: Path
ligand_id_mapping: Path


class GToPLigandData(DataSource):
"""Provide access to Guide to Pharmacology data."""

_src_name = "guidetopharmacology"
_filetype = "tsv"

@staticmethod
def _get_latest_version() -> str:
"""Retrieve latest version value
:return: latest release value
        :raise RemoteDataError: if unable to parse version number from homepage HTML
"""
r = requests.get("https://www.guidetopharmacology.org/")
r.raise_for_status()
r_text = r.text.split("\n")
pattern = re.compile(r"Current Release Version (\d{4}\.\d) \(.*\)")
for line in r_text:
if "Current Release Version" in line:
matches = re.findall(pattern, line.strip())
if matches:
return matches[0]
else:
raise RemoteDataError(
"Unable to parse latest Guide to Pharmacology version number homepage HTML."
)

def _download_data(self, file_paths: GtoPLigandPaths) -> None:
"""Perform file downloads.
:param file_paths: locations to save files at
"""
download_http(
"https://www.guidetopharmacology.org/DATA/ligands.tsv",
file_paths.ligands,
tqdm_params=self._tqdm_params,
)
download_http(
"https://www.guidetopharmacology.org/DATA/ligand_id_mapping.tsv",
file_paths.ligand_id_mapping,
tqdm_params=self._tqdm_params,
)

def get_latest(
self, from_local: bool = False, force_refresh: bool = False
) -> Tuple[GtoPLigandPaths, str]:
"""Get path to latest version of data, and its version value
:param from_local: if True, use latest available local file
:param force_refresh: if True, fetch and return data from remote regardless of
whether a local copy is present
:return: Paths to data, and version value of it
:raise ValueError: if both ``force_refresh`` and ``from_local`` are True
"""
if force_refresh and from_local:
raise ValueError("Cannot set both `force_refresh` and `from_local`")

if from_local:
ligands_path = get_latest_local_file(self.data_dir, "gtop_ligands_*.tsv")
ligand_id_mapping_path = get_latest_local_file(
self.data_dir, "gtop_ligand_id_mapping_*.tsv"
)
file_paths = GtoPLigandPaths(
ligands=ligands_path, ligand_id_mapping=ligand_id_mapping_path
)
return file_paths, parse_file_version(
ligands_path, r"gtop_ligands_(\d{4}\.\d+).tsv"
)

latest_version = self._get_latest_version()
ligands_path = self.data_dir / f"gtop_ligands_{latest_version}.tsv"
ligand_id_mapping_path = (
self.data_dir / f"gtop_ligand_id_mapping_{latest_version}.tsv"
)
file_paths = GtoPLigandPaths(
ligands=ligands_path, ligand_id_mapping=ligand_id_mapping_path
)
if not force_refresh:
if ligands_path.exists() and ligand_id_mapping_path.exists():
_logger.debug(
f"Found existing files, {file_paths}, matching latest version {latest_version}."
)
return file_paths, latest_version
elif ligands_path.exists() or ligand_id_mapping_path.exists():
_logger.warning(
f"Existing files, {file_paths}, not all available -- attempting full download."
)
self._download_data(file_paths)
return file_paths, latest_version
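Unlike the single-file sources, this class manages two TSVs and returns their locations together in the `GtoPLigandPaths` named tuple. A minimal usage sketch, assuming the default data directory:

from wags_tails import GToPLigandData

gtop = GToPLigandData()
paths, version = gtop.get_latest()

print(version)                   # e.g. "2023.2", as scraped from the homepage
print(paths.ligands)             # .../gtop_ligands_<version>.tsv
print(paths.ligand_id_mapping)   # .../gtop_ligand_id_mapping_<version>.tsv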