-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add therapy-normalizer data sources (#13)
- Loading branch information
1 parent
f2c5c1c
commit bd3dce2
Showing
29 changed files
with
13,629 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,31 @@ | ||
"""Data acquisition tools for Wagnerds.""" | ||
from .base_source import DataSource, RemoteDataError | ||
from .chembl import ChemblData | ||
from .chemidplus import ChemIDplusData | ||
from .custom import CustomData | ||
from .do import DoData | ||
from .drugbank import DrugBankData | ||
from .drugsatfda import DrugsAtFdaData | ||
from .guide_to_pharmacology import GToPLigandData | ||
from .hemonc import HemOncData | ||
from .mondo import MondoData | ||
from .ncit import NcitData | ||
from .oncotree import OncoTreeData | ||
from .rxnorm import RxNormData | ||
|
||
# Public API of the package: the base source class, its error type, and one
# data class per source (listed in the same order as the imports above).
__all__ = [
    "DataSource",
    "RemoteDataError",
    "ChemblData",
    "ChemIDplusData",
    "CustomData",
    "DoData",
    "DrugBankData",
    "DrugsAtFdaData",
    "GToPLigandData",
    "HemOncData",
    "MondoData",
    "NcitData",
    "OncoTreeData",
    "RxNormData",
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
"""Provide source fetching for ChemIDplus.""" | ||
import re | ||
from datetime import datetime | ||
from pathlib import Path | ||
|
||
import requests | ||
|
||
from .base_source import DataSource, RemoteDataError | ||
from .utils.downloads import download_http | ||
from .utils.versioning import DATE_VERSION_PATTERN | ||
|
||
|
||
class ChemIDplusData(DataSource):
    """Provide access to ChemIDplus database."""

    _src_name = "chemidplus"
    _filetype = "xml"

    @staticmethod
    def _get_latest_version() -> str:
        """Retrieve latest version value.

        Only the first bytes of the remote XML file are requested (via an HTTP
        ``Range`` header); the version is read from the ``date="YYYY-MM-DD"``
        attribute found in that fragment.

        :return: latest release value, formatted with ``DATE_VERSION_PATTERN``
        :raise RemoteDataError: if unable to parse version number from data file
        """
        latest_url = "https://ftp.nlm.nih.gov/projects/chemidlease/CurrentChemID.xml"
        headers = {"Range": "bytes=0-300"}  # leave some slack to capture date
        # requests applies no timeout by default; without one, an unresponsive
        # server would block this call indefinitely.
        r = requests.get(latest_url, headers=headers, timeout=30)
        r.raise_for_status()
        result = re.search(r" date=\"([0-9]{4}-[0-9]{2}-[0-9]{2})\">", r.text)
        if not result:
            raise RemoteDataError(
                "Unable to parse latest ChemIDplus version number from partial access to latest file"
            )
        return datetime.strptime(result.group(1), "%Y-%m-%d").strftime(
            DATE_VERSION_PATTERN
        )

    def _download_data(self, version: str, outfile: Path) -> None:
        """Download data file to specified location. ChemIDplus data is no longer
        updated, so versioning is irrelevant.

        :param version: version to acquire (not used; the same URL is always fetched)
        :param outfile: location and filename for final data file
        """
        download_http(
            "https://ftp.nlm.nih.gov/projects/chemidlease/CurrentChemID.xml",
            outfile,
            tqdm_params=self._tqdm_params,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
"""Provide source fetching for DrugBank.""" | ||
import logging | ||
from pathlib import Path | ||
from typing import Tuple | ||
|
||
import requests | ||
|
||
from .base_source import DataSource, RemoteDataError | ||
from .utils.downloads import download_http, handle_zip | ||
from .utils.versioning import parse_file_version | ||
|
||
_logger = logging.getLogger(__name__) | ||
|
||
|
||
class DrugBankData(DataSource):
    """Provide access to DrugBank database."""

    _src_name = "drugbank"
    _filetype = "csv"

    @staticmethod
    def _get_latest_version() -> Tuple[str, str]:
        """Retrieve latest version value.

        The first entry of the releases JSON payload is used as the latest release.

        :return: latest release value and base download URL
        :raise RemoteDataError: if unable to parse version number from releases API
        """
        releases_url = "https://go.drugbank.com/releases.json"
        # requests applies no timeout by default; without one, an unresponsive
        # server would block this call indefinitely.
        r = requests.get(releases_url, timeout=30)
        r.raise_for_status()
        try:
            latest = r.json()[0]
            return latest["version"], latest["url"]
        except (KeyError, IndexError, TypeError) as e:
            # TypeError covers a payload whose shape is not a list of mappings;
            # chain the cause so the underlying parsing problem stays visible.
            raise RemoteDataError(
                "Unable to parse latest DrugBank version number from releases API endpoint"
            ) from e

    def _get_latest_local_file(self, glob: str) -> Tuple[Path, str]:
        """Get most recent locally-available file. DrugBank uses versioning that isn't
        easily sortable by default so we have to use some extra magic.

        :param glob: file pattern to match against
        :return: Path to most recent file, and its version
        :raise FileNotFoundError: if no local data is available
        """
        _logger.debug(f"Getting local match against pattern {glob}...")
        file_version_pairs = []
        for file in self.data_dir.glob(glob):
            version = parse_file_version(file, r"drugbank_([\d\.]+)\.csv")
            # Compare versions as lists of integers so that, for example,
            # "5.1.10" ranks above "5.1.9" (plain string ordering would not).
            formatted_version = [int(digits) for digits in version.split(".")]
            file_version_pairs.append((file, version, formatted_version))
        if not file_version_pairs:
            raise FileNotFoundError("No source data found for DrugBank")
        latest = sorted(file_version_pairs, key=lambda p: p[2])[-1]
        _logger.debug(f"Returning {latest[0]} as most recent locally-available file.")
        return latest[0], latest[1]

    def _download_data(self, url: str, outfile: Path) -> None:
        """Download data file to specified location.

        :param url: location of data to fetch
        :param outfile: location and filename for final data file
        """
        download_http(
            url,
            outfile,
            handler=handle_zip,
            tqdm_params=self._tqdm_params,
        )

    def get_latest(
        self, from_local: bool = False, force_refresh: bool = False
    ) -> Tuple[Path, str]:
        """Get path to latest version of data, and its version value

        :param from_local: if True, use latest available local file
        :param force_refresh: if True, fetch and return data from remote regardless of
            whether a local copy is present
        :return: Path to location of data, and version value of it
        :raise ValueError: if both ``force_refresh`` and ``from_local`` are True
        """
        if force_refresh and from_local:
            raise ValueError("Cannot set both `force_refresh` and `from_local`")

        if from_local:
            return self._get_latest_local_file("drugbank_*.csv")

        latest_version, latest_url_base = self._get_latest_version()
        latest_url = f"{latest_url_base}/downloads/all-drugbank-vocabulary"
        latest_file = self.data_dir / f"drugbank_{latest_version}.csv"
        # Reuse an existing local copy of the latest version unless a refresh
        # was explicitly requested.
        if (not force_refresh) and latest_file.exists():
            _logger.debug(
                f"Found existing file, {latest_file.name}, matching latest version {latest_version}."
            )
            return latest_file, latest_version
        self._download_data(latest_url, latest_file)
        return latest_file, latest_version
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
"""Provide source fetching for Drugs@FDA.""" | ||
import datetime | ||
from pathlib import Path | ||
|
||
import requests | ||
|
||
from .base_source import DataSource, RemoteDataError | ||
from .utils.downloads import download_http, handle_zip | ||
from .utils.versioning import DATE_VERSION_PATTERN | ||
|
||
|
||
class DrugsAtFdaData(DataSource):
    """Provide access to Drugs@FDA database."""

    _src_name = "drugsatfda"
    _filetype = "json"

    @staticmethod
    def _get_latest_version() -> str:
        """Retrieve latest version value.

        The version is derived from the ``export_date`` field listed for the
        ``drugsfda`` dataset in the openFDA download index.

        :return: latest release value, formatted with ``DATE_VERSION_PATTERN``
        :raise RemoteDataError: if unable to parse version number from releases API
        """
        # requests applies no timeout by default; without one, an unresponsive
        # server would block this call indefinitely.
        r = requests.get("https://api.fda.gov/download.json", timeout=30)
        r.raise_for_status()
        r_json = r.json()
        try:
            date = r_json["results"]["drug"]["drugsfda"]["export_date"]
        except (KeyError, TypeError) as e:
            # TypeError covers an intermediate value that is not a mapping;
            # chain the cause so the underlying parsing problem stays visible.
            raise RemoteDataError(
                "Unable to parse latest Drugs@FDA version number from releases API endpoint"
            ) from e
        return datetime.datetime.strptime(date, "%Y-%m-%d").strftime(
            DATE_VERSION_PATTERN
        )

    def _download_data(self, version: str, outfile: Path) -> None:
        """Download latest data file to specified location.

        :param version: version to acquire (not used; the same URL is always fetched)
        :param outfile: location and filename for final data file
        """
        download_http(
            "https://download.open.fda.gov/drug/drugsfda/drug-drugsfda-0001-of-0001.json.zip",
            outfile,
            handler=handle_zip,
            tqdm_params=self._tqdm_params,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
"""Provide source fetching for Guide To Pharmacology.""" | ||
import logging | ||
import re | ||
from pathlib import Path | ||
from typing import NamedTuple, Tuple | ||
|
||
import requests | ||
|
||
from .base_source import DataSource, RemoteDataError | ||
from .utils.downloads import download_http | ||
from .utils.storage import get_latest_local_file | ||
from .utils.versioning import parse_file_version | ||
|
||
_logger = logging.getLogger(__name__) | ||
|
||
|
||
class GtoPLigandPaths(NamedTuple):
    """Container for GuideToPharmacology file paths."""

    # location of the ligands TSV file
    ligands: Path
    # location of the ligand ID mapping TSV file
    ligand_id_mapping: Path
|
||
|
||
class GToPLigandData(DataSource):
    """Provide access to Guide to Pharmacology data."""

    _src_name = "guidetopharmacology"
    _filetype = "tsv"

    @staticmethod
    def _get_latest_version() -> str:
        """Retrieve latest version value.

        Scans the Guide to Pharmacology homepage HTML line by line for the
        "Current Release Version" banner and returns the first version found.

        :return: latest release value
        :raise RemoteDataError: if unable to parse version number from releases API
        """
        # requests applies no timeout by default; without one, an unresponsive
        # server would block this call indefinitely.
        r = requests.get("https://www.guidetopharmacology.org/", timeout=30)
        r.raise_for_status()
        # Accept a multi-digit release number after the dot, consistent with the
        # pattern used for local file names in ``get_latest``.
        pattern = re.compile(r"Current Release Version (\d{4}\.\d+) \(.*\)")
        for line in r.text.split("\n"):
            match = pattern.search(line)
            if match:
                return match.group(1)
        # Only give up once every line has been checked.
        raise RemoteDataError(
            "Unable to parse latest Guide to Pharmacology version number homepage HTML."
        )

    def _download_data(self, file_paths: GtoPLigandPaths) -> None:
        """Perform file downloads.

        :param file_paths: locations to save files at
        """
        download_http(
            "https://www.guidetopharmacology.org/DATA/ligands.tsv",
            file_paths.ligands,
            tqdm_params=self._tqdm_params,
        )
        download_http(
            "https://www.guidetopharmacology.org/DATA/ligand_id_mapping.tsv",
            file_paths.ligand_id_mapping,
            tqdm_params=self._tqdm_params,
        )

    def get_latest(
        self, from_local: bool = False, force_refresh: bool = False
    ) -> Tuple[GtoPLigandPaths, str]:
        """Get path to latest version of data, and its version value

        :param from_local: if True, use latest available local file
        :param force_refresh: if True, fetch and return data from remote regardless of
            whether a local copy is present
        :return: Paths to data, and version value of it
        :raise ValueError: if both ``force_refresh`` and ``from_local`` are True
        """
        if force_refresh and from_local:
            raise ValueError("Cannot set both `force_refresh` and `from_local`")

        if from_local:
            # NOTE(review): the two files are looked up independently and the
            # returned version comes from the ligands file only -- confirm that
            # the mapping file is guaranteed to share that version.
            ligands_path = get_latest_local_file(self.data_dir, "gtop_ligands_*.tsv")
            ligand_id_mapping_path = get_latest_local_file(
                self.data_dir, "gtop_ligand_id_mapping_*.tsv"
            )
            file_paths = GtoPLigandPaths(
                ligands=ligands_path, ligand_id_mapping=ligand_id_mapping_path
            )
            return file_paths, parse_file_version(
                ligands_path, r"gtop_ligands_(\d{4}\.\d+)\.tsv"
            )

        latest_version = self._get_latest_version()
        ligands_path = self.data_dir / f"gtop_ligands_{latest_version}.tsv"
        ligand_id_mapping_path = (
            self.data_dir / f"gtop_ligand_id_mapping_{latest_version}.tsv"
        )
        file_paths = GtoPLigandPaths(
            ligands=ligands_path, ligand_id_mapping=ligand_id_mapping_path
        )
        if not force_refresh:
            if ligands_path.exists() and ligand_id_mapping_path.exists():
                _logger.debug(
                    f"Found existing files, {file_paths}, matching latest version {latest_version}."
                )
                return file_paths, latest_version
            elif ligands_path.exists() or ligand_id_mapping_path.exists():
                # Only one of the pair is present: warn, then fall through to
                # download both files again.
                _logger.warning(
                    f"Existing files, {file_paths}, not all available -- attempting full download."
                )
        self._download_data(file_paths)
        return file_paths, latest_version
Oops, something went wrong.