-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: add therapy-normalizer data sources #13
Merged
Merged
Changes from all commits
Commits
Show all changes
23 commits
Select commit
Hold shift + click to select a range
70f4180
add init therapy
jsstevenson 3699053
feat: add drugbank
jsstevenson c45e65b
feat: add NCIT
jsstevenson 0c5675f
feat: add drugsatfda
jsstevenson 9b82551
feat: add hemonc
jsstevenson 67c2e95
feat: add gtop ligands
jsstevenson 244bf42
clean up
jsstevenson d158d36
more cleanup
jsstevenson 69cb105
cleanup: various things
jsstevenson 3a5f7bd
feat: add custom source
jsstevenson 510f19b
more things
jsstevenson b380b28
Add disease normalizer sources
jsstevenson adddae3
more!
jsstevenson 8e565b7
final cleanup
jsstevenson ad17a9e
final refactoring
jsstevenson 985c372
Merge branch 'main' into therapy
jsstevenson f4c539b
incorporate disease updates
jsstevenson 7177520
clean up ruff notes
jsstevenson 3ebd743
cleanup
jsstevenson e66c0c5
cleanup
jsstevenson baf9c00
cleanup
jsstevenson 60c0a9b
add PR feedback
jsstevenson 37f1c16
cleanup
jsstevenson File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,31 @@ | ||
"""Data acquisition tools for Wagnerds.""" | ||
from .base_source import DataSource, RemoteDataError | ||
from .chembl import ChemblData | ||
from .chemidplus import ChemIDplusData | ||
from .custom import CustomData | ||
from .do import DoData | ||
from .drugbank import DrugBankData | ||
from .drugsatfda import DrugsAtFdaData | ||
from .guide_to_pharmacology import GToPLigandData | ||
from .hemonc import HemOncData | ||
from .mondo import MondoData | ||
from .ncit import NcitData | ||
from .oncotree import OncoTreeData | ||
from .rxnorm import RxNormData | ||
|
||
__all__ = [ | ||
"DataSource", | ||
"RemoteDataError", | ||
"ChemblData", | ||
"ChemIDplusData", | ||
"CustomData", | ||
"DoData", | ||
"DrugBankData", | ||
"DrugsAtFdaData", | ||
"GToPLigandData", | ||
"HemOncData", | ||
"MondoData", | ||
"NcitData", | ||
"OncoTreeData", | ||
"RxNormData", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
"""Provide source fetching for ChemIDplus.""" | ||
import re | ||
from datetime import datetime | ||
from pathlib import Path | ||
|
||
import requests | ||
|
||
from .base_source import DataSource, RemoteDataError | ||
from .utils.downloads import download_http | ||
from .utils.versioning import DATE_VERSION_PATTERN | ||
|
||
|
||
class ChemIDplusData(DataSource):
    """Provide access to ChemIDplus database."""

    _src_name = "chemidplus"
    _filetype = "xml"

    # The same remote file is used both to sniff the release date and to perform
    # the actual download, so keep the URL in one place.
    _DATA_URL = "https://ftp.nlm.nih.gov/projects/chemidlease/CurrentChemID.xml"
    # Seconds to wait on the remote server before giving up instead of hanging.
    _REQUEST_TIMEOUT = 30

    @staticmethod
    def _get_latest_version() -> str:
        """Retrieve latest version value

        Only the first few hundred bytes of the data file are requested; the release
        date is read from a ``date="YYYY-MM-DD"`` attribute in that fragment.

        :return: latest release value, formatted per ``DATE_VERSION_PATTERN``
        :raise RemoteDataError: if unable to parse version number from data file
        """
        headers = {"Range": "bytes=0-300"}  # leave some slack to capture date
        r = requests.get(
            ChemIDplusData._DATA_URL,
            headers=headers,
            timeout=ChemIDplusData._REQUEST_TIMEOUT,
        )
        r.raise_for_status()
        result = re.search(r" date=\"([0-9]{4}-[0-9]{2}-[0-9]{2})\">", r.text)
        if not result:
            raise RemoteDataError(
                "Unable to parse latest ChemIDplus version number from partial access to latest file"
            )
        raw_date = result.group(1)
        return datetime.strptime(raw_date, "%Y-%m-%d").strftime(DATE_VERSION_PATTERN)

    def _download_data(self, version: str, outfile: Path) -> None:
        """Download data file to specified location. ChemIDplus data is no longer
        updated, so versioning is irrelevant.

        :param version: version to acquire (unused -- only the current file is
            available remotely)
        :param outfile: location and filename for final data file
        """
        download_http(
            self._DATA_URL,
            outfile,
            tqdm_params=self._tqdm_params,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
"""Provide source fetching for DrugBank.""" | ||
import logging | ||
from pathlib import Path | ||
from typing import Tuple | ||
|
||
import requests | ||
|
||
from .base_source import DataSource, RemoteDataError | ||
from .utils.downloads import download_http, handle_zip | ||
from .utils.versioning import parse_file_version | ||
|
||
_logger = logging.getLogger(__name__) | ||
|
||
|
||
class DrugBankData(DataSource):
    """Provide access to DrugBank database."""

    _src_name = "drugbank"
    _filetype = "csv"

    @staticmethod
    def _get_latest_version() -> Tuple[str, str]:
        """Retrieve latest version value

        :return: latest release value and base download URL
        :raise RemoteDataError: if unable to parse version number from releases API
        """
        releases_url = "https://go.drugbank.com/releases.json"
        # Use a timeout so an unresponsive server can't hang the caller forever.
        r = requests.get(releases_url, timeout=30)
        r.raise_for_status()
        try:
            # The first entry of the releases listing is treated as the latest.
            latest = r.json()[0]
            return latest["version"], latest["url"]
        except (KeyError, IndexError, TypeError) as e:
            # Chain the original error so the malformed payload can be diagnosed.
            raise RemoteDataError(
                "Unable to parse latest DrugBank version number from releases API endpoint"
            ) from e

    def _get_latest_local_file(self, glob: str) -> Tuple[Path, str]:
        """Get most recent locally-available file. DrugBank uses versioning that isn't
        easily sortable by default so we have to use some extra magic.

        :param glob: file pattern to match against
        :return: Path to most recent file, and its version
        :raise FileNotFoundError: if no local data is available
        """
        _logger.debug(f"Getting local match against pattern {glob}...")
        file_version_pairs = []
        for file in self.data_dir.glob(glob):
            version = parse_file_version(file, r"drugbank_([\d\.]+).csv")
            # Compare versions numerically, component by component, so that e.g.
            # "5.1.10" is ordered after "5.1.9" (a plain string sort would not).
            formatted_version = [int(digits) for digits in version.split(".")]
            file_version_pairs.append((file, version, formatted_version))
        if not file_version_pairs:
            raise FileNotFoundError("No source data found for DrugBank")
        latest_file, latest_version, _ = sorted(
            file_version_pairs, key=lambda p: p[2]
        )[-1]
        _logger.debug(f"Returning {latest_file} as most recent locally-available file.")
        return latest_file, latest_version

    def _download_data(self, url: str, outfile: Path) -> None:
        """Download data file to specified location.

        :param url: location of data to fetch
        :param outfile: location and filename for final data file
        """
        download_http(
            url,
            outfile,
            handler=handle_zip,
            tqdm_params=self._tqdm_params,
        )

    def get_latest(
        self, from_local: bool = False, force_refresh: bool = False
    ) -> Tuple[Path, str]:
        """Get path to latest version of data, and its version value

        :param from_local: if True, use latest available local file
        :param force_refresh: if True, fetch and return data from remote regardless of
            whether a local copy is present
        :return: Path to location of data, and version value of it
        :raise ValueError: if both ``force_refresh`` and ``from_local`` are True
        """
        if force_refresh and from_local:
            raise ValueError("Cannot set both `force_refresh` and `from_local`")

        if from_local:
            return self._get_latest_local_file("drugbank_*.csv")

        # The releases API supplies both the version and the base URL from which the
        # download location is built.
        latest_version, latest_url_base = self._get_latest_version()
        latest_url = f"{latest_url_base}/downloads/all-drugbank-vocabulary"
        latest_file = self.data_dir / f"drugbank_{latest_version}.csv"
        if (not force_refresh) and latest_file.exists():
            _logger.debug(
                f"Found existing file, {latest_file.name}, matching latest version {latest_version}."
            )
            return latest_file, latest_version
        self._download_data(latest_url, latest_file)
        return latest_file, latest_version
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
"""Provide source fetching for Drugs@FDA.""" | ||
import datetime | ||
from pathlib import Path | ||
|
||
import requests | ||
|
||
from .base_source import DataSource, RemoteDataError | ||
from .utils.downloads import download_http, handle_zip | ||
from .utils.versioning import DATE_VERSION_PATTERN | ||
|
||
|
||
class DrugsAtFdaData(DataSource):
    """Provide access to Drugs@FDA database."""

    _src_name = "drugsatfda"
    _filetype = "json"

    @staticmethod
    def _get_latest_version() -> str:
        """Retrieve latest version value

        :return: latest release value, formatted per ``DATE_VERSION_PATTERN``
        :raise RemoteDataError: if unable to parse version number from releases API
        """
        # Use a timeout so an unresponsive server can't hang the caller forever.
        r = requests.get("https://api.fda.gov/download.json", timeout=30)
        r.raise_for_status()
        r_json = r.json()
        try:
            date = r_json["results"]["drug"]["drugsfda"]["export_date"]
        except (KeyError, TypeError) as e:
            # Chain the original error so the unexpected payload shape is visible.
            raise RemoteDataError(
                "Unable to parse latest Drugs@FDA version number from releases API endpoint"
            ) from e
        return datetime.datetime.strptime(date, "%Y-%m-%d").strftime(
            DATE_VERSION_PATTERN
        )

    def _download_data(self, version: str, outfile: Path) -> None:
        """Download latest data file to specified location.

        :param version: version to acquire (unused -- the download URL is fixed)
        :param outfile: location and filename for final data file
        """
        download_http(
            "https://download.open.fda.gov/drug/drugsfda/drug-drugsfda-0001-of-0001.json.zip",
            outfile,
            handler=handle_zip,
            tqdm_params=self._tqdm_params,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
"""Provide source fetching for Guide To Pharmacology.""" | ||
import logging | ||
import re | ||
from pathlib import Path | ||
from typing import NamedTuple, Tuple | ||
|
||
import requests | ||
|
||
from .base_source import DataSource, RemoteDataError | ||
from .utils.downloads import download_http | ||
from .utils.storage import get_latest_local_file | ||
from .utils.versioning import parse_file_version | ||
|
||
_logger = logging.getLogger(__name__) | ||
|
||
|
||
class GtoPLigandPaths(NamedTuple):
    """Bundle the locations of the Guide to Pharmacology files fetched together.

    Fields are accessible by name or by position, in the order declared below.
    """

    # location of the ligands TSV file
    ligands: Path
    # location of the ligand ID mapping TSV file
    ligand_id_mapping: Path
|
||
|
||
class GToPLigandData(DataSource):
    """Provide access to Guide to Pharmacology data."""

    _src_name = "guidetopharmacology"
    _filetype = "tsv"

    @staticmethod
    def _get_latest_version() -> str:
        """Retrieve latest version value

        The version is scraped from the "Current Release Version" text on the
        Guide to Pharmacology homepage.

        :return: latest release value
        :raise RemoteDataError: if unable to parse version number from homepage HTML
        """
        # Use a timeout so an unresponsive server can't hang the caller forever.
        r = requests.get("https://www.guidetopharmacology.org/", timeout=30)
        r.raise_for_status()
        # Accept multi-digit minor versions, matching the pattern used for local
        # file names in ``get_latest``.
        pattern = re.compile(r"Current Release Version (\d{4}\.\d+) \(.*\)")
        for line in r.text.split("\n"):
            if "Current Release Version" not in line:
                continue
            match = pattern.search(line.strip())
            if match:
                return match.group(1)
        # Reached whenever no line yielded a parseable version, so this method never
        # silently returns None.
        raise RemoteDataError(
            "Unable to parse latest Guide to Pharmacology version number from homepage HTML."
        )

    def _download_data(self, file_paths: GtoPLigandPaths) -> None:
        """Perform file downloads.

        :param file_paths: locations to save files at
        """
        download_http(
            "https://www.guidetopharmacology.org/DATA/ligands.tsv",
            file_paths.ligands,
            tqdm_params=self._tqdm_params,
        )
        download_http(
            "https://www.guidetopharmacology.org/DATA/ligand_id_mapping.tsv",
            file_paths.ligand_id_mapping,
            tqdm_params=self._tqdm_params,
        )

    def get_latest(
        self, from_local: bool = False, force_refresh: bool = False
    ) -> Tuple[GtoPLigandPaths, str]:
        """Get path to latest version of data, and its version value

        :param from_local: if True, use latest available local file
        :param force_refresh: if True, fetch and return data from remote regardless of
            whether a local copy is present
        :return: Paths to data, and version value of it
        :raise ValueError: if both ``force_refresh`` and ``from_local`` are True
        """
        if force_refresh and from_local:
            raise ValueError("Cannot set both `force_refresh` and `from_local`")

        if from_local:
            ligands_path = get_latest_local_file(self.data_dir, "gtop_ligands_*.tsv")
            ligand_id_mapping_path = get_latest_local_file(
                self.data_dir, "gtop_ligand_id_mapping_*.tsv"
            )
            file_paths = GtoPLigandPaths(
                ligands=ligands_path, ligand_id_mapping=ligand_id_mapping_path
            )
            # The reported version is taken from the ligands file name only.
            version = parse_file_version(
                ligands_path, r"gtop_ligands_(\d{4}\.\d+).tsv"
            )
            # The two files are looked up independently, so they may come from
            # different releases; surface that rather than pairing them silently.
            # NOTE(review): assumes both helpers return a Path and the parsed version
            # string respectively -- confirm against the utils modules.
            expected_mapping_name = f"gtop_ligand_id_mapping_{version}.tsv"
            if ligand_id_mapping_path.name != expected_mapping_name:
                _logger.warning(
                    f"Latest local ligand ID mapping file, {ligand_id_mapping_path.name}, does not match ligands version {version}."
                )
            return file_paths, version

        latest_version = self._get_latest_version()
        ligands_path = self.data_dir / f"gtop_ligands_{latest_version}.tsv"
        ligand_id_mapping_path = (
            self.data_dir / f"gtop_ligand_id_mapping_{latest_version}.tsv"
        )
        file_paths = GtoPLigandPaths(
            ligands=ligands_path, ligand_id_mapping=ligand_id_mapping_path
        )
        if not force_refresh:
            if ligands_path.exists() and ligand_id_mapping_path.exists():
                _logger.debug(
                    f"Found existing files, {file_paths}, matching latest version {latest_version}."
                )
                return file_paths, latest_version
            elif ligands_path.exists() or ligand_id_mapping_path.exists():
                # Only one of the two files is present: re-fetch both so the pair
                # stays consistent.
                _logger.warning(
                    f"Existing files, {file_paths}, not all available -- attempting full download."
                )
        self._download_data(file_paths)
        return file_paths, latest_version
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hemonc and gtop both acquire multiple files. With HemOnc, it's basically a bundle (each one is meaningless without the others) but with GtoP, it's just a selection of useful files (and in fact, DGIdb uses a different selection of GtoP files). I thought about a few different ways to handle this (including making a separate class for each file) and all of them were sort of awkward. I think returning a namedtuple of paths like this is the easiest way to handle it for data consumers.