GenomicMedLab · jsstevenson · Nov 16, 2023 · Oct 20, 2023 · Oct 23, 2023 · Oct 23, 2023
diff --git a/pyproject.toml b/pyproject.toml
@@ -86,14 +86,15 @@ docstring-quotes = "double"
 
 [tool.ruff.per-file-ignores]
 # ANN001 - missing-type-function-argument
-# ANN2 - missing-return-type
-# ANN201 - Missing type annotation
 # ANN102 - missing-type-cls
-# D103 - Missing docstring in public function
+# ANN2 - missing-return-type
+# ANN201 - missing-return-type-undocumented-public-function
+# D103 - undocumented-public-function
 # F821 - undefined-name
 # F401 - unused-import
-# I001 - Import block unsorted or unformatted
+# I001 - unsorted-imports
 # N805 - invalid-first-argument-name-for-method
 "tests/*" = ["ANN001", "ANN102", "ANN2"]
 "*__init__.py" = ["F401"]
 "docs/source/conf.py" = ["D100", "I001", "D103", "ANN201", "ANN001"]
+"src/wags_tails/base_source.py" = ["ANN102"]
diff --git a/src/wags_tails/__init__.py b/src/wags_tails/__init__.py
@@ -1,19 +1,31 @@
 """Data acquisition tools for Wagnerds."""
 from .base_source import DataSource, RemoteDataError
 from .chembl import ChemblData
+from .chemidplus import ChemIDplusData
 from .custom import CustomData
 from .do import DoData
+from .drugbank import DrugBankData
+from .drugsatfda import DrugsAtFdaData
+from .guide_to_pharmacology import GToPLigandData
+from .hemonc import HemOncData
 from .mondo import MondoData
 from .ncit import NcitData
 from .oncotree import OncoTreeData
+from .rxnorm import RxNormData
 
 __all__ = [
     "DataSource",
     "RemoteDataError",
     "ChemblData",
+    "ChemIDplusData",
     "CustomData",
     "DoData",
+    "DrugBankData",
+    "DrugsAtFdaData",
+    "GToPLigandData",
+    "HemOncData",
     "MondoData",
     "NcitData",
     "OncoTreeData",
+    "RxNormData",
 ]
diff --git a/src/wags_tails/chemidplus.py b/src/wags_tails/chemidplus.py
@@ -0,0 +1,52 @@
+"""Provide source fetching for ChemIDplus."""
+import re
+from datetime import datetime
+from pathlib import Path
+
+import requests
+
+from .base_source import DataSource, RemoteDataError
+from .utils.downloads import download_http
+from .utils.versioning import DATE_VERSION_PATTERN
+
+
+class ChemIDplusData(DataSource):
+    """Fetch and manage ChemIDplus data files."""
+
+    _src_name = "chemidplus"
+    _filetype = "xml"
+
+    @staticmethod
+    def _get_latest_version() -> str:
+        """Read the release date from the head of the current data file
+
+        :return: release date, formatted per ``DATE_VERSION_PATTERN``
+        :raise RemoteDataError: if no release date is found in the fetched bytes
+        """
+        # a Range request avoids pulling the whole XML file just to read a date
+        response = requests.get(
+            "https://ftp.nlm.nih.gov/projects/chemidlease/CurrentChemID.xml",
+            headers={"Range": "bytes=0-300"},  # leave some slack to capture date
+        )
+        response.raise_for_status()
+        date_match = re.search(
+            r" date=\"([0-9]{4}-[0-9]{2}-[0-9]{2})\">", response.text
+        )
+        if date_match is None:
+            raise RemoteDataError(
+                "Unable to parse latest ChemIDplus version number from partial access to latest file"
+            )
+        release_date = datetime.strptime(date_match.group(1), "%Y-%m-%d")
+        return release_date.strftime(DATE_VERSION_PATTERN)
+
+    def _download_data(self, version: str, outfile: Path) -> None:
+        """Download the current ChemIDplus file. The same URL is fetched whatever
+        ``version`` is given (ChemIDplus data is no longer updated).
+
+        :param version: version to acquire (not used to build the URL)
+        :param outfile: location and filename for final data file
+        """
+        source_url = "https://ftp.nlm.nih.gov/projects/chemidlease/CurrentChemID.xml"
+        download_http(
+            source_url, outfile, tqdm_params=self._tqdm_params
+        )
diff --git a/src/wags_tails/drugbank.py b/src/wags_tails/drugbank.py
@@ -0,0 +1,100 @@
+"""Provide source fetching for DrugBank."""
+import logging
+from pathlib import Path
+from typing import Tuple
+
+import requests
+
+from .base_source import DataSource, RemoteDataError
+from .utils.downloads import download_http, handle_zip
+from .utils.versioning import parse_file_version
+
+_logger = logging.getLogger(__name__)
+
+
+class DrugBankData(DataSource):
+    """Provide access to DrugBank database."""
+
+    _src_name = "drugbank"
+    _filetype = "csv"
+
+    @staticmethod
+    def _get_latest_version() -> Tuple[str, str]:
+        """Retrieve latest version value
+
+        :return: latest release value and base download URL
+        :raise RemoteDataError: if unable to parse version number from releases API
+        """
+        r = requests.get("https://go.drugbank.com/releases.json")
+        r.raise_for_status()
+        try:
+            # the first entry of the releases listing is treated as the latest
+            latest = r.json()[0]
+            return latest["version"], latest["url"]
+        except (KeyError, IndexError) as e:
+            raise RemoteDataError(
+                "Unable to parse latest DrugBank version number from releases API endpoint"
+            ) from e
+
+    def _get_latest_local_file(self, glob: str) -> Tuple[Path, str]:
+        """Get most recent locally-available file. DrugBank versions are dotted
+        numbers, so they are compared component-wise as integers, not as strings.
+
+        :param glob: file pattern to match against
+        :return: Path to most recent file, and its version
+        :raise FileNotFoundError: if no local data is available
+        """
+        _logger.debug(f"Getting local match against pattern {glob}...")
+        file_version_pairs = []
+        for file in self.data_dir.glob(glob):
+            version = parse_file_version(file, r"drugbank_([\d\.]+).csv")
+            formatted_version = [int(digits) for digits in version.split(".")]
+            file_version_pairs.append((file, version, formatted_version))
+        if not file_version_pairs:
+            raise FileNotFoundError("No source data found for DrugBank")
+        # e.g. [5, 1, 10] sorts after [5, 1, 9], unlike the strings "5.1.10"/"5.1.9"
+        latest = sorted(file_version_pairs, key=lambda p: p[2])[-1]
+        _logger.debug(f"Returning {latest[0]} as most recent locally-available file.")
+        return latest[0], latest[1]
+
+    def _download_data(self, url: str, outfile: Path) -> None:
+        """Download data file to specified location.
+
+        :param url: location of data to fetch
+        :param outfile: location and filename for final data file
+        """
+        download_http(
+            url,
+            outfile,
+            handler=handle_zip,
+            tqdm_params=self._tqdm_params,
+        )
+
+    def get_latest(
+        self, from_local: bool = False, force_refresh: bool = False
+    ) -> Tuple[Path, str]:
+        """Get path to latest version of data, and its version value
+
+        :param from_local: if True, use latest available local file
+        :param force_refresh: if True, fetch and return data from remote regardless of
+            whether a local copy is present
+        :return: Path to location of data, and version value of it
+        :raise ValueError: if both ``force_refresh`` and ``from_local`` are True
+        """
+        if force_refresh and from_local:
+            raise ValueError("Cannot set both `force_refresh` and `from_local`")
+
+        if from_local:
+            # local-only lookup: the releases API is not queried on this path
+            return self._get_latest_local_file("drugbank_*.csv")
+
+        latest_version, latest_url_base = self._get_latest_version()
+        latest_url = f"{latest_url_base}/downloads/all-drugbank-vocabulary"
+        latest_file = self.data_dir / f"drugbank_{latest_version}.csv"
+        if (not force_refresh) and latest_file.exists():
+            _logger.debug(
+                f"Found existing file, {latest_file.name}, matching latest version {latest_version}."
+            )
+            return latest_file, latest_version
+        self._download_data(latest_url, latest_file)
+        return latest_file, latest_version
diff --git a/src/wags_tails/drugsatfda.py b/src/wags_tails/drugsatfda.py
@@ -0,0 +1,49 @@
+"""Provide source fetching for Drugs@FDA."""
+import datetime
+from pathlib import Path
+
+import requests
+
+from .base_source import DataSource, RemoteDataError
+from .utils.downloads import download_http, handle_zip
+from .utils.versioning import DATE_VERSION_PATTERN
+
+
+class DrugsAtFdaData(DataSource):
+    """Provide access to Drugs@FDA database."""
+
+    _src_name = "drugsatfda"
+    _filetype = "json"
+
+    @staticmethod
+    def _get_latest_version() -> str:
+        """Retrieve latest version value
+
+        :return: dataset export date, formatted per ``DATE_VERSION_PATTERN``
+        :raise RemoteDataError: if the export date is missing from the API response
+        """
+        r = requests.get("https://api.fda.gov/download.json")
+        r.raise_for_status()
+        r_json = r.json()
+        try:
+            date = r_json["results"]["drug"]["drugsfda"]["export_date"]
+        except KeyError as e:
+            raise RemoteDataError(
+                "Unable to parse latest Drugs@FDA version number from releases API endpoint"
+            ) from e
+        return datetime.datetime.strptime(date, "%Y-%m-%d").strftime(
+            DATE_VERSION_PATTERN
+        )
+
+    def _download_data(self, version: str, outfile: Path) -> None:
+        """Download latest data file to specified location.
+
+        :param version: version to acquire (the download URL does not depend on it)
+        :param outfile: location and filename for final data file
+        """
+        download_http(
+            "https://download.open.fda.gov/drug/drugsfda/drug-drugsfda-0001-of-0001.json.zip",
+            outfile,
+            handler=handle_zip,
+            tqdm_params=self._tqdm_params,
+        )
diff --git a/src/wags_tails/guide_to_pharmacology.py b/src/wags_tails/guide_to_pharmacology.py
@@ -0,0 +1,112 @@
+"""Provide source fetching for Guide To Pharmacology."""
+import logging
+import re
+from pathlib import Path
+from typing import NamedTuple, Tuple
+
+import requests
+
+from .base_source import DataSource, RemoteDataError
+from .utils.downloads import download_http
+from .utils.storage import get_latest_local_file
+from .utils.versioning import parse_file_version
+
+_logger = logging.getLogger(__name__)
+
+
+class GtoPLigandPaths(NamedTuple):
+    """Container for the pair of Guide to Pharmacology ligand file paths."""
+
+    ligands: Path  # where the ligands TSV is stored
+    ligand_id_mapping: Path  # where the ligand ID mapping TSV is stored
+
+
+class GToPLigandData(DataSource):
+    """Provide access to Guide to Pharmacology data."""
+
+    _src_name = "guidetopharmacology"
+    _filetype = "tsv"
+
+    @staticmethod
+    def _get_latest_version() -> str:
+        """Retrieve latest version value by scanning the homepage HTML
+
+        :return: latest release value
+        :raise RemoteDataError: if unable to parse version number from homepage
+        """
+        r = requests.get("https://www.guidetopharmacology.org/")
+        r.raise_for_status()
+        # accept any number of digits after the dot, matching the pattern that
+        # ``get_latest`` uses to read versions back out of local file names
+        pattern = re.compile(r"Current Release Version (\d{4}\.\d+) \(.*\)")
+        for line in r.text.split("\n"):
+            match = pattern.search(line)
+            if match:
+                return match.group(1)
+        # reached only if no line of the page yielded a version
+        raise RemoteDataError(
+            "Unable to parse latest Guide to Pharmacology version number from homepage HTML."
+        )
+
+    def _download_data(self, file_paths: GtoPLigandPaths) -> None:
+        """Perform file downloads.
+
+        :param file_paths: locations to save files at
+        """
+        download_http(
+            "https://www.guidetopharmacology.org/DATA/ligands.tsv",
+            file_paths.ligands,
+            tqdm_params=self._tqdm_params,
+        )
+        download_http(
+            "https://www.guidetopharmacology.org/DATA/ligand_id_mapping.tsv",
+            file_paths.ligand_id_mapping,
+            tqdm_params=self._tqdm_params,
+        )
+
+    def get_latest(
+        self, from_local: bool = False, force_refresh: bool = False
+    ) -> Tuple[GtoPLigandPaths, str]:
+        """Get path to latest version of data, and its version value
+
+        :param from_local: if True, use latest available local file
+        :param force_refresh: if True, fetch and return data from remote regardless of
+            whether a local copy is present
+        :return: Paths to data, and version value of it
+        :raise ValueError: if both ``force_refresh`` and ``from_local`` are True
+        """
+        if force_refresh and from_local:
+            raise ValueError("Cannot set both `force_refresh` and `from_local`")
+
+        if from_local:
+            ligands_path = get_latest_local_file(self.data_dir, "gtop_ligands_*.tsv")
+            ligand_id_mapping_path = get_latest_local_file(
+                self.data_dir, "gtop_ligand_id_mapping_*.tsv"
+            )
+            file_paths = GtoPLigandPaths(
+                ligands=ligands_path, ligand_id_mapping=ligand_id_mapping_path
+            )
+            return file_paths, parse_file_version(
+                ligands_path, r"gtop_ligands_(\d{4}\.\d+).tsv"
+            )
+
+        latest_version = self._get_latest_version()
+        ligands_path = self.data_dir / f"gtop_ligands_{latest_version}.tsv"
+        ligand_id_mapping_path = (
+            self.data_dir / f"gtop_ligand_id_mapping_{latest_version}.tsv"
+        )
+        file_paths = GtoPLigandPaths(
+            ligands=ligands_path, ligand_id_mapping=ligand_id_mapping_path
+        )
+        if not force_refresh:
+            if ligands_path.exists() and ligand_id_mapping_path.exists():
+                _logger.debug(
+                    f"Found existing files, {file_paths}, matching latest version {latest_version}."
+                )
+                return file_paths, latest_version
+            elif ligands_path.exists() or ligand_id_mapping_path.exists():
+                _logger.warning(
+                    f"Existing files, {file_paths}, not all available -- attempting full download."
+                )
+        self._download_data(file_paths)
+        return file_paths, latest_version