Skip to content

Commit

Permalink
add downloader for pfam-A
Browse files Browse the repository at this point in the history
  • Loading branch information
Robaina committed Sep 28, 2023
1 parent 359fae1 commit b472abb
Show file tree
Hide file tree
Showing 5 changed files with 174 additions and 139 deletions.
16 changes: 16 additions & 0 deletions src/pynteny/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,22 @@ def download() -> argparse.ArgumentParser:
action="store_true",
help="force-download database again if already downloaded",
)
optional.add_argument(
"-pgap",
"--pgap",
dest="pgap",
default=False,
action="store_true",
help="download PGAP database (default)",
)
optional.add_argument(
"-pfam",
"--pfam",
dest="pfam",
default=False,
action="store_true",
help="download PFAM database",
)
optional.add_argument(
"-l",
"--log",
Expand Down
8 changes: 5 additions & 3 deletions src/pynteny/config.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
{
"database_dir": "",
"unpack_PGAP_database": false,
"data_downloaded": false,
"unpack_PFAM_database": false,
"PGAP_data_downloaded": false,
"PFAM_data_downloaded": false,
"PGAP_database": "",
"PGAP_meta_file": "",
"streamlit_process": "",
"streamlit_log": ""
"PFAM_database": "",
"PFAM_meta_file": ""
}
180 changes: 82 additions & 98 deletions src/pynteny/hmm.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import logging
import os
import sys
from typing import Callable
from collections import defaultdict
from pathlib import Path
import tempfile
Expand Down Expand Up @@ -250,12 +249,13 @@ def get_meta_info_for_HMM(self, hmm_name: str) -> dict:
class PGAP(HMMDatabase):
"""Tools to parse PGAP hmm database metadata"""

def __init__(self):
def __init__(self, *args, **kwargs):
"""Initialize class PGAP"""
super().__init__()
super().__init__(*args, **kwargs)
self._meta = self._meta.rename(columns={"#ncbi_accession": "accession"})
self._meta = self.remove_missing_HMMs_from_metadata(meta_outfile=None)
# self._meta = self.remove_missing_HMMs_from_metadata(meta_outfile=None)

@staticmethod
def remove_missing_HMMs_from_metadata(self, meta_outfile: Path = None) -> None:
"""Remove HMMs from metadata that are not in HMM directory
Expand Down Expand Up @@ -290,10 +290,6 @@ def remove_missing_HMMs_from_metadata(self, meta_outfile: Path = None) -> None:
class PFAM(HMMDatabase):
"""Tools to preprocess the PFAM-A hmm database"""

def __init__(self):
"""Initialize class PFAM"""
super().__init__()

@classmethod
def from_gz_file(
cls, hmm_gz_file: Path, hmm_outdir: Path = None, meta_outfile: Path = None
Expand Down Expand Up @@ -359,95 +355,83 @@ def construct_meta_file(self, meta_outfile: Path = None) -> None:
self._meta = pd.read_csv(meta_outfile, sep="\t")


class Downloader:
"""Tools to download and preprocess HMM databases"""

def __init__(self, download_dir: Path):
"""Initialize class Downloader
Args:
output_dir (Path): path to output directory.
"""
self._download_dir = Path(download_dir)
if self._download_dir.exists():
logger.warning(
f"{self._download_dir} already exists. Downloader may overwrite files."
)

def download_pgap(self, unpack: bool = False) -> None:
"""Download PGAP database
Args:
unpack (bool, optional): if True then PGAP database will be extracted
"""

data_url = "https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.HMM.tgz"
meta_url = "https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.tsv"
logger.info("Downloading PGAP database")
try:
PGAP_file = self._download_dir / "hmm_PGAP.HMM.tgz"
meta_file = self._download_dir / "hmm_PGAP.tsv"
download_file(data_url, PGAP_file)
download_file(meta_url, meta_file)
except Exception:
logger.exception(
"Failed to download PGAP database. Please check your internet connection."
)
sys.exit(1)
if unpack:
self.extract_pgap_to_directory(PGAP_file)
logger.info("Database downloaded successfully\n")

def download_pfam(self, unpack: bool = False) -> None:
"""Download PFAM database
Args:
unpack (bool, optional): if True then PFAM database will be extracted
"""
pfam_file = self.download_dir / "Pfam-A.gz"
# hmm_outdir = self._output_dir.parent / "pfam_hmms"
# meta_outfile = hmm_outdir / f"{pfam_file.stem}_meta.tsv"
logger.info("Downloading PFAM-A hmm database")
try:
url = (
"https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz"
)
download_file(url, pfam_file)
except Exception:
logger.exception(
"Failed to download PFAM-A database. Please check your internet connection."
)
sys.exit(1)
if unpack:
self.extract_pfam_to_directory(pfam_file)
logger.info("Database downloaded successfully")

def extract_pgap_to_directory(self, pgap_tar: Path) -> None:
"""Extract PGAP hmm database (tar.gz) to downlaod directory
def download_pgap(download_dir: Path, unpack: bool = False) -> tuple[Path, Path]:
"""Download PGAP database
Args:
pgap_tar (Path): path to compressed PGAP database.
"""
pgap_tar = Path(pgap_tar)
if not is_tar_file(pgap_tar):
logger.warning(f"{pgap_tar} is not a tar file. Skipping extraction")
return
logger.info("Extracting hmm files to target directory")
extract_tar_file(pgap_tar, self._download_dir)
flatten_directory(self._download_dir)
logger.info("PGAP database unpacked successfully")

def extract_pfam_to_directory(self, pfam_gz: Path) -> None:
"""Extract PFAM hmm database (gz) to downlaod directory
Args:
download_dir (Path): path to output directory.
unpack (bool, optional): if True then PGAP database will be extracted
"""
if download_dir.exists():
logger.warning(
f"{download_dir} already exists. Downloader may overwrite files."
)

Args:
pfam_gz (Path): path to compressed PFAM database.
"""
pfam_gz = Path(pfam_gz)
if not pfam_gz.is_file():
logger.warning(f"{pfam_gz} is not a file. Skipping extraction")
return
logger.info("Extracting hmm files to target directory")
extract_gz_file(pfam_gz, self._download_dir)
flatten_directory(self.download_dir)
logger.info("PGAP database unpacked successfully")
data_url = "https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.HMM.tgz"
meta_url = "https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.tsv"
PGAP_file = download_dir / "hmm_PGAP.HMM.tgz"
meta_file = download_dir / "hmm_PGAP.tsv"
download_file(data_url, PGAP_file)
download_file(meta_url, meta_file)
if unpack:
destination_path = download_dir / "pgap_hmms"
extract_pgap_to_directory(PGAP_file, destination_dir=destination_path)
return destination_path, meta_file
else:
return PGAP_file, meta_file


def download_pfam(download_dir: Path, unpack: bool = False) -> Path:
    """Download PFAM-A hmm database

    Args:
        download_dir (Path): path to directory where the database is downloaded.
        unpack (bool, optional): if True then PFAM database will be extracted
            to a "pfam_hmms" subdirectory of download_dir. Defaults to False.

    Returns:
        Path: path to the "pfam_hmms" destination if unpack is True, otherwise
            path to the downloaded compressed file.
    """
    # Accept str as well as Path, as the extract_* helpers in this module do
    download_dir = Path(download_dir)
    if download_dir.exists():
        logger.warning(
            f"{download_dir} already exists. Downloader may overwrite files."
        )
    PFAM_file = download_dir / "Pfam-A.gz"
    logger.info("Downloading PFAM-A hmm database")
    url = "https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz"
    download_file(url, PFAM_file)
    if not unpack:
        return PFAM_file
    destination_path = download_dir / "pfam_hmms"
    extract_pfam_to_directory(PFAM_file, destination_dir=destination_path)
    return destination_path


def extract_pgap_to_directory(pgap_tar: Path, destination_dir: Path) -> None:
    """Unpack the compressed PGAP hmm database (tar.gz) into a directory

    Args:
        pgap_tar (Path): path to compressed PGAP database.
        destination_dir (Path): path to directory receiving the extracted files.
    """
    pgap_tar = Path(pgap_tar)
    if is_tar_file(pgap_tar):
        logger.info("Extracting hmm files to target directory")
        extract_tar_file(pgap_tar, destination_dir)
        flatten_directory(destination_dir)
        # The compressed archive is deleted once it has been extracted
        os.remove(pgap_tar)
        logger.info("PGAP database unpacked successfully")
    else:
        logger.warning(f"{pgap_tar} is not a tar file. Skipping extraction")


def extract_pfam_to_directory(pfam_gz: Path, destination_dir: Path) -> None:
    """Extract PFAM hmm database (gz) to destination directory

    Args:
        pfam_gz (Path): path to compressed PFAM database.
        destination_dir (Path): path where the database is extracted.
    """
    pfam_gz = Path(pfam_gz)
    if not pfam_gz.is_file():
        logger.warning(f"{pfam_gz} is not a file. Skipping extraction")
        return
    logger.info("Extracting hmm files to target directory")
    extract_gz_file(pfam_gz, destination_dir)
    flatten_directory(destination_dir)
    # The compressed file is deleted once it has been extracted
    os.remove(pfam_gz)
    # Report the database actually handled here (PFAM, not PGAP)
    logger.info("PFAM database unpacked successfully")
102 changes: 66 additions & 36 deletions src/pynteny/subcommands.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,11 @@

import pynteny.parsers.syntenyparser as syntenyparser
from pynteny.filter import SyntenyHits, filter_FASTA_by_synteny_structure
from pynteny.hmm import PGAP, PFAM, Downloader
from pynteny.hmm import PGAP, PFAM, download_pgap, download_pfam
from pynteny.preprocessing import Database
from pynteny.utils import (
CommandArgs,
ConfigParser,
download_file,
is_tar_file,
)

Expand Down Expand Up @@ -74,7 +73,10 @@ def synteny_search(args: Union[CommandArgs, ArgumentParser]) -> SyntenyHits:
)
sys.exit(1)
if args.hmm_dir is None:
if not config.get_field("data_downloaded"):
if not (
config.get_field("PGAP_data_downloaded")
or config.get_field("PFAM_data_downloaded")
):
logger.warning(
"HMM database not found. Downloading PGAP database from NCBI"
)
Expand All @@ -84,7 +86,10 @@ def synteny_search(args: Union[CommandArgs, ArgumentParser]) -> SyntenyHits:
args.hmm_dir = Path(config.get_field("PGAP_database"))
if args.gene_ids:
if args.hmm_meta is None:
if not config.get_field("data_downloaded"):
if not (
config.get_field("PGAP_data_downloaded")
or config.get_field("PFAM_data_downloaded")
):
logger.error(
"Please download hmm database first or provide path to hmm metadata file."
)
Expand Down Expand Up @@ -201,7 +206,10 @@ def parse_gene_ids(args: Union[CommandArgs, ArgumentParser]) -> str:
logger = init_logger(args)
config = ConfigParser.get_default_config()
if args.hmm_meta is None:
if not config.get_field("data_downloaded"):
if not (
config.get_field("PGAP_data_downloaded")
or config.get_field("PFAM_data_downloaded")
):
logger.error(
"Please download hmm database meta file or provide path to existing one first."
)
Expand Down Expand Up @@ -229,7 +237,19 @@ def download_hmms(args: Union[CommandArgs, ArgumentParser]) -> None:
"""
logger = init_logger(args)
config = ConfigParser.get_default_config()
if (config.get_field("data_downloaded")) and (not args.force):
if (config.get_field("PGAP_data_downloaded")) and (args.pgap) and (not args.force):
logger.info("PGAP HMM database already downloaded. Skipping download")
elif (
(config.get_field("PFAM_data_downloaded")) and (args.pfam) and (not args.force)
):
logger.info("PFAM HMM database already downloaded. Skipping download")
elif (
(config.get_field("PGAP_data_downloaded"))
and (args.pgap)
and (config.get_field("PFAM_data_downloaded"))
and (args.pfam)
and (not args.force)
):
logger.info("HMM databases already downloaded. Skipping download")
sys.exit(1)
if args.outdir is None:
Expand All @@ -241,36 +261,46 @@ def download_hmms(args: Union[CommandArgs, ArgumentParser]) -> None:
download_dir.mkdir(parents=True, exist_ok=True)

config.update_config("database_dir", download_dir.as_posix())
config.update_config("unpack_PGAP_database", args.unpack)

downloader = Downloader(download_dir)

# data_url = "https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.HMM.tgz"
# meta_url = "https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.tsv"
# logger.info("Downloading PGAP database")
# try:
# PGAP_file = download_dir / "hmm_PGAP.HMM.tgz"
# meta_file = download_dir / "hmm_PGAP.tsv"
# download_file(data_url, PGAP_file)
# download_file(meta_url, meta_file)
# logger.info("Database dowloaded successfully\n")
# config.update_config("data_downloaded", True)
# config.update_config("PGAP_database", PGAP_file.as_posix())
# config.update_config("PGAP_meta_file", meta_file.as_posix())
# except Exception:
# logger.exception(
# "Failed to download PGAP database. Please check your internet connection."
# )
# sys.exit(1)
# logger.info("Removing missing entries from PGAP metadata file")
# PGAP(meta_file).remove_missing_HMMs_from_metadata(PGAP_file, meta_file)
# if args.unpack:
# logger.info("Unpacking PGAP database")
# unpacked_PGAP_dir = download_dir / "hmm_PGAP"
# PGAP.extract_PGAP_to_directory(PGAP_file, output_dir=unpacked_PGAP_dir)
# os.remove(PGAP_file)
# config.update_config("PGAP_database", unpacked_PGAP_dir.as_posix())
# logger.info("PGAP database unpacked successfully")

if args.pgap:
logger.info("Downloading PGAP database")
try:
PGAP_path, PGAP_meta_file = download_pgap(download_dir, unpack=args.unpack)
PGAP(PGAP_path, PGAP_meta_file).remove_missing_HMMs_from_metadata(
PGAP_meta_file
)
config.update_config("unpack_PGAP_database", args.unpack)
logger.info("PGAP database downloaded successfully\n")
config.update_config("PGAP_data_downloaded", True)
config.update_config("PGAP_database", PGAP_path.as_posix())
config.update_config("PGAP_meta_file", PGAP_meta_file.as_posix())
except Exception:
logger.exception(
"Failed to download PGAP database. Please check your internet connection."
)
sys.exit(1)

if args.pfam:
logger.info("Downloading PFAM-A database")
try:
PFAM_meta_file = download_dir / "hmm_PFAM.tsv"
PFAM_path = download_dir / "PFAM_hmms"
PFAM_gz_file = download_pfam(download_dir, unpack=True)
pfam = PFAM.from_gz_file(
PFAM_gz_file,
hmm_outdir=PFAM_path,
meta_outfile=PFAM_meta_file,
)
config.update_config("unpack_PFAM_database", True)
logger.info("PFAM database downloaded successfully\n")
config.update_config("PFAM_data_downloaded", True)
config.update_config("PFAM_database", PFAM_path.as_posix())
config.update_config("PFAM_meta_file", PFAM_meta_file.as_posix())
except Exception:
logger.exception(
"Failed to download PFAM-A database. Please check your internet connection."
)
sys.exit(1)
logging.shutdown()


Expand Down
Loading

0 comments on commit b472abb

Please sign in to comment.