From c16b01cad95cae4591c4286c5fc7c7760078f4f7 Mon Sep 17 00:00:00 2001 From: Istvan Albert Date: Thu, 19 Oct 2023 14:10:26 -0400 Subject: [PATCH] adding fastq download --- biorun/srr.py | 178 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 biorun/srr.py diff --git a/biorun/srr.py b/biorun/srr.py new file mode 100644 index 0000000..357fc35 --- /dev/null +++ b/biorun/srr.py @@ -0,0 +1,178 @@ +''' +Module that deals with SRR numbers. +''' +from biorun.libs import placlib as plac +import re, json, sys, gzip, os, subprocess +from biorun import utils +from itertools import * +import requests +from urllib.parse import urlparse + +# SRR numbers: SRR5260547 +SRR = re.compile(r'(ERR|SRR|DRR|SRP|ERP)\d+') + +logger = utils.logger + +# ENA API points +ENA_API = "https://www.ebi.ac.uk/ena/portal/api" +ENA_FIELDS = f"{ENA_API}/returnFields" +ENA_REPORT = f"{ENA_API}/filereport" + +# The download command with aria2c +CMD = 'aria2c -x 5 -c --summary-interval 10' + +# Match SRR numbers +def match_srr(text): + """ + Pattern for SRR numbers. + """ + return bool(SRR.search(text)) + +def sumbit_request(url, params={}): + + try: + r = requests.get(url, params=params) + if not r.ok: + r.raise_for_status() + sys.exit() + return r.text + + except Exception as exc: + utils.error(f"Error when searching for {srr}") + + +def read_lines(url, N=100): + + try: + r = requests.get(url, stream=True) + + r.raise_for_status() + + stream = gzip.open(r.raw, mode='rt', encoding='utf-8') + stream = islice(stream, N) + + for line in stream: + yield (line) + + except Exception as exc: + print(f"# Error: {exc}") + +# Fetch metadata +def get_metadata(srr): + logger.info(f"Searching Ensembl for {srr}") + + url = ENA_REPORT + + fields = [ + 'run_accession', + "sample_accession", + "sample_alias", + "sample_description", + 'first_public', + 'country', + 'scientific_name', + 'fastq_bytes', + 'base_count', + 'read_count', + 'library_name', + "library_strategy", + "library_source", + 'library_layout', + 'instrument_platform', + 'instrument_model', + 'study_title', + 'fastq_ftp', + ] + + fields = ",".join(fields) + + params = dict( + accession=srr, + fields=fields, + format='json', + result='read_run', + ) + + resp = sumbit_request(url, params=params) + + try: + resp = json.loads(resp) + except Exception as exc: + utils.error(f"JSON decoding error: {exc}") + + return resp + + +def test_metadata(srr='ERR12058121'): + + meta = get_metadata(srr=srr) + + fastq_ftps = meta[0]['fastq_ftp'].split(';') + + print (fastq_ftps) + + +@plac.pos("srr", help="the srr numbers", ) +@plac.opt("dname", help="the directory name", abbrev='d') +@plac.opt("prefix", help="optional prefix to the filenames", abbrev='p') +@plac.opt("n", help="how many reads to download", abbrev='n') +@plac.flg("all", help="download all reads", abbrev='a') +def run(srr='ERR12058121', dname='reads', n=12, prefix='', all=False): + + # Set the prefix + prefix = prefix or srr + + logger.setLevel("INFO") + + # Make directory dname + os.makedirs(dname, exist_ok=True) + + meta = get_metadata(srr=srr) + urls = meta[0]['fastq_ftp'].split(';') + urls = map(lambda x: f"https://{x}", urls) + urls = list(urls) + + for idx, url in enumerate(urls): + + fname = f"{prefix}_{idx+1}.fastq.gz" + fpath = os.path.join(dname, fname) + + if all: + # Download all reads + logger.info(f"Downloading all reads with aria2") + pname = CMD.split()[0] + exit_code = os.system(f"command -v {pname} > /dev/null 2>&1") + if exit_code != 0: + logger.error(f"Unable to run: {pname}") + logger.error(f"Installation: micromamba install aria2c") + sys.exit(1) + + cmd = f"{CMD} -o {fpath} {url}" + logger.info(f"Running: {cmd}") + sys.stderr.flush() + exit_code = os.system(cmd) + if exit_code != 0: + logger.error(f"Error when running: {cmd}") + sys.exit(1) + + else: + # Stream to a file + logger.info(f"Limit to {n} reads") + + logger.info(f"Downloading {url}") + + fp = gzip.open(fpath, mode='wb') + stream = read_lines(url, N=4) + + logger.info(f"Saving to {fpath}") + + for line in stream: + line = line.encode("utf-8") + fp.write(line) + + + +if __name__ == '__main__': + plac.call(run) + +