sradownloader (forked from s-andrews/sradownloader) — executable file, 377 lines (278 loc), 14 KB
#!/usr/bin/env python3
#############################################################################
# Copyright 2020 Simon Andrews
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
############################################################################
import sys
import subprocess
import os
import pandas as pd
import argparse
import glob
import urllib.request
import re
import threading
from ftplib import FTP
VERSION = "3.9"
# These are the symbols we're going to need to remove
# from any proposed file names
invalid_characters = [" ","/",">","<","|","\"","?","*","(",")","\\","[","]","{","}","!"]
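# Download the fastq files for one sample from ENA: look up the run's fastq FTP
# URLs through the ENA filereport API, then fetch each file over FTP, keeping the
# control connection alive during long transfers.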
def download_sample_ena (sample, options):
if not options.quiet:
print (f"[ENA] Downloading {sample['accession']} into {sample['file_base']}", flush=True)
# Get the FTP locations for the sample
ena_rest_url = f"https://www.ebi.ac.uk/ena/portal/api/filereport?accession={sample['accession']}&result=read_run&fields=run_accession,fastq_ftp"
with urllib.request.urlopen(ena_rest_url) as response:
if response.getcode() != 200:
raise IOError(f"[ENA] Got error code {response.getcode()} from ENA REST for accession {sample['accession']}")
url_list = []
found_accession_line = False
for line in response:
line = line.decode("UTF-8").strip()
            #print(line)
if line.startswith("SRR") or line.startswith("ERR"):
found_accession_line = True
sections = line.split("\t")
if len(sections) != 2:
raise IOError(f"[ENA] No ENA fastq files found for accession {sample['accession']}")
url_list = sections[1].split(";")
break
if not found_accession_line:
raise IOError(f"[ENA] Found no accession in response from ENA REST for accession {sample['accession']}")
for url in url_list:
# We need to extract the various sections from the URL. This will be the server,
# the folder and the file
url_sections = url.split("/")
server = url_sections[0]
file = url_sections[-1]
folder = "/".join(url_sections[1:-1])
#print(f"URL is {url} server={server} folder={folder} file={file}")
# Work out the output file name. We're assuming there's never
# going to be more than 2 files per sample
if url.endswith("_2.fastq.gz"):
outfile=sample['file_base'] + "_2.fastq.gz"
else:
outfile=sample['file_base'] + "_1.fastq.gz"
if options.outdir != ".":
outfile = f"{options.outdir}/{outfile}"
# Check if the output file already exists. If it does then don't overwrite
# unless --force has been specified
if os.path.exists(outfile) and os.path.getsize(outfile) > 0:
if not options.force:
if not options.quiet:
print (f"Skipping {url} as outfile {outfile} exists (override with --force)", flush=True)
continue
attempt_number = 1
while (attempt_number <= options.retries):
if not options.quiet:
print (f"[ENA Attempt {attempt_number}] Downloading {url} into {outfile}", flush=True)
try:
#print(f"Connecting to {server}")
ftp = FTP(server)
#print(f"Logging in")
ftp.login()
#print(f"Moving to {folder}")
ftp.cwd(folder)
                # We had problems with a simple retrbinary() call causing the control channel to
                # time out because the transfer took so long, so we're going with this more
                # complicated multithreaded version: one thread does the download while the main
                # thread sends a NOOP down the control channel every minute to keep it alive.
with open(outfile, 'wb') as ftp_out:
sock = ftp.transfercmd('RETR ' + file)
def background():
while True:
block = sock.recv(1024*1024)
if not block:
break
ftp_out.write(block)
sock.close()
t = threading.Thread(target=background)
t.start()
while t.is_alive():
t.join(60)
ftp.voidcmd('NOOP')
#print(f"Retrieving {file}")
ftp.quit()
# Check that the file we downloaded had a non-zero size
if os.stat(outfile).st_size == 0:
# It thinks it worked, but nothing was downloaded
os.unlink(outfile)
raise IOError("Received zero sized file")
except Exception as ex:
print(ex)
if attempt_number == options.retries:
attempt_number += 1
raise IOError(f"[ENA] Download repeatedly failed for {sample['accession']} - giving up")
else:
print(f"[ENA] Download failed for {sample['accession']} - going around for another try", flush=True)
attempt_number += 1
continue
# It worked
break
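# Download the fastq files for one sample from NCBI by running fasterq-dump
# (--split-files --include-technical), retrying up to options.retries times,
# then gzip-compressing the fastq files it produced.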
def download_sample_ncbi (sample, options):
if not options.quiet:
print (f"[NCBI] Downloading {sample['accession']} into {sample['file_base']}", flush=True)
    # We add --include-technical because 10X runs mark the essential barcode
    # reads as technical, so they wouldn't be downloaded otherwise.
command_options = [options.fqdump,"--split-files","--include-technical","--threads",options.threads,"--temp",options.outdir,"--outfile",sample["file_base"]+".fastq"]
# If they're attached to a terminal and they're not being quiet then we'll show progress
if not options.quiet:
if sys.stdin.isatty():
command_options.append("--progress")
# If they're not using the current directory as output we'll say where it should go
if options.outdir != ".":
command_options.append("--outdir")
command_options.append(options.outdir)
# Finally add the accession
command_options.append(sample['accession'])
command_options = [str(x) for x in command_options]
attempt_number = 1
while (attempt_number <= options.retries):
if not options.quiet:
print(f"[NCBI Attempt {attempt_number}] Running: "+" ".join(command_options), flush=True)
result = subprocess.run(command_options, check=False)
if result.returncode != 0:
if attempt_number == options.retries:
attempt_number += 1
raise IOError(f"[NCBI] Sorry SRAtoolkit just isn't having it, giving up on {sample['accession']}")
else:
print(f"[NCBI] SRAtoolkit failed us - going around for another try", flush=True)
attempt_number += 1
continue
else:
# Amazingly it worked
break
if attempt_number > options.retries:
# This failed so don't try to do anything else with it
return
# Now find the files which were created and compress them
downloaded_files = glob.glob(options.outdir+"/"+sample['file_base']+"*.fastq")
if len(downloaded_files) == 0:
raise IOError(f"[NCBI] Got no files for accession {sample['accession']}")
for file in downloaded_files:
if not options.quiet:
print("Compressing "+file, flush=True)
subprocess.run(["gzip","-4",file], check=True)
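# Look up a human-readable sample name for an SRR accession from the SRA text
# report, joining the words after "Title:" with underscores.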
def get_geo_name (srr_number, options):
sample_name = ""
if not options.quiet:
print(f"Trying to get name for {srr_number} from GEO", flush=True)
with urllib.request.urlopen(f"https://www.ncbi.nlm.nih.gov/sra/?term={srr_number}&format=text") as response:
for line in response:
line = line.decode("UTF-8")
if line.startswith("Title:"):
line = line.strip()
geosections = re.split("[:; ,]+",line)
sample_name = "_".join(geosections[1:])
break
return sample_name
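# Read the run table (an Excel sheet) and return a list of sample dicts.
# The sheet is expected to contain at least these columns, for example
# (values here are illustrative only):
#
#   SRR          name_full       SEorPE
#   SRR0000001   sampleA_rep1    PAIRED
#   SRR0000002   sampleB_rep1    SINGLE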
def read_samples (options):
if not options.quiet:
print(f"Reading samples from {options.runtable}", flush=True)
sample_file = options.runtable
samples = []
    try:
        df00_samples = pd.read_excel(sample_file)
        for i, row in df00_samples.iterrows():
            accession = row['SRR']
            file_base = row['name_full']
            seqtype = row['SEorPE']
            if not options.quiet:
                print(f"Found sample {accession} with basename {file_base}", flush=True)
            sample = {
                "accession": accession,
                "file_base": file_base,
                "SEorPE": seqtype
            }
            samples.append(sample)
    except FileNotFoundError:
        print(f"ERROR: Couldn't find run table {sample_file}", file=sys.stderr, flush=True)
        sys.exit(1)
    except KeyError as ex:
        print(f"ERROR: Run table {sample_file} is missing expected column {ex}", file=sys.stderr, flush=True)
        sys.exit(1)
return samples
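# Parse the command line, check that fasterq-dump and gzip are available unless
# NCBI downloads are disabled, and create the output directory if needed.
# Example invocation (the run table file name is illustrative):
#
#   sradownloader --outdir fastq --threads 4 run_table.xlsx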
def read_options():
parser = argparse.ArgumentParser(description="Download data from the SRA")
    parser.add_argument('--quiet', dest="quiet", action='store_true', default=False, help="Suppress all but essential messages")
parser.add_argument('--version', action='version', version=f"SRA downloader v{VERSION}")
parser.add_argument('--outdir', type=str, help="Folder to save data to (default .)", default=".")
parser.add_argument('--threads', type=int, help="Number of threads (default 1)", default=1)
parser.add_argument('--retries', type=int, help="Number of times we'll retry a download before giving up (default 5)", default=5)
    parser.add_argument('--force', dest="force", action='store_true', default=False, help="Overwrite output files even if they exist")
parser.add_argument('--fqdump', type=str, help="Path to the fastq dump program (default fasterq-dump)", default="fasterq-dump")
parser.add_argument('--nogeo', dest="nogeo", action='store_true', help="Disable sample name lookup from GEO")
parser.add_argument('--noena', dest="noena", action='store_true', help="Don't try downloading from ENA")
    parser.add_argument('--noncbi', dest="noncbi", action='store_true', help="Don't try downloading from NCBI")
    parser.add_argument('runtable', type=str, help="The run table (Excel file with SRR, name_full and SEorPE columns) listing the samples to download")
options = parser.parse_args()
if not options.noncbi:
# Can we find fasterq-dump
if not options.quiet:
print("Testing for fasterq-dump at "+options.fqdump, flush=True)
try:
subprocess.run([options.fqdump,"--version"], check=True, stdout=subprocess.DEVNULL)
if not options.quiet:
print("Found fasterq-dump at "+options.fqdump, flush=True)
except:
print ("WARNING: Couldn't find fasterq-dump at "+options.fqdump+". Please ensure that sratoolkit is downloaded and that you've run vdb-config if you want to download from NCBI", file=sys.stderr, flush=True)
options.noncbi = True
# Can we find gzip
if not options.quiet:
print("Testing for gzip in the path", flush=True)
try:
subprocess.run(["gzip","--version"], check=True, stdout=subprocess.DEVNULL)
if not options.quiet:
print("Found gzip", flush=True)
except:
print ("WARNING: Couldn't find gzip in the path - can't download from NCBI without this.", file=sys.stderr, flush=True)
options.noncbi = True
if options.noncbi and options.noena:
print ("ERROR: Both NCBI and ENA download options are unavailable. Giving up", file=sys.stderr, flush=True)
sys.exit(1)
# Create the output directory if it's not there already
if not os.path.isdir(options.outdir):
os.makedirs(options.outdir)
return options
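# For each sample try ENA first (unless --noena), fall back to NCBI (unless
# --noncbi), record SUCCEEDED or FAILED per accession, and print a summary at
# the end.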
def main():
options = read_options()
samples = read_samples(options)
results = {}
for sample in samples:
try:
if options.noena:
raise Exception("ENA disabled with --noena, not trying that.")
download_sample_ena(sample,options)
results[sample['accession']] = "SUCCEEDED"
except Exception as ex:
if options.noncbi:
print(f"WARNING: Failed to download via ENA: {ex}. Can't try NCBI - giving up - sorry.", file=sys.stderr, flush=True)
results[sample['accession']] = "FAILED"
else:
print(f"WARNING: Failed to download via ENA: {ex} trying NCBI instead", file=sys.stderr, flush=True)
try:
download_sample_ncbi(sample, options)
results[sample['accession']] = "SUCCEEDED"
except Exception as ex2:
print(f"WARNING: Failed to download via NCBI: {ex2}. Giving up - sorry.", file=sys.stderr, flush=True)
results[sample['accession']] = "FAILED"
        # Remove the "_1" suffix from single-end data so the output is just <file_base>.fastq.gz
        if sample["SEorPE"] == "SINGLE":
            file1 = options.outdir + "/" + sample['file_base'] + "_1.fastq.gz"
            file2 = options.outdir + "/" + sample['file_base'] + ".fastq.gz"
            if os.path.exists(file1):
                os.replace(file1, file2)
if not options.quiet:
print(f"\nAll done!\n\nRESULTS\n-------", flush=True)
for accession in sorted(results.keys()):
print(f"{accession}:\t{results[accession]}", flush=True)
if __name__ == "__main__":
main()