-
Notifications
You must be signed in to change notification settings - Fork 0
/
rsync_from_ncbi.pl
186 lines (170 loc) · 6.35 KB
/
rsync_from_ncbi.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/bin/env perl
# Copyright 2013-2021, Derrick Wood <dwood@cs.jhu.edu>
#
# This file is part of the Kraken 2 taxonomic sequence classification system.
# Reads an assembly_summary.txt file, which indicates taxids and FTP paths for
# genome/protein data. Performs the download of the complete genomes from
# that file, decompresses, and explicitly assigns taxonomy as needed.
use strict;
use warnings;
use File::Basename;
use Getopt::Std;
use Net::FTP;
use List::Util qw/max/;
my $PROG = basename $0;
my $SERVER = "ftp.ncbi.nlm.nih.gov";
my $SERVER_PATH = "/genomes";
my $FTP_USER = "anonymous";
my $FTP_PASS = "kraken2download";
my $qm_server = quotemeta $SERVER;
my $qm_server_path = quotemeta $SERVER_PATH;
my $is_protein = $ENV{"KRAKEN2_PROTEIN_DB"};
my $use_ftp = $ENV{"KRAKEN2_USE_FTP"};
my $suffix = $is_protein ? "_protein.faa.gz" : "_genomic.fna.gz";
# Manifest hash maps filenames (keys) to taxids (values)
my %manifest;
while (<>) {
next if /^#/;
chomp;
my @fields = split /\t/;
my ($taxid, $asm_level, $ftp_path) = @fields[5, 11, 19];
# Possible TODO - make the list here configurable by user-supplied flags
next unless grep {$asm_level eq $_} ("Complete Genome", "Chromosome");
next if $ftp_path eq "na"; # Skip if no provided path
my $full_path = $ftp_path . "/" . basename($ftp_path) . $suffix;
# strip off server/leading dir name to allow --files-from= to work w/ rsync
# also allows filenames to just start with "all/", which is nice
if (! ($full_path =~ s#^https://${qm_server}${qm_server_path}/##)) { # https://github.com/DerrickWood/kraken2/issues/508; https://github.com/DerrickWood/kraken2/issues/518
die "$PROG: unexpected FTP path (new server?) for $ftp_path\n";
}
$manifest{$full_path} = $taxid;
}
open MANIFEST, ">", "manifest.txt"
or die "$PROG: can't write manifest: $!\n";
print MANIFEST "$_\n" for keys %manifest;
close MANIFEST;
if ($is_protein && ! $use_ftp) {
print STDERR "Step 0/2: performing rsync dry run (only protein d/l requires this)...\n";
# Protein files aren't always present, so we have to do this two-rsync run hack
# First, do a dry run to find non-existent files, then delete them from the
# manifest; after this, execution can proceed as usual.
system("rsync --dry-run --no-motd --files-from=manifest.txt rsync://${SERVER}${SERVER_PATH} . 2> rsync.err");
open ERR_FILE, "<", "rsync.err"
or die "$PROG: can't read rsync.err file: $!\n";
while (<ERR_FILE>) {
chomp;
# I really doubt this will work across every version of rsync. :(
if (/failed: No such file or directory/ && /^rsync: link_stat "\/([^"]+)"/) {
delete $manifest{$1};
}
}
close ERR_FILE;
print STDERR "Rsync dry run complete, removing any non-existent files from manifest.\n";
# Rewrite manifest
open MANIFEST, ">", "manifest.txt"
or die "$PROG: can't write manifest: $!\n";
print MANIFEST "$_\n" for keys %manifest;
close MANIFEST;
}
sub ftp_connection {
my $ftp = Net::FTP->new($SERVER, Passive => 1)
or die "$PROG: FTP connection error: $@\n";
$ftp->login($FTP_USER, $FTP_PASS)
or die "$PROG: FTP login error: " . $ftp->message() . "\n";
$ftp->binary()
or die "$PROG: FTP binary mode error: " . $ftp->message() . "\n";
$ftp->cwd($SERVER_PATH)
or die "$PROG: FTP CD error: " . $ftp->message() . "\n";
return $ftp;
}
if ($use_ftp) {
print STDERR "Step 1/2: Performing ftp file transfer of requested files\n";
open MANIFEST, "<", "manifest.txt"
or die "$PROG: can't open manifest: $!\n";
mkdir "all" or die "$PROG: can't create 'all' directory: $!\n";
chdir "all" or die "$PROG: can't chdir into 'all' directory: $!\n";
while (<MANIFEST>) {
chomp;
my $ftp = ftp_connection();
my $try = 0;
my $ntries = 5;
my $sleepsecs = 3;
while($try < $ntries) {
$try++;
last if $ftp->get($_);
warn "$PROG: unable to download $_ on try $try of $ntries: ".$ftp->message()."\n";
last if $try == $ntries;
sleep $sleepsecs;
$sleepsecs *= 3;
}
die "$PROG: unable to download ftp://${SERVER}${SERVER_PATH}/$_\n" if $try == $ntries;
$ftp->quit;
}
close MANIFEST;
chdir ".." or die "$PROG: can't return to correct directory: $!\n";
}
else {
print STDERR "Step 1/2: Performing rsync file transfer of requested files\n";
system("rsync --no-motd --files-from=manifest.txt rsync://${SERVER}${SERVER_PATH}/ .") == 0
or die "$PROG: rsync error, exiting: $?\n";
print STDERR "Rsync file transfer complete.\n";
}
print STDERR "Step 2/2: Assigning taxonomic IDs to sequences\n";
my $output_file = $is_protein ? "library.faa" : "library.fna";
open OUT, ">", $output_file
or die "$PROG: can't write $output_file: $!\n";
my $projects_added = 0;
my $sequences_added = 0;
my $ch_added = 0;
my $ch = $is_protein ? "aa" : "bp";
my $max_out_chars = 0;
for my $in_filename (keys %manifest) {
my $taxid = $manifest{$in_filename};
if ($use_ftp) { # FTP downloading doesn't create full path locally
$in_filename = "all/" . basename($in_filename);
}
open IN, "gunzip -c $in_filename |" or die "$PROG: can't read $in_filename: $!\n";
while (<IN>) {
if (/^>/) {
s/^>/>kraken:taxid|$taxid|/;
$sequences_added++;
}
else {
$ch_added += length($_) - 1;
}
print OUT;
}
close IN;
unlink $in_filename;
$projects_added++;
my $out_line = progress_line($projects_added, scalar keys %manifest, $sequences_added, $ch_added) . "...";
$max_out_chars = max(length($out_line), $max_out_chars);
my $space_line = " " x $max_out_chars;
print STDERR "\r$space_line\r$out_line" if -t STDERR;
}
close OUT;
print STDERR " done.\n" if -t STDERR;
print STDERR "All files processed, cleaning up extra sequence files...";
system("rm -rf all/") == 0
or die "$PROG: can't clean up all/ directory: $?\n";
print STDERR " done, library complete.\n";
sub progress_line {
my ($projs, $total_projs, $seqs, $chs) = @_;
my $line = "Processed ";
$line .= ($projs == $total_projs) ? "$projs" : "$projs/$total_projs";
$line .= " project" . ($total_projs > 1 ? 's' : '') . " ";
$line .= "($seqs sequence" . ($seqs > 1 ? 's' : '') . ", ";
my $prefix;
my @prefixes = qw/k M G T P E/;
while (@prefixes && $chs >= 1000) {
$prefix = shift @prefixes;
$chs /= 1000;
}
if (defined $prefix) {
$line .= sprintf '%.2f %s%s)', $chs, $prefix, $ch;
}
else {
$line .= "$chs $ch)";
}
return $line;
}