-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgi2taxonomy2.pl
executable file
·209 lines (176 loc) · 6.5 KB
/
gi2taxonomy2.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
# script gi2taxonomy2.pl
# from a text file with gi accession (one per line)
# get corresponding taxid from the ncbi dump nucl_gb.accession2taxid.gz
# and from nucl_wgs.accession2taxid.gz (whole genomes)
# using the taxid, get the corresponding 7-level taxonomy from the ncbi taxonomy
# (created using https://github.com/zyxue/ncbitax2lin)
# return a new text file with:
# gi,taxid,domain,kingdom,phylum,class,order,family,genus,species
# save hashes to gzipped dumps for next runs
# NOTE: this script uses a lot of RAM to store large hashes
# it may not work on every computer without 'complaining'
# SP@NC 2023-08-30; v1.0
#
# visit our Git: https://github.com/Nucleomics-VIB
$| = 1; # Turn off output buffering
my $gi_file;
my $gbaccession2taxid = 'nucl_gb.accession2taxid.gz';
my $wgsaccession2taxid = 'nucl_wgs.accession2taxid.gz';
my $lineages_file;
my $output_file;
# Define hash filenames
my $gi_taxid_dump_file0 = 'gi_taxid_dump.dat0.gz';
my $gi_taxid_dump_file = 'gi_taxid_dump.dat.gz';
my $taxonomy_levels_dump_file = 'taxonomy_levels_dump.dat.gz';
# Parse command line arguments using Getopt::Long
GetOptions(
"i=s" => \$gi_file,
"n=s" => \$gbaccession2taxid,
"g=s" => \$wgsaccession2taxid,
"l=s" => \$lineages_file,
"o=s" => \$output_file,
);
# Check for the correct number of arguments
if (!defined $gi_file || !defined $lineages_file || !defined $output_file) {
die "Usage: $0 -i gi_list.txt -l lineages.gz -o output.csv [-n gbaccession2taxid] [-g wgsaccession2taxid]\n";
}
###############
# Read GI list
###############
open my $gi_fh, '<', $gi_file or die "Error opening $gi_file: $!";
my @gi_list = <$gi_fh>;
close $gi_fh;
#############################
# Hashes to store data
#############################
my %gi_taxid_map;
my %taxonomy_levels;
# Check if dump files exist and load data if they do
if (-e $gi_taxid_dump_file && -e $taxonomy_levels_dump_file) {
print "# Loading existing hash data from previously dumped files...\n";
%gi_taxid_map = load_dump($gi_taxid_dump_file);
%taxonomy_levels = load_dump($taxonomy_levels_dump_file);
} else {
print "# no hash dumps found, loading data from archives and creating dumps\n";
######### 1: nucl_gb.accession2taxid.gz ##########
print "# loading the ncbi gb_accession2taxid data from $gbaccession2taxid\n";
open my $gbgiacc_fh, '-|', "zcat $gbaccession2taxid" or die "Error opening $gbaccession2taxid: $!";
# code for loading gbaccession2taxid
my $progress1 = 0;
while (<$gbgiacc_fh>) {
chomp;
my @fields = split /\t/;
my ($taxid, $gi) = ($fields[2], $fields[3]); # Taxid is in column 3, GI is in column 4
$gi_taxid_map{$gi} = $taxid;
$progress1++;
if ($progress1 % 500000 == 0) {
print ".";
}
}
close $gbgiacc_fh;
print "\n";
# to be removed: debug
print_top_values(\%gi_taxid_map, 20);
print "# saving the gb part of the gi_taxid_map hash to disk for faster reload next run\n";
save_dump(\%gi_taxid_map, $gi_taxid_dump_file0);
######### 2: nucl_wgs.accession2taxid.gz ##########
print "# loading the ncbi wgs_accession2taxid data from $wgsaccession2taxid\n";
open my $wgsgiacc_fh, '-|', "zcat $wgsaccession2taxid" or die "Error opening $wgsaccession2taxid: $!";
# code for loading wgsaccession2taxid
my $progress2 = 0;
while (<$wgsgiacc_fh>) {
chomp;
my @fields = split /\t/;
my ($taxid, $gi) = ($fields[2], $fields[3]); # Taxid is in column 3, GI is in column 4
$gi_taxid_map{$gi} = $taxid;
$progress2++;
if ($progress2 % 500000 == 0) {
print ".";
}
}
close $wgsgiacc_fh;
print "\n";
# Save hash to dump files
print "# saving the gi_taxid_map hash to disk for faster reload next run\n";
save_dump(\%gi_taxid_map, $gi_taxid_dump_file);
#######################################
# Hash to store gi and taxonomy levels
#######################################
print "# loading the ncbi lineage data from $lineages_file\n";
open my $lineage_fh, "-|", "zcat $lineages_file" or die "Error opening $lineages_file: $!";
# code for loading lineage data
my $progress3 = 0;
while (my $line = <$lineage_fh>) {
chomp $line;
my @fields = split /,/, $line;
next if scalar(@fields) < 8; # Skip lines with less than 8 columns
my $taxid = shift @fields;
my $taxonomy_string = join(',', @fields[0..6]);
$taxonomy_levels{$taxid} = $taxonomy_string;
$progress3++;
if ($progress3 % 100000 == 0) {
print ".";
}
}
close $lineage_fh;
print "\n";
# Save hash to dump files
print_top_values(\%taxonomy_levels, 20);
print "# saving the taxonomy_levels hash to disk for faster reload next run\n";
save_dump(\%taxonomy_levels, $taxonomy_levels_dump_file);
}
####################################
# Process GI list and print results
####################################
open my $output_fh, '>', $output_file or die "Error opening $output_file for writing: $!";
foreach my $gi (@gi_list) {
chomp $gi;
if (exists $gi_taxid_map{$gi}) {
my $taxid = $gi_taxid_map{$gi};
if (exists $taxonomy_levels{$taxid}) {
print $output_fh "$gi,$taxid,$taxonomy_levels{$taxid}\n";
} else {
print $output_fh "$gi,$taxid,,\n";
}
} else {
warn "$gi not found\n";
}
}
close $output_fh;
########################################
# functions to load and dump large hash
########################################
sub save_dump {
my ($data_ref, $filename) = @_;
open my $dump_fh, '|-', "gzip -c > $filename" or die "Error opening $filename for writing: $!";
foreach my $key (keys %$data_ref) {
print $dump_fh "$key $data_ref->{$key}\n";
$dump_fh->flush();
}
close $dump_fh;
}
sub load_dump {
my ($filename) = @_;
open my $dump_fh, '-|', "gzip -dc $filename" or die "Error opening $filename for reading: $!";
my %data;
while (<$dump_fh>) {
chomp;
my ($key, $value) = split;
$data{$key} = $value;
}
close $dump_fh;
return %data;
}
# print the first N keys and values from a hash (debug)
sub print_top_values {
my ($hash_ref, $top_count) = @_;
foreach my $key (sort { $$hash_ref{$b} <=> $$hash_ref{$a} } keys %$hash_ref) {
last if $top_count <= 0;
print "Key: $key, Value: $$hash_ref{$key}\n";
$top_count--;
}
}