forked from stenglein-lab/taxonomy_pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
acc_to_taxid.pm
executable file
·106 lines (86 loc) · 2.54 KB
/
acc_to_taxid.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#
# The functions in this package return NCBI Taxids for NCBI accessions #
# Depends on sorted_file_search_by_field.pm package
# Depends also on local copy of NCBI taxonomy db (see below)
# Depends on perl DBI package (to install: cpan DBI)
#
# Mark Stenglein, June 7, 2011
# Updated: 2/25/2020 to use sqlite3 db instead of mysql
#
package acc_to_taxid;
use DBI;
use strict;
use base 'Exporter';
our @EXPORT = qw(acc_to_taxid);
# connect to mysql database
# my $dbh = DBI->connect("DBI:mysql:database=NCBI_Taxonomy",
# "NCBI_Taxonomy", "NCBI_Taxonomy",
# {'RaiseError' => 1}) or die $DBI::errstr;
# TODO: don't hardcode this path
my $taxonomy_db = "/home/databases/NCBI_Taxonomy/ncbi_taxonomy.sqlite3";
# connect to sqlite database
my $dbh = DBI->connect("DBI:SQLite:dbname=$taxonomy_db",
'','',
{'RaiseError' => 1}) or die $DBI::errstr;
sub acc_to_taxid
{
my @taxids = _acc_to_taxid($dbh, @_);
return @taxids;
}
sub _acc_to_taxid
{
my $dbh = shift @_;
my @accs = @_;
my %acc_taxid_map = ();
# check to see if accession in gi|XXXXX| format
for (my $i = 0; $i < (scalar @accs); $i++)
{
# if this gi is in the form it is in NCBI BLAST results
if ($accs[$i] =~ /gi\|(\d+)\|/)
{
die ("not supported gi| format")
}
# convert from acc.version format to just acc
# TODO: should this be configurable?
if ($accs[$i] =~ /(\S+)\.\S+|/)
{
$accs[$i] = $1;
}
}
my @taxids = ();
# determine TAXID for this accession
my $num_accs = scalar @accs;
my $qs = ("?");
if ($num_accs > 1)
{
my @qs_array = ("?") x (scalar @accs);
$qs = join (", ", @qs_array);
}
my $sql_string = "SELECT acc, taxid FROM acc_taxid_map where acc in ( $qs )";
# warn "$sql_string\n";
my $sth = $dbh->prepare( $sql_string );
$sth->execute(@accs);
# warn "ACCS: @accs\n";
# iterate through rows of mysql output
# need to do this because:
#
# (1) some ACCs might not return TAXIDs
# (2) results not necessarily in order of input
#
while ( my ($acc, $taxid) = $sth->fetchrow_array())
{
# warn "$acc->$taxid\n";
$acc_taxid_map{$acc} = $taxid;
}
# create array to return
# will return an array that is 1:1 with input array
foreach my $acc (@accs)
{
push @taxids, $acc_taxid_map{$acc};
}
return @taxids;
}
# disconnect from database when module goes out of scope...
# $dbh->disconnect;
# PERL packages must return a true value
1;