forked from stenglein-lab/taxonomy_pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fetch_gi_description.pm
executable file
·135 lines (106 loc) · 2.9 KB
/
fetch_gi_description.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env perl
# given a GI, return its description
# need to search by NCBI db (nuc or prot, so have to specify)
#
# Mark Stenglein Sept 21, 2012
package fetch_gi_description;
use base 'Exporter';
our @EXPORT = qw(fetch_gi_description);
use strict;
use Getopt::Long;
use fetch_gi_esummary;
use LWP::Simple;
use Time::HiRes;
# input: an array of NCBI GIs
# output: a reference to a hash of GI->descriptions
sub fetch_gi_description
{
   # Look up the eSummary "Title" (description) for each GI passed in.
   #
   # input:  a list of NCBI GIs
   # output: a reference to a hash of GI -> description; a GI for which no
   #         Title line was found in the response is simply absent.
   #         An empty input list yields a reference to an empty hash.
   my @gis = @_;

   # warn "fetching descriptions for GIs: @gis\n";

   my %gi_desc_hash = ();

   if (scalar @gis == 0) { return \%gi_desc_hash; }

   # my @dbs_to_try = qw (protein nucleotide);
   # it looks like searching nucleotide db w/ protein GIs works for eSummary
   my @dbs_to_try = qw (nucleotide);

   foreach my $db (@dbs_to_try)
   {
      my $result = fetch_gi_esummary(@gis, $db);

      # warn "esummary: result $result\n";

      # no text came back for this db, so there is nothing to parse
      next if (!defined $result);

      # parse XML results manually, one line at a time.
      # The text is split directly instead of being read through an
      # in-memory filehandle: perl refuses to open such a handle on a string
      # that holds code points above 0xFF, and reading through a handle
      # would also depend on the caller's current value of $/.
      my $gi = undef;

      foreach my $line (split /\n/, $result)
      {
         if ($line =~ /<Id>(\S+)<\/Id>/)
         {
            # every Title that follows belongs to this id until the next <Id>
            $gi = $1;
         }
         elsif ($line =~ /<Item Name="Title" Type="String">(.+)<\/Item>/)
         {
            my $description = $1;

            # a Title seen before any <Id> cannot be attributed to a GI
            next if (!defined $gi);

            # warn "$gi\t$description\n";
            $gi_desc_hash{$gi} = $description;
         }
      }
   }

   return \%gi_desc_hash;
}
#
# retrieve the results of an esummary query from an NCBI database
#
# Mark Stenglein
# September 21, 2012
sub fetch_gi_esummary
{
   # Run an esummary query for a list of ids, a batch at a time.
   #
   # input:  a list of ids followed by the database name (last argument)
   # output: the concatenated text returned by fetch_one_batch for every
   #         batch, or undef when no ids were given
   my $db = pop (@_);
   my @pending = @_;

   # ids are sent to fetch_one_batch at most this many at a time
   my $batch_size = 200;

   my $result = undef;

   # peel batches off the front of the list until none are left
   while (my @batch = splice(@pending, 0, $batch_size))
   {
      $result .= fetch_one_batch(@batch, $db);

      # after each full batch, wait a third of a sec to avoid overloading
      # NCBI servers (per their request); a short final batch is not
      # followed by a pause
      if (scalar @batch == $batch_size)
      {
         Time::HiRes::usleep(333333);
      }
   }

   return $result;
}
# here, actually fetch data from NCBI
sub fetch_one_batch
{
   # Fetch the esummary records for one batch of ids from NCBI.
   #
   # input:  a list of ids followed by the database name (last argument)
   # output: whatever get() returns for the esummary URL; undef when the
   #         request did not succeed (a warning is emitted in that case)
   #
   # This sub takes arguments, so it is declared without a prototype: an
   # empty () prototype would make any call compiled after this definition
   # that passes arguments a compile-time error.
   my $db = pop (@_);
   my @ids = @_;

   # construct url
   # NOTE(review): NCBI's E-utilities documentation lists
   # https://eutils.ncbi.nlm.nih.gov/entrez/eutils/ as the base URL - confirm
   # that this older http address is still served before relying on it.
   my $base_url = "http://www.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=$db&tool=fetch_gi_summary&email=markstenglein_at_yahoo.com";

   # comma-separated id list, with no trailing comma
   my $id_url = "&id=" . join(",", @ids);

   my $url = $base_url.$id_url;

   # warn "URL: $url\n";

   # here is actual interweb transaction
   my $efetch_result = get($url);

   # report a failed request instead of silently handing back undef
   if (!defined $efetch_result)
   {
      warn "fetch_one_batch: no result returned for esummary query of db $db\n";
   }

   # simply dump the data to stdout
   # print "$efetch_result\n";

   return $efetch_result;

   # code below loads result into an XML datastructure
   # my $xs = XML::Simple->new(forcearray=>1);
   # my $ref = $xs->XMLin($efetch_result);
}