This repository has been archived by the owner on Oct 31, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
collect_genomevirus.pl
executable file
·109 lines (106 loc) · 5.07 KB
/
collect_genomevirus.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/perl
# $Revision: 0.5 $
# $Date: 2016/05/27 $
# $Id: collect_genomevirus.pl $
# $Author: Michaël Bekaert $
#
#
# This file is part of lamp-virus. It collects all complete genomes using the provided
# ENTREZ query and align them.
#
# Copyright (C) 2014-2016, Michaël Bekaert <michael.bekaert@stir.ac.uk>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
use strict;
use warnings;
use Getopt::Long;
use Bio::SeqIO;
use Bio::DB::Query::GenBank;
use Bio::DB::GenBank;
#----------------------------------------------------------
our ($VERSION) = 0.5;
my $pathtemplate = '/opt';
#----------------------------------------------------------
my $query_string = 'txid64320[Organism:exp]';
my ($verbose, $after, $before, $run, $ok, $skip, $missing, $genome) = (0, 0, 1900 + (localtime(time))[5], 1, 0, 0);
GetOptions('p|prefix=s' => \$genome, 'q|query=s' => \$query_string, 'a|after:i' => \$after, 'b|before:i' => \$before, 'r|run!' => \$run, 'v|verbose!' => \$verbose);
if (defined $query_string && defined $genome && !-e ($genome . '.fasta') && defined $after && defined $before && open(my $seq_out, q{>}, $genome . '.fasta'))
{
my $query = Bio::DB::Query::GenBank->new(-db => 'nucleotide', -query => $query_string . ' AND (complete[All Fields] AND genome[All Fields])', -mindate => ($after > 0 ? "$after" : "0"), -maxdate => "$before");
my $gb = Bio::DB::GenBank->new();
my $stream = $gb->get_Stream_by_query($query);
while (my $seq = $stream->next_seq)
{
print {*STDERR} '>', $seq->id(), "\n" if ($verbose);
my ($date, $country);
for my $feat_object ($seq->get_SeqFeatures)
{
if ($feat_object->primary_tag eq 'source')
{
for my $tag ($feat_object->get_all_tags)
{
if ($tag eq 'country')
{
for my $value ($feat_object->get_tag_values($tag)) { $country = $value; }
}
elsif ($tag eq 'collection_date')
{
for my $value ($feat_object->get_tag_values($tag)) { $date = $1 if ($value =~ m/(\d{4})/); }
}
}
}
}
if (defined $date && defined $country)
{
if ($date > $after && $date <= $before)
{
$ok++;
print {$seq_out} q{>} . $seq->accession_number() . q{|} . $date . q{|} . $country . "\n" . $seq->seq() . "\n";
}
else { $skip++; }
}
else { print {*STDERR} ' !No metadata for accession: ', $seq->accession_number(), "\n"; $missing++; }
}
close $seq_out;
print {*STDERR} "\nNumber of sequence retrieved ", ($ok + $skip + $missing), ($after > 0 ? "\nNumber of sequence too old too young " . $skip : q{}), "\nNumber of sequence without data ", $missing,
"\nNumber of remaining sequences ", $ok, "\n";
if ($ok > 0 && -r ($genome . '.fasta'))
{
system 'GramAlign -q -F 1 -f 2 -i ' . $genome . '.fasta -o ' . $genome . '.align.fa';
unlink '_ga_temp.page0' if (-e '_ga_temp.page0');
if (-e ($genome . '.align.fa') && $verbose)
{
my $seqio = Bio::SeqIO->new('-format' => 'fasta', -file => $genome . '.align.fa');
my $seqobj = $seqio->next_seq();
print {*STDERR} 'Alignment length ', $seqobj->length, "\n";
}
if ($run && -e ($genome . '.align.fa') && -r ($pathtemplate . '/virus.R') && (open my $input, q{<}, $pathtemplate . '/virus.R') && (open my $output, q{>}, $genome . '.R'))
{
while (<$input>) { print {$output} $_; }
close $input;
print {$output} "\nprocess_virus('$genome','$genome.align.fa','$genome.results'", ($verbose ? ',TRUE' : q{}), ");\n";
close $output;
system 'Rscript --vanilla ' . $genome . '.R';
}
exit 0;
}
else { exit 2; }
}
else
{
print {*STDOUT}
"\nUsage: collect_genomevirus.pl --prefix <output prefix> --query <NCBI entrez query> [..]\nDescription: Collect all complete genomes using the provided ENTREZ query and align them.\n\n--prefix <file prefix>\n Filename output prefix. [mandatory]\n--query <query string>\n NCBI entrez query. [mandatory]\n--after <year>\n minimum date to retrieve from (YYYY).\n--before <year>\n maximum date to retrieve from (YYYY).\n--norun\n Disable the automatic run of R and the PCA and cluster analysis\n--verbose\n Becomes very chatty.\n\n";
exit 1;
}