-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathgenbank2gtf_mRNA.pl
163 lines (133 loc) · 4.98 KB
/
genbank2gtf_mRNA.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/perl -w
#############################################
#Author: Jiang Li
#email: riverlee2008@gmail.com
#Creat Time: Tue 23 Oct 2012 01:37:54 PM CDT
#Vanderbilt Center for Quantitative Sciences
#############################################
use strict;
use warnings;
=pod
=head1 SYNOPSIS
Given a genbank format file (.gb), parse its feature parts (mRNA feature to get exon regions) to get information like transcript id, gene name, etc.,
and store the result in gtf format
=head1 USAGE
perl genbank2gtf_mRNA.pl input.gb chromosome >chromosome_mRNA.gtf
=head1 RESULTS
The output is a file named as chromosome_mRNA.gtf, in which chromosome is given as the input parameter
=cut
#Script begin
#Convert a genbank file into a gtf format file (Only parse mRNA feature)
my $usage = "perl $0 GenBank.gb chromosome >chromosome.gtf \n";
die $usage unless (@ARGV==2);
genbank2gtf($ARGV[0],$ARGV[1]);
sub genbank2gtf{
my ($in,$chr) = @_;
open(IN,$in) or die "The genbank annotation is not exists \n";
my $feature="";
my $flag=0;
#Getting feature lines
while(<IN>){
if(/^FEATURES/){
$flag=1;
next;
}
last if(/^ORIGIN/); #The following lines display the sequences
$feature.=$_ if ($flag);
}
close IN;
#Begin parse the $feature;
#print $feature;
my @features;
while($feature=~/^ {5}\S.*\n(^ {21}\S.*\n)*/gm){
push @features,$&;
}
#Loop each feature and only parse those mRNA features to get the exon region, together with other
#information like gene name, transcript id, etc.
foreach my $f (@features){
if($f=~/^ {5}(\S+).*\n(^ {21}\S.*\n)*/){
#Only feature is mRNA
if($1 eq "mRNA"){
# information need to be fetched
my ($gene_id,$transcript_id,$gene_name,$transcript_name);
my $strand="+";
$strand="-" if($f=~/complement\(/);
#transcript_id and transcript_name share the same name
if($f=~/\/transcript_id="(.*?)"/){
$transcript_id=$1;
$transcript_name=$1;
}
#gene id
if($f=~/\/db_xref="GeneID:(\d+)"\n/){
$gene_id=$1;
}
#gene name
if($f=~/\/gene="(.*?)"/){
$gene_name=$1;
}
=head1 EXON REGION PARSE
=head2 CODE
if($f=~/(complement\()?(join\()?(\d+[\d\.\n,> ]+\d)(\))?/){
my $tmp=$3;
$tmp=~s/\s|\n|>//g;
my @array = split ",",$tmp;
my @start;
my @end;
foreach my $s(@array){
if($s=~/(\d+)\.\.(\d+)/){
push @start,$1;
push @end,$2;
}else{
push @start,$s;
push @end,$s+1;
}
}
if($strand eq "-"){
@start=reverse @start;
@end=reverse @end;
}
}
=head2 EXAMPLE
mRNA complement(join(102429..103045,104811..104942,
105561..105643,105732..105835,105910..106035))
/gene="CCDC115"
/product="coiled-coil domain containing 115"
/note="Derived by automated computational analysis using
gene prediction method: GNOMON. Supporting evidence
includes similarity to: 4 ESTs, 1 Protein"
/transcript_id="XM_003980242.1"
/db_xref="GI:410947098"
/db_xref="GeneID:101080349"
=cut
#Parse exons
if($f=~/(complement\()?(join\()?(\d+[\d\.\n,> ]+\d)(\))?/){
my $tmp=$3;
$tmp=~s/\s|\n|>//g;
my @array = split ",",$tmp;
my @start;
my @end;
foreach my $s(@array){
if($s=~/(\d+)\.\.(\d+)/){
push @start,$1;
push @end,$2;
}else{
push @start,$s;
push @end,$s+1;
}
}
if($strand eq "-"){
@start=reverse @start;
@end=reverse @end;
}
#if($gene_name eq "ITK"){print STDERR $tmp,"\n";print STDERR join "\n",@array;};
for(my $i=0;$i<@array;$i++){
my $j=$i+1;
my $group="gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; exon_number \"$j\"; gene_name \"$gene_name\"; transcript_name \"$transcript_name\";\n";
print join "\t",($chr,"protein_coding","exon",$start[$i],$end[$i],
".",$strand,".",$group);
}
}
}
}
}
}