-
Notifications
You must be signed in to change notification settings - Fork 0
/
CheckSeq.pl
executable file
·258 lines (199 loc) · 8.59 KB
/
CheckSeq.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
#!/usr/common/bin/perl.new -w
# CRC=4110317807
use strict;
use FileHandle;
my $SeqFile = $ENV{SEQFILE} || '/home/njas/gauss/hisdir/strippedfry';
my $MinTerms = $ENV{MINTERMS} || 0;
$MinTerms = 20 unless ($MinTerms >= 1);
# %S A088674 ,1,3,6,45,126,750,2796,19389,75894,449562,2027796,12211794,57895596,332787324,1677545304,9766642077,50378641830,286825948194,1529968671492,8729259097158,47374697101572,269062276076868,1484430536591592,
# %S A088675 ,0,1,2,8,36,160,656,2368,7664,29440,184896,1174272,3395200,21222400,178961920,1638189056,27449296640,28875071488,3234263731200,10138343231488,422012179953664,3426627065331712,59293997091528704,
# %S A088676 ,1,2,2,2,3,3,3,3,4,4,5,5,5,5,6,6,6,6,6,6,7,7,8,9,9,9,9,9,9,9,10,11,12,12,14,
# %S A088677 ,2,1,2,3,1,2,5,6,2,5,1,4,5,11,6,9,10,11,13,14,8,17,11,4,11,16,19,20,22,27,16,22,4,23,24,
# %S A088679 ,0,1,2,6,48,2880,9953280,115579079884800,15266884236590834264309760000,262212473580148912869121218589990322256745385164800000000,
# %S A088682 ,4,6,10,10,10,8,8,14,6,18,8,8,10,12,6,16,10,16,8,6,18,18,12,14,10,12,12,8,14,6,12,10,20,16,8,12,12,14,6,8,10,18,14,12,12,24,12,6,18,18,6,12,12,20,12,18,8,8,12,24,6,14,28,18,12,16,8,22,6,8,6,8,12,28,6,14,8,12,6,24,
# %S A088683 ,6,6,8,6,12,10,10,12,6,18,12,12,12,12,14,6,8,12,8,12,6,20,6,14,10,12,12,10,12,16,12,18,24,12,16,8,10,22,10,14,14,18,12,14,12,22,12,12,6,18,24,18,10,18,14,16,12,16,12,22,10,14,24,18,14,10,8,28,10,10,16,40,14,24,6,
# %S A088684 ,6,6,8,6,8,6,10,6,6,10,12,10,8,6,24,6,12,12,10,24,6,16,10,12,14,18,12,10,6,20,12,14,16,8,16,8,6,12,12,16,18,18,10,10,18,14,6,24,6,6,24,18,12,10,14,10,12,12,8,6,12,12,12,16,20,12,18,20,8,6,14,40,26,18,10,6,22,6,
# %S A088680 ,2,4,4,4,2,4,4,6,6,2,4,8,2,2,14,6,10,6,4,6,10,4,12,4,4,2,6,6,6,2,14,2,14,10,4,8,6,6,4,10,10,6,6,4,4,8,8,6,2,6,6,2,10,6,6,4,12,2,6,2,4,8,8,8,6,8,4,4,10,2,2,2,14,2,14,2,20,8,8,6,14,6,8,12,6,10,6,2,2,18,2,6,8,6,2,
# %S A088681 ,2,2,2,6,6,2,6,2,4,6,6,4,4,4,4,2,2,6,6,2,2,2,12,2,6,10,6,2,4,10,4,4,6,2,6,6,4,8,8,2,2,4,8,2,12,4,4,12,18,10,6,6,6,2,6,2,10,4,6,12,6,10,10,6,4,6,8,14,12,10,4,10,4,4,4,4,4,10,4,6,4,6,6,4,2,2,10,10,6,4,4,6,6,22,10,
# Each element may occur at more than one position.
# Index a hash by element, and store a hash of the positions
# at which that element occurs.
sub hash_elements {
my $elem_ref = shift;
my ($i, %hash);
for ($i = 0; $i < @$elem_ref; ++$i) {
$hash{$elem_ref->[$i]}{$i} = 1;
}
return \%hash;
}
# To constitute a valid overlap,
# a) one sequence must start (or end) within the other, and
# b) the rest (or previous part) of that sequence must match
# until either sequence is exhausted
# The qhash allows quick checks for presence of an element
# in the query sequence, and, if present, where it occurs.
sub check_overlap {
my ($seqno, $need, $qhash, $qelem, $elems) = @_;
my ($elem, $posns, $posn, $span, $i);
$need = @$elems if ($need > @$elems);
$elem = $elems->[0]; # Maybe other starts somewhere in query
if (exists($qhash->{$elem})) {
$posns = $qhash->{$elem};
POS1:
foreach $posn (sort {$a <=> $b} keys(%$posns)) {
$span = (@$qelem - $posn); # How much of query sequence follows
$span = @$elems if (@$elems < $span); # other ends first
last unless ($span >= $need); # Too short and getting shorter
for ($i = 1; $i < $span; ++$i) { # index 0 known to match
next POS1 unless ($qelem->[$posn + $i] eq $elems->[$i]);
}
print("$seqno: $posn 0 $span\n");
return;
}
}
$elem = $elems->[-1]; # Maybe other ends somewhere in query
if (exists($qhash->{$elem})) {
$posns = $qhash->{$elem};
POS2:
foreach $posn (sort {$b <=> $a} keys(%$posns)) {
$span = $posn + 1; # Length of query sequence ending here
$span = @$elems if (@$elems < $span); # other starts later
last unless ($span >= $need); # Too short and getting shorter
for ($i = $span; --$i > 0; ) {
next POS2
unless ($qelem->[$posn - $i] eq $elems->[-$i - 1]);
}
$posn -= $span - 1; # first query position in match
$i = @$elems - $span; # first position in other
print("$seqno: $posn $i $span\n");
return;
}
}
return;
}
sub main {
my $fh = new FileHandle;
my ($qline, $line, @query_elem, $query);
my ($seqno, @elems, $seq, $need);
my ($i, $posn, $span);
open($fh, "< $SeqFile") || die("Open failed on sequence file $SeqFile: $!");
while ($qline = <>) {
chomp($qline);
@query_elem = split(/[^-0-9]+/ , $qline);
shift(@query_elem) while (@query_elem && ($query_elem[0] !~ /\d+/));
pop(@query_elem) while (@query_elem && ($query_elem[-1] !~ /\d+/));
if (@query_elem) {
$qline = join(',' => '', @query_elem, '');
$query = hash_elements(\@query_elem);
$need = (@query_elem <= 1.2 * $MinTerms) ?
int(.75 * @query_elem) : $MinTerms;
$need = 1 if ($need < 1);
seek($fh, 0, 0) || die("Rewind failed on $SeqFile: $!");
while ($line = <$fh>) {
(undef, $seqno, $seq) = split(" ", $line);
if (($i = index($seq, $qline)) >= 0) {
$posn = ($i == 0) ? 0 : (substr($seq, 0, $i-1) =~ tr/,//);
$span = @query_elem;
print("$seqno: 0 $posn $span\n");
next;
}
# Trailing empty field won't be included
@elems = split(',' => $seq);
shift(@elems); # Eliminate empty leading field
check_overlap($seqno, $need, $query, \@query_elem, \@elems)
if (@elems);
}
}
}
}
main();
__END__
=head1 NAME
CheckSeq.pl - Check query sequences against sequence database
=head1 SYNOPSIS
CheckSeq.pl F<query-file> ...
=head1 EXAMPLE
# Run queries against standard database
CheckSeq.pl queries
# Same, but reset minimum overlap threshold
MINTERMS=10 CheckSeq.pl queries
# Use a different sequence database
SEQFILE=/some/other/database CheckSeq.pl queries
=head1 DESCRIPTION
B<CheckSeq.pl> reads query sequences,
one per line, from the specified F<query-file>s,
or from F<standard in> if no F<query-file>s are specified.
For each sequence in the sequence database
that "overlaps adequately" with a query sequence,
a one-line summary resembling the following is produced.
A088676: 3 0 34
The first value is the name of the matching sequence from
the sequence database. The three numeric values that follow are
=over 4
=item 1
The position (starting at B<0>) in the query match
of the overlapping subsequence.
=item 2
The position (starting at B<0>) in the database sequence
of the overlapping subsequence.
=item 3
The length of the overlapping subsequence.
=back 4
Roughly speaking, two sequences I<overlap>
if you can "line up" the elements of the sequences
so they agree on some terms, and disagree on none.
More formally, we will say a sequence B<S1> I<overlaps> a sequence B<S2>
if there exist indexes B<I1> and B<I2>
and a length B<L> E<gt> B<0> such that
for all B<I> E<lt> B<L>,
S1[I1 + I] == S2[I2 + I]
and at least one of the following conditions hold.
=over 4
=item
The overlap includes the entire B<S1> sequence.
=item
The overlap includes the entire B<S2> sequence.
=item
The overlap includes the start of one sequence and the end of the other.
=back 4
Since overlap, thus defined, always includes the start of at least
one sequence, at least one of the positions reported for a
match will always be B<0>.
Small overlaps of long sequences are not particularly noteworthy.
The value of environment variable B<MINTERMS> (default B<20>)
establishes a lower limit of the length of an I<adequate> overlap
for long sequences.
Of course, short query sequences cannot be held to a fixed standard.
So, for query sequences shorter than B<1.2 * MINTERMS>,
an overlap of B<.75 * length(query)> is considered I<adequate>.
And, if even this exceeds the length of a database sequence,
the length of the database sequence is considered I<adequate>.
=head1 FILES AND FORMATS
The default B<SEQFILE> database is
F</home/njas/gauss/hisdir/strippedfry>.
It contains lines of the form
%S A077013 ,1,2,1,2,3,7,14,43,65,292,992,1154,
The C<%S> is just fixed text.
The word that follows, C<A077013>,
is the sequence name.
The string that follows is the sequence.
Note the leading and trailing commas,
and the absence of white space or signs.
The entire sequence must appear on a single line.
Any substitute B<SEQFILE> must have the same format.
Query sequences are less restricted.
They must still appear on a single line,
but the terms in the sequence can be separated by blanks,
commas, or, indeed, any non-empty sequence
of non-numeric characters.
=head1 AUTHOR
John P. Linderman
AT&T Shannon Laboratory
jpl@research.att.com
=head1 BUGS
The determination of what constitutes an I<adequate> overlap
is a bit ad-hoc.
If you submit more than one query,
the summary information doesn't provide information
about which query matched.
=cut