# removeDuplicateSequencesFromAlignedFasta_safetosource.R
library( "ade4", warn.conflicts = FALSE ) # needed by something. ape?
library( "ape" ) # for "chronos", "as.DNAbin", "dist.dna", "read.dna", "write.dna"
library( "seqinr", warn.conflicts = FALSE ) # for "as.alignment", "consensus"
# This removes duplicate sequences from an input multiple alignment in fasta format, writes the output to another fasta file, writes a table of duplicate sequences, and returns the output filename.
# The output table is tab-delimited with two columns: retained and removed, where retained is a single sequence name (the one not removed) and removed is a comma-separated list of removed sequence names.
# If increase.threshold.to.ensure.max is set (e.g. to 100), sequences with Hamming distance (HD) <= k may be defined as "duplicates" for some k > 0; the smallest such k is chosen so that the number of retained sequences is at most increase.threshold.to.ensure.max.
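# For illustration (hypothetical sequence names), the output table might look like:
#   retained    removed
#   seqA        seqB,seqC
#   seqD        seqE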
removeDuplicateSequencesFromAlignedFasta <- function ( input.fasta.file, output.dir = NULL, output.fasta.file = NULL, output.table.file = NULL, output.fasta.width = 72, add.copy.number.to.sequence.names = FALSE, include.gaps.in.Hamming = TRUE, increase.threshold.to.ensure.max = NULL ) {
# i.e. increase the threshold until at most this many sequences remain.
if( length( grep( "^(.*?)\\/[^\\/]+$", input.fasta.file ) ) == 0 ) {
input.fasta.file.path <- ".";
} else {
input.fasta.file.path <-
gsub( "^(.*?)\\/[^\\/]+$", "\\1", input.fasta.file );
}
input.fasta.file.short <-
gsub( "^.*?\\/?([^\\/]+?)$", "\\1", input.fasta.file, perl = TRUE );
input.fasta.file.short.nosuffix <-
gsub( "^([^\\.]+)(\\..+)?$", "\\1", input.fasta.file.short, perl = TRUE );
input.fasta.file.suffix <-
gsub( "^([^\\.]+)(\\..+)?$", "\\2", input.fasta.file.short, perl = TRUE );
if( !is.null( output.fasta.file ) ) {
if( length( grep( "^(.*?)\\/[^\\/]+$", output.fasta.file ) ) == 0 ) {
output.fasta.file.path <- NULL;
output.fasta.file.path.is.absolute <- NA;
} else {
output.fasta.file.path <-
gsub( "^(.*?)\\/[^\\/]+$", "\\1", output.fasta.file );
output.fasta.file.path.is.absolute <- ( substring( output.fasta.file.path, 1, 1 ) == "/" );
}
output.fasta.file.short <-
gsub( "^.*?\\/?([^\\/]+?)$", "\\1", output.fasta.file, perl = TRUE );
if( !is.null( output.fasta.file.path ) && output.fasta.file.path.is.absolute ) {
output.dir <- output.fasta.file.path;
} else if( is.null( output.dir ) ) {
if( is.null( output.fasta.file.path ) ) {
output.dir <- input.fasta.file.path;
} else {
output.dir <- output.fasta.file.path;
}
} else {
output.dir <- paste( output.dir, output.fasta.file.path, sep = "/" );
}
output.fasta.file <- output.fasta.file.short;
} else { # is.null( output.fasta.file )
output.fasta.file <- paste( input.fasta.file.short.nosuffix, "_removeDuplicateSequences", input.fasta.file.suffix, sep = "" );
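# e.g. an input file "seqs.fasta" (hypothetical name) yields the default output name "seqs_removeDuplicateSequences.fasta".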
}
if( is.null( output.dir ) || ( nchar( output.dir ) == 0 ) ) {
output.dir <- ".";
}
## Remove "/" from end of output.dir
output.dir <-
gsub( "^(.*?)\\/+$", "\\1", output.dir );
if( is.null( output.table.file ) ) {
output.table.file <- paste( input.fasta.file.short.nosuffix, "_removeDuplicateSequences.tbl", sep = "" );
}
if( length( grep( "^(.*?)\\/[^\\/]+$", output.table.file ) ) == 0 ) {
output.table.file.path <- output.dir;
output.table.file.path.is.absolute <- NA;
} else {
output.table.file.path <-
gsub( "^(.*?)\\/[^\\/]+$", "\\1", output.table.file );
output.table.file.path.is.absolute <- ( substring( output.table.file.path, 1, 1 ) == "/" );
}
output.table.file.short <-
gsub( "^.*?\\/?([^\\/]+?)$", "\\1", output.table.file, perl = TRUE );
output.table.file <- paste( output.table.file.path, output.table.file.short, sep = "/" );
input.fasta <- read.dna( input.fasta.file, format = "fasta" );
n.seqs <- nrow( input.fasta );
# Sequences are treated as duplicates of one another when their pairwise distance is <= threshold.k (default 0).
# The pairwise.deletion = TRUE argument is necessary so that columns with any gaps are not removed.
# The second call adds in the count of sites with gap differences
# (gap in one sequence but not the other), which are not included
# in the first call's "raw" count. Adding them back in, we are
# effectively treating those as differences.
in.dist <- dist.dna( input.fasta, model = "N", pairwise.deletion = TRUE );
in.dist[ is.nan( in.dist ) ] <- 0;
if( include.gaps.in.Hamming ) {
# By default we include indels because we don't want to treat two sequences as identical when there are gaps between them.
in.dist <- in.dist + dist.dna( input.fasta, model = "indel", pairwise.deletion = TRUE );
}
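## For illustration (a hypothetical two-sequence toy alignment, "acg-" vs "actt"):
##   dist.dna( toy, model = "N", pairwise.deletion = TRUE )     == 1  (g vs t at site 3; the gapped site is dropped pairwise)
##   dist.dna( toy, model = "indel", pairwise.deletion = TRUE ) == 1  (site 4 has a gap in one sequence but not the other)
## so the gap-inclusive Hamming distance used here is 1 + 1 == 2.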
if( any( is.null( in.dist ) ) || any( is.na( in.dist ) ) || any( !is.finite( in.dist ) ) ) {
## TODO: REMOVE
warning( "UH OH got illegal distance value" );
print( "UH OH got illegal distance value" );
print( in.dist );
}
dist.mat <- as.matrix( in.dist );
## NOTE: Because of ambiguity codes, the transitivity property might _not_ hold:
## e.g. you could have dist(A,B) == 0, dist(B,C) == 0, but dist(A,C) == 1.
# Duplicates are pairs with dist.mat <= threshold.k; the first member of each group is kept.
seq.removed.since.represented.by.seq <- rep( 0, nrow( dist.mat ) );
names( seq.removed.since.represented.by.seq ) <- colnames( dist.mat );
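# seq.removed.since.represented.by.seq[ i ] is 0 while sequence i is retained; otherwise it holds the row index of the retained sequence that represents it.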
output.table.as.list <- list();
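# output.table.as.list maps each retained sequence name to c( retained name, comma-separated removed names ).
# removeDuplicatesAtThreshold( threshold.k ) groups sequences within threshold.k of a retained sequence,
# updating seq.removed.since.represented.by.seq and output.table.as.list in this function's frame (via <<-); it returns NULL.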
removeDuplicatesAtThreshold <- function ( threshold.k ) {
for( seq.i in ( 1:nrow( dist.mat ) ) ) {
if( seq.removed.since.represented.by.seq[ seq.i ] > 0 ) {
next;
}
# If these were true metric distances (i.e. if transitivity held), this sequence could never be
# within threshold.k of an already-removed sequence, because it would itself have been removed.
# Since transitivity can fail (see above), we may still need to fold this sequence into an
# existing group when it is within threshold.k of an already-removed member.
if( any( seq.removed.since.represented.by.seq != 0 ) && any( dist.mat[ seq.i, ( seq.removed.since.represented.by.seq > 0 ) ] <= threshold.k ) ) {
## TODO: REMOVE?
#cat( "Found something within", threshold.k, "HD units from seq", seq.i, fill = TRUE );
# This sequence belongs to a group that was already removed. It was missed either because
# threshold.k was lower on an earlier pass, or even at a constant threshold.k (e.g. with
# threshold.k == 0, ambiguity codes can make it non-identical to some members of the group).
# Remove it here if it is within threshold.k of any already-removed member.
.representative.seq <- seq.removed.since.represented.by.seq[ ( seq.removed.since.represented.by.seq > 0 ) & ( dist.mat[ seq.i, ] <= threshold.k ) ];
stopifnot( length( .representative.seq ) > 0 );
if( length( .representative.seq ) > 1 ) {
#if( threshold.k == 0 ) {
#stopifnot( all( .representative.seq == .representative.seq[ 1 ] ) );
#}
.representative.seq <- .representative.seq[ 1 ];
}
## TODO: REMOVE
# print( paste( "adding", colnames( dist.mat )[ seq.i ], "to", rownames( dist.mat )[ .representative.seq ] ) );
# Use <<- so the appended name persists in the enclosing frame, consistent with the other assignments in this helper.
output.table.as.list[[ rownames( dist.mat )[ .representative.seq ] ]][ 2 ] <<-
paste( output.table.as.list[[ rownames( dist.mat )[ .representative.seq ] ]][ 2 ], colnames( dist.mat )[ seq.i ], sep = "," );
seq.removed.since.represented.by.seq[ seq.i ] <<- .representative.seq;
## TODO: REMOVE
#cat( "SETTING seq.removed.since.represented.by.seq[", seq.i, "] to", seq.removed.since.represented.by.seq[ seq.i ], fill = TRUE );
next;
}
# Are there any "identical" to this one?
if( !any( dist.mat[ seq.i, -seq.i ] <= threshold.k ) ) {
next;
}
all.the.zeros <-
colnames( dist.mat )[ -seq.i ][ dist.mat[ seq.i, -seq.i ] <= threshold.k ];
output.table.as.list[[ rownames( dist.mat )[ seq.i ] ]] <<-
c( colnames( dist.mat )[ seq.i ], paste( all.the.zeros, collapse = "," ) );
#print( paste( all.the.zeros, collapse = ", " ) );
## TODO: REMOVE
#cat( paste( "SETTING seq.removed.since.represented.by.seq[", all.the.zeros, "] to", seq.i, collapse = "\\n" ), fill = TRUE );
seq.removed.since.represented.by.seq[ all.the.zeros ] <<- seq.i;
} # End foreach seq.i
return( NULL );
} # removeDuplicatesAtThreshold ( threshold.k )
if( is.null( increase.threshold.to.ensure.max ) ) {
.result.ignored <- removeDuplicatesAtThreshold( 0 );
} else {
increase.threshold.to.ensure.max <-
as.numeric( increase.threshold.to.ensure.max );
stopifnot( increase.threshold.to.ensure.max > 1 );
n.seqs.after <- n.seqs;
threshold.k <- 0;
## TODO: REMOVE?
cat( "TRYING TO ENSURE A MAXIMUM OF", increase.threshold.to.ensure.max, "sequences", fill = TRUE );
cat( "THERE ARE", n.seqs.after, "sequences", fill = TRUE );
#cat( n.seqs.after, ">", increase.threshold.to.ensure.max, "=", ( n.seqs.after > increase.threshold.to.ensure.max ), fill = TRUE );
while( n.seqs.after > increase.threshold.to.ensure.max ) {
n.seqs.before <- n.seqs.after;
old.seq.removed.since.represented.by.seq <-
seq.removed.since.represented.by.seq;
old.output.table.as.list <- output.table.as.list;
## TODO: REMOVE?
# cat( "n.seqs.before:", n.seqs.before, fill = TRUE );
## TODO: REMOVE?
# cat( "removing dups at threshold:", threshold.k, fill = TRUE );
# print( "sum( seq.removed.since.represented.by.seq )" );
# print( sum( seq.removed.since.represented.by.seq ) );
# print( "table( seq.removed.since.represented.by.seq )" );
# print( table( seq.removed.since.represented.by.seq ) );
# print( "sum( seq.removed.since.represented.by.seq == 0 )" );
# print( sum( seq.removed.since.represented.by.seq == 0 ) );
.result.ignored <- removeDuplicatesAtThreshold( threshold.k );
#print( "sum( seq.removed.since.represented.by.seq )" );
#print( sum( seq.removed.since.represented.by.seq ) );
#print( "table( seq.removed.since.represented.by.seq )" );
#print( table( seq.removed.since.represented.by.seq ) );
#print( "table( seq.removed.since.represented.by.seq )" );
#print( table( seq.removed.since.represented.by.seq ) );
#print( "sum( seq.removed.since.represented.by.seq == 0 )" );
#print( sum( seq.removed.since.represented.by.seq == 0 ) );
# All the zeros, aka the number of unique remainders.
n.seqs.after <- sum( seq.removed.since.represented.by.seq == 0 );
## TODO: REMOVE?
#cat( "n.seqs.after:", n.seqs.after, fill = TRUE );
if( n.seqs.after == 0 ) {
## TODO: REMOVE
cat( "There are no sequences remaining at threshold", threshold.k, ", so we revert to", ( threshold.k - 1 ), ", which has", n.seqs.before, "sequences, despite the fact that it exceeds the limit (", increase.threshold.to.ensure.max, ").", fill = TRUE );
## woops. Undo it.
threshold.k <- threshold.k - 1;
n.seqs <- n.seqs.before;
seq.removed.since.represented.by.seq <-
old.seq.removed.since.represented.by.seq;
output.table.as.list <- old.output.table.as.list;
break;
} else if( n.seqs.after > increase.threshold.to.ensure.max ) {
## TODO: REMOVE
cat( "There are still ", n.seqs.after, " sequences, which exceeds the limit (", increase.threshold.to.ensure.max, "). Increasing threshold to ", threshold.k + 1, ".", sep = "", fill = TRUE );
threshold.k <- threshold.k + 1;
}
} # End while( n.seqs.after > increase.threshold.to.ensure.max )
# Consolidate seq.removed.since.represented.by.seq.
# All the zeros, aka the number of unique remainders.
n.seqs.after <- sum( seq.removed.since.represented.by.seq == 0 );
## TODO: REMOVE?
cat( "DONE. Using threshold ", threshold.k, ": THERE ARE ", n.seqs.after, " sequences.", sep = "", fill = TRUE );
} # End if is.null( increase.threshold.to.ensure.max ) .. else ..
if( !is.null( output.table.file ) && ( length( output.table.as.list ) > 0 ) ) {
output.table <- do.call( rbind, output.table.as.list );
colnames( output.table ) <- c( "retained", "removed" );
rownames( output.table ) <- NULL;
write.table( output.table, file = output.table.file, row.names = FALSE, col.names = TRUE, sep = "\t", quote = FALSE );
}
output.fasta <-
input.fasta[ ( seq.removed.since.represented.by.seq == 0 ), , drop = FALSE ];
if( add.copy.number.to.sequence.names ) {
rownames( output.fasta ) <- sapply( rownames( output.fasta ), function ( .retained.seq.name ) {
if( .retained.seq.name %in% names( output.table.as.list ) ) {
#print( c( output.table.as.list[[ .retained.seq.name ]], gsub( "[^,]", "", ( output.table.as.list[[ .retained.seq.name ]] )[ 2 ] ), nchar( gsub( "[^,]", "", ( output.table.as.list[[ .retained.seq.name ]] )[ 2 ] ) ) + 2 ) );
paste( .retained.seq.name, nchar( gsub( "[^,]", "", ( output.table.as.list[[ .retained.seq.name ]] )[ 2 ] ) ) + 2, sep = "_" );
} else {
paste( .retained.seq.name, 1, sep = "_" );
}
} );
}
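# e.g. a retained sequence "seqA" (hypothetical name) representing two removed duplicates is renamed "seqA_3" (itself plus two copies); a sequence with no duplicates becomes "seqA_1".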
output.fasta.path <-
paste( output.dir, "/", output.fasta.file, sep = "" );
write.dna( output.fasta, output.fasta.path, format = "fasta", colsep = "", indent = 0, blocksep = 0, colw = output.fasta.width );
# Return the file name.
return( output.fasta.path );
} # removeDuplicateSequencesFromAlignedFasta ( input.fasta.file, ... )
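
## Example usage (a minimal sketch; the input path and argument values below are hypothetical).
## Wrapped in if( FALSE ) so that sourcing this file remains free of side effects.
if( FALSE ) {
    # Deduplicate an aligned fasta, appending copy numbers to the retained sequence names and
    # raising the Hamming-distance threshold as needed so that at most 100 sequences remain.
    deduped.fasta.path <- removeDuplicateSequencesFromAlignedFasta(
        "alignment.fasta",
        output.dir = ".",
        add.copy.number.to.sequence.names = TRUE,
        increase.threshold.to.ensure.max = 100
    );
    # The returned value is the path to the deduplicated fasta; the retained/removed table is
    # written alongside it (default name: "alignment_removeDuplicateSequences.tbl").
    print( deduped.fasta.path );
}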