-
Notifications
You must be signed in to change notification settings - Fork 2
/
computeEntropyFromAlignedFasta.R
112 lines (102 loc) · 5.41 KB
/
computeEntropyFromAlignedFasta.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
library( "ade4", warn.conflicts = FALSE ) # needed by something. ape?
library( "ape" ) # for "chronos", "as.DNAbin", "dist.dna", "read.dna", "write.dna"
library( "seqinr", warn.conflicts = FALSE ) # for "as.alignment", "consensus"
library( "entropy" );
# This computes the consensus of the given alignment, writes it to a fasta file, returns the filename.
# consensus.sequence.name = NA means use the name of the input fasta file (not the full path, just the filename, excluding suffix eg ".fasta"), postpended with "_Consensus".
# include.full.alignment == TRUE means that the output file will contain both the consensus and the input alignment (consensus first).
# if use.sequence.numbers.as.names == TRUE, rename non-consensus output sequences using just their order of appearance (eg 1, 2, 3, etc). This only applies if include.full.alignment is also TRUE.
computeEntropyFromAlignedFasta <- function ( input.fasta.file, output.dir = NULL, output.file = NULL ) {
if( length( grep( "^(.*?)\\/[^\\/]+$", input.fasta.file ) ) == 0 ) {
input.fasta.file.path <- ".";
} else {
input.fasta.file.path <-
gsub( "^(.*?)\\/[^\\/]+$", "\\1", input.fasta.file );
}
input.fasta.file.short <-
gsub( "^.*?\\/?([^\\/]+?)$", "\\1", input.fasta.file, perl = TRUE );
input.fasta.file.short.nosuffix <-
gsub( "^([^\\.]+)(\\..+)?$", "\\1", input.fasta.file.short, perl = TRUE );
input.fasta.file.suffix <-
gsub( "^([^\\.]+)(\\..+)?$", "\\2", input.fasta.file.short, perl = TRUE );
if( !is.null( output.file ) ) {
if( length( grep( "^(.*?)\\/[^\\/]+$", output.file ) ) == 0 ) {
output.file.path <- NULL;
output.file.path.is.absolute <- NA;
} else {
output.file.path <-
gsub( "^(.*?)\\/[^\\/]+$", "\\1", output.file );
output.file.path.is.absolute <- ( substring( output.file.path, 1, 1 ) == "/" );
}
output.file.short <-
gsub( "^.*?\\/?([^\\/]+?)$", "\\1", output.file, perl = TRUE );
if( !is.null( output.file.path ) && output.file.path.is.absolute ) {
output.dir <- output.file.path;
} else if( is.null( output.dir ) ) {
if( is.null( output.file.path ) ) {
output.dir <- input.fasta.file.path;
} else {
output.dir <- output.file.path;
}
} else {
output.dir <- paste( output.dir, output.file.path, sep = "/" );
}
output.file <- output.file.short;
} else { # is.null( output.file )
output.file <- paste( input.fasta.file.short.nosuffix, ".entropy.txt", sep = "" );
}
if( is.null( output.dir ) || ( nchar( output.dir ) == 0 ) ) {
output.dir <- ".";
}
## Remove "/" from end of output.dir
output.dir <-
gsub( "^(.*?)\\/+$", "\\1", output.dir );
input.fasta <- read.dna( input.fasta.file, format = "fasta" );
# IUPAC profile (may contain ambiguity chars)
input.fasta.profile.iupac <- seqinr::consensus( as.character( input.fasta ), method = "profile" );
input.fasta.profile <- input.fasta.profile.iupac[ intersect( c( "-", "a", "c", "g", "t" ), rownames( input.fasta.profile.iupac ) ), ];
.iupacs <- setdiff( rownames( input.fasta.profile.iupac ), c( "-", "a", "c", "g", "t" ) );
# DIVVY up ambiguous counts evenly among the component bases.
for( .iupac in .iupacs ) {
.component.bases <- amb( .iupac );
input.fasta.profile[ .component.bases, ] <-
input.fasta.profile[ .component.bases, ] + ( input.fasta.profile.iupac[ .iupac, ] / length( .component.bases) );
}
input.fasta.profile.nogap <- input.fasta.profile[ setdiff( rownames( input.fasta.profile ), "-" ), ];
entropies <- apply( input.fasta.profile.nogap, 2, entropy.empirical, unit = "log2" );
entropies.sd <- sd( entropies, na.rm = T );
num.entropies <- length( !is.na( entropies ) );
.results <- summary( entropies[ !is.na( entropies ) ] );
.results.string <- sprintf( "%0.4f", .results );
names( .results.string ) <- names( .results );
.results <- c( N = as.character( nrow( input.fasta ) ), K = as.character( num.entropies ), .results.string, SD = round( entropies.sd, digits = 4 ) );
.results[ .results == "-0.0000" ] <- "0.0000";
output.file.path <-
paste( output.dir, "/", output.file, sep = "" );
write.table( t( as.matrix( .results ) ), file =output.file.path, row.names = F, sep = "\t" );
# Return the file name.
return( output.file.path );
} # computeEntropyFromAlignedFasta ( input.fasta.file, ... )
## Here is where the action is.
input.fasta.file <- Sys.getenv( "computeEntropyFromAlignedFasta_inputFilename" );
output.fasta.file <- Sys.getenv( "computeEntropyFromAlignedFasta_outputFilename" );
if( nchar( output.fasta.file ) == 0 ) {
output.fasta.file <- NULL;
}
output.dir <- Sys.getenv( "computeEntropyFromAlignedFasta_outputDir" );
if( nchar( output.dir ) == 0 ) {
output.dir <- NULL;
}
## TODO: REMOVE
# warning( paste( "aligned fasta input file:", input.fasta.file ) );
# if( !is.null( output.dir ) ) {
# warning( paste( "consensus fasta output dir:", output.dir ) );
# }
# if( !is.null( output.fasta.file ) ) {
# warning( paste( "consensus fasta output file:", output.fasta.file ) );
# }
if( file.exists( input.fasta.file ) ) {
print( computeEntropyFromAlignedFasta( input.fasta.file, output.dir = output.dir, output.file = output.fasta.file ) );
} else {
stop( paste( "File does not exist:", input.fasta.file ) );
}