-
Notifications
You must be signed in to change notification settings - Fork 0
/
analysis.R
240 lines (217 loc) · 15.5 KB
/
analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# Program: SarsCov2Origin
# Version: 1.0 alpha
# Date: 05/06/2022
# Author: Emiliano Vivas Rodríguez
# Contact: a01424732@tec.mx
install.packages(seqinr);
install.packages(ggplot2);
install.packages(RecordLinkage);
library(seqinr);
library(ggplot2);
library(RecordLinkage);
AMINOACIDS = c(UUU="FPhe", UUC="FPhe", UUA="LLeu", UUG="LLeu", UCU="SSer", UCC="SSer", UCA="SSer", UCG="SSer", UAU="YTyr", UAC="YTyr", UAA='-', UAG='-', UGU="CCys", UGC="CCys", UGA='-', UGG="WTrp", CUU="LLeu", CUC="LLeu", CUA="LLeu", CUG="LLeu", CCU="PPro", CCC="PPro", CCA="PPro", CCG="PPro", CAU="HHis", CAC="HHis", CAA="QGln", CAG="QGln", CGU="RArg", CGC="RArg", CGA="RArg", CGG="RArg", AUU="IIle", AUC="IIle", AUA="IIle", AUG="MMet", ACU="TThr", ACC="TThr", ACA="TThr", ACG="TThr", AAU="NAsn", AAC="NAsn", AAA="KLys", AAG="KLys", AGU="SSer", AGC="SSer", AGA="RArg", AGG="RArg", GUU="VVal", GUC="VVal", GUA="VVal", GUG="VVal", GCU="AAla", GCC="AAla", GCA="AAla", GCG="AAla", GAU="DAsp", GAC="DAsp", GAA="EGlu", GAG="EGlu", GGU="GGly", GGC="GGly", GGA="GGly", GGG="GGly");
SARSCov2 = read.fasta("sarscov2.txt");
RaTG13 = read.fasta("batcovratg13.txt");
MERSCov = read.fasta("merscov.txt");
finalVirus = "SARS-Cov 2";
originVirus = c("RaTG13", "MERS-Cov");
data = data.frame(virus = character(), gene = character(), protein = character(), length = integer(), initialLocation = integer(), finalLocation = integer(), nucleotids = character(), aminoacids = character());
mutations = data.frame(mutation = character(), nucleotide = numeric(), codon = character(), protein = character(), index = numeric(), structure = character(), gene = character(), change = logical());
mutations2 = data.frame(mutation = character(), nucleotide = numeric(), codon = character(), protein = character(), index = numeric(), structure = character(), gene = character(), change = logical());
similarity = data.frame(type=character(), originVirus=character(), finalVirus=character(), gene=character(), nucleotidePercentage = numeric(), aminoacidPercentage=numeric());
needleman = function(seq1, seq2, gap, mismatch, match){
stopifnot(gap <= 0);
stopifnot(mismatch <= 0);
stopifnot(match >= 0);
len1 = nchar(seq1); len2 = nchar(seq2);
seq1 = unlist(strsplit(seq1, split = ""));
seq2 = unlist(strsplit(seq2, split = ""));
M = matrix(0, nrow = len1 + 1, ncol = len2 + 1);
rownames(M) = c("-", seq1);
colnames(M) = c("-", seq2);
M[1, ] = cumsum(c(0, rep(gap, len2)));
M[, 1] = cumsum(c(0, rep(gap, len1)));
D = matrix(0, nrow = len1 + 1, ncol = len2 + 1);
rownames(D) = c("-", seq1);
colnames(D) = c("-", seq2);
D[1, ] = rep("\u2190");
D[, 1] = rep("\u2191");
type = c("\u2190", "\u2191", "\u2b09");
for (i in 2:(len1 + 1)){
for (j in 2:(len2 + 1)){
hor = M[i, j - 1] + gap;
ver = M[i - 1, j] + gap;
dia = ifelse(rownames(M)[i] == colnames(M)[j], M[i - 1, j - 1] + match, M[i - 1, j - 1] + mismatch);
M[i, j] = max(hor, ver, dia);
D[i, j] = type[which.max(c(hor, ver, dia))];
}
}
align1 = c();align2 = c();
while(i > 1 && j > 1){
if(D[i, j] == "\u2b09") {
align1 = c(rownames(M)[i], align1);
align2 = c(colnames(M)[j], align2);
j = j - 1; i = i - 1;
} else if (D[i, j] == "\u2191") {
align1 = c(rownames(M)[i], align1);
align2 = c("-", align2);
i = i - 1;
} else if (D[i, j] == "\u2190") {
align1 = c("-", align1);
align2 = c(colnames(M)[j], align2);
j = j - 1;
}
}
return(list(aligned_seqs = matrix(c(align1, align2), byrow = TRUE, nrow = 2), score = M[nrow(M), ncol(M)], score_matrix = M, movement_matrix = D));
}
transformingToAminoacids = function(sequence){
newSequence = "";
for(index in seq(1, length(sequence), 3))
newSequence = paste(newSequence, substr(AMINOACIDS[paste(sequence[index], sequence[index+1], sequence[index+2], sep = '', collapse = '')], 1, 1), sep = '', collapse = '');
return(newSequence);
}
gettingMetadata = function(sequence){
sequence = unlist(strsplit(attr(sequence,"Annot"), "\\[|\\]|:|=|\\.|join|\\(|\\)|,"));
return(sequence[sequence!="" & sequence!=" "]);
}
registeringData = function(genomeDenomination, genome){
for (index in 1:length(genome)){
sequence = toupper(genome[[index]]);
sequence[sequence == 'T'] = 'U';
metadata = gettingMetadata(sequence);
data[nrow(data) + 1, ] <<- list(genomeDenomination, metadata[which(metadata=="gene") + 1], metadata[which(metadata=="protein") + 1], as.integer(metadata[which(metadata=="location") + 2]) - as.integer(metadata[which(metadata=="location") + 1]) + 1, as.integer(metadata[which(metadata=="location") + 1]), as.integer(metadata[which(metadata=="location") + 2]), paste(sequence, sep = '', collapse = ''), transformingToAminoacids(sequence));
}
}
analysis = function(indexOriginVirus, index, index2, final){
if(data[index, ]$gene==data[index2, ]$gene){
origin = data[index2, ]$nucleotids;
solution = needleman(data[index, ]$aminoacids, data[index2, ]$aminoacids, gap = -1, mismatch = -1, match = 0);
similarity[nrow(similarity) + 1, ] <<- list(paste(data[index, ]$gene, originVirus[indexOriginVirus], finalVirus, sep="_", collapse = ""), originVirus[indexOriginVirus], finalVirus, data[index, ]$gene, round(levenshteinSim(data[index, ]$nucleotids, data[index2, ]$nucleotids)*100,3), round(levenshteinSim(data[index, ]$aminoacids, data[index2, ]$aminoacids)*100, 3));
similarity[nrow(similarity) + 1, ] <<- list(paste(data[index, ]$gene, originVirus[indexOriginVirus], finalVirus, sep="_", collapse = ""), originVirus[indexOriginVirus], "Alineamiento A", data[index, ]$gene, "", round(levenshteinSim(data[index, ]$aminoacids, paste(solution$aligned_seqs[1,], sep = '', collapse = ''))*100,3));
similarity[nrow(similarity) + 1, ] <<- list(paste(data[index, ]$gene, originVirus[indexOriginVirus], finalVirus, sep="_", collapse = ""), finalVirus, "Alineamiento A", data[index, ]$gene, "", round(levenshteinSim(data[index2, ]$aminoacids, paste(solution$aligned_seqs[1,], sep = '', collapse = ''))*100, 3));
similarity[nrow(similarity) + 1, ] <<- list(paste(data[index, ]$gene, originVirus[indexOriginVirus], finalVirus, sep="_", collapse = ""), originVirus[indexOriginVirus], "Alineamiento B", data[index, ]$gene, "", round(levenshteinSim(data[index, ]$aminoacids, paste(solution$aligned_seqs[2,], sep = '', collapse = ''))*100, 3));
similarity[nrow(similarity) + 1, ] <<- list(paste(data[index, ]$gene, originVirus[indexOriginVirus], finalVirus, sep="_", collapse = ""), finalVirus, "Alineamiento B", data[index, ]$gene, "", round(levenshteinSim(data[index2, ]$aminoacids, paste(solution$aligned_seqs[2,], sep = '', collapse = ''))*100, 3));
for(index3 in 1:nchar(final)){
if(substr(final, index3, index3)!=substr(origin, index3, index3)){
codon = ceiling(index3/3);
finalCodon = substr(final, 3*codon-2, 3*codon);
finalCodonDenomination = AMINOACIDS[paste(finalCodon, sep = '', collapse = '')];
originCodon = substr(origin, 3*codon-2, 3*codon);
originCodonDenomination = AMINOACIDS[paste(originCodon, sep = '', collapse = '')];
if(indexOriginVirus==1) {
mutations[nrow(mutations)+1, ] <<- list(paste(substr(origin, index3, index3), "to", substr(final, index3, index3), sep = '', collapse = ''), data[index, ]$initialLocation + index3, paste(paste(originCodon, sep = '', collapse = ''), "to", paste(finalCodon, sep = '', collapse = ''), sep = '', collapse = ''), paste(substr(originCodonDenomination, 1, 1), "to", substr(finalCodonDenomination, 1, 1), sep = '', collapse = ''), codon, data[index, ]$protein, data[index, ]$gene, is.na(substr(originCodonDenomination, 1, 1)) | substr(finalCodonDenomination, 1, 1) != substr(originCodonDenomination, 1, 1));
} else {
mutations2[nrow(mutations2)+1, ] <<- list(paste(substr(origin, index3, index3), "to", substr(final, index3, index3), sep = '', collapse = ''), data[index, ]$initialLocation + index3, paste(paste(originCodon, sep = '', collapse = ''), "to", paste(finalCodon, sep = '', collapse = ''), sep = '', collapse = ''), paste(substr(originCodonDenomination, 1, 1), "to", substr(finalCodonDenomination, 1, 1), sep = '', collapse = ''), codon, data[index, ]$protein, data[index, ]$gene, is.na(substr(originCodonDenomination, 1, 1)) | substr(finalCodonDenomination, 1, 1) != substr(originCodonDenomination, 1, 1));
}
}
}
}
}
registeringData(finalVirus, SARSCov2);
registeringData(originVirus[1], RaTG13);
registeringData(originVirus[2], MERSCov);
for(index in 1:length(SARSCov2)){
for(index2 in (length(SARSCov2)+1):(length(SARSCov2)+length(RaTG13)))
analysis(1, index, index2, data[index, ]$nucleotids);
for(index2 in (length(SARSCov2)+length(RaTG13)+1):(length(SARSCov2)+length(RaTG13)+length(MERSCov)))
analysis(2, index, index2, data[index, ]$nucleotids);
}
graphic1a = ggplot(mutations) +
aes(x=mutation, fill=mutation, label=..count..) +
ggtitle(paste("Diferencias totales entre ", originVirus[1], " y ", finalVirus, '.', sep = '', collapse = '')) +
labs(x="Diferencia (pseudomutación).", y="Frecuencia.", fill="Pseudomutaciones.") +
geom_bar(stat = "count") +
geom_text(stat = "count", aes(vjust=1)) +
theme(axis.title=element_text(size=10,face="bold"));
graphic2a = ggplot(mutations[mutations$change==TRUE, ]) +
aes(x=mutation, fill=mutation, label=..count..) +
ggtitle(paste("Diferencias significativas entre ", originVirus[1], " y ", finalVirus, '.', sep = '', collapse = '')) +
labs(x="Diferencia (pseudomutación).", y="Frecuencia.", fill="Pseudomutaciones.") +
geom_bar(stat = "count") +
geom_text(stat = "count", aes(vjust=1)) +
theme(axis.title=element_text(size=10,face="bold"));
graphic3a = ggplot(similarity[similarity$nucleotidePercentage>0 & similarity$originVirus==originVirus[1], ], aes(x = gene, y = aminoacidPercentage, fill = type)) +
ggtitle(paste("Porcentaje de similitud de genes estructurales entre ", originVirus[1], " y ", finalVirus, '.', sep = '', collapse = '')) +
geom_bar(stat = "identity") +
geom_text(aes(label = aminoacidPercentage), vjust = 1, hjust=2, colour = "white") +
theme(legend.position = "bottom") +
theme(axis.title=element_text(size=10,face="bold")) +
coord_flip();
graphic4a = ggplot(similarity[similarity$originVirus==originVirus[1] & similarity$gene=='S', ], aes(x = finalVirus, y = aminoacidPercentage, fill = type)) +
ggtitle(paste("Porcentaje de similitud del gen S de ", originVirus[1], '.', sep = '', collapse = '')) +
geom_bar(stat = "identity") +
geom_text(aes(label = aminoacidPercentage), vjust = 1, hjust=2, colour = "white") +
theme(legend.position = "bottom") +
theme(axis.title=element_text(size=10,face="bold")) +
coord_flip();
graphic5a = ggplot(similarity[similarity$originVirus==originVirus[1] & similarity$gene=='N', ], aes(x = finalVirus, y = aminoacidPercentage, fill = type)) +
ggtitle(paste("Porcentaje de similitud del gen N de ", originVirus[1], '.', sep = '', collapse = '')) +
geom_bar(stat = "identity") +
geom_text(aes(label = aminoacidPercentage), vjust = 1, hjust=2, colour = "white") +
theme(legend.position = "bottom") +
theme(axis.title=element_text(size=10,face="bold")) +
coord_flip();
graphic6a = ggplot(similarity[similarity$originVirus==originVirus[1] & similarity$gene=='M', ], aes(x = finalVirus, y = aminoacidPercentage, fill = type)) +
ggtitle(paste("Porcentaje de similitud del gen M de ", originVirus[1], '.', sep = '', collapse = '')) +
geom_bar(stat = "identity") +
geom_text(aes(label = aminoacidPercentage), vjust = 1, hjust=2, colour = "white") +
theme(legend.position = "bottom") +
theme(axis.title=element_text(size=10,face="bold")) +
coord_flip();
graphic7a = ggplot(similarity[similarity$originVirus==originVirus[1] & similarity$gene=='E', ], aes(x = finalVirus, y = aminoacidPercentage, fill = type)) +
ggtitle(paste("Porcentaje de similitud del gen E de ", originVirus[1], '.', sep = '', collapse = '')) +
geom_bar(stat = "identity") +
geom_text(aes(label = aminoacidPercentage), vjust = 1, hjust=2, colour = "white") +
theme(legend.position = "bottom") +
theme(axis.title=element_text(size=10,face="bold")) +
coord_flip();
graphic1b = ggplot(mutations2) +
aes(x=mutation, fill=mutation, label=..count..) +
ggtitle(paste("Diferencias totales entre ", originVirus[2], " y ", finalVirus, '.', sep = '', collapse = '')) +
labs(x="Diferencia (pseudomutación).", y="Frecuencia.", fill="Pseudomutaciones.") +
geom_bar(stat = "count") +
geom_text(stat = "count", aes(vjust=1)) +
theme(axis.title=element_text(size=10,face="bold"));
graphic2b = ggplot(mutations2[mutations$change==TRUE, ]) +
aes(x=mutation, fill=mutation, label=..count..) +
ggtitle(paste("Diferencias significativas entre ", originVirus[2], " y ", finalVirus, '.', sep = '', collapse = '')) +
labs(x="Diferencia (pseudomutación).", y="Frecuencia.", fill="Pseudomutaciones.") +
geom_bar(stat = "count") +
geom_text(stat = "count", aes(vjust=1)) +
theme(axis.title=element_text(size=10,face="bold"));
graphic3b = ggplot(similarity[similarity$nucleotidePercentage>0 & similarity$originVirus==originVirus[2], ], aes(x = gene, y = aminoacidPercentage, fill = type)) +
ggtitle(paste("Porcentaje de similitud de genes estructurales entre ", originVirus[2], " y ", finalVirus, '.', sep = '', collapse = '')) +
geom_bar(stat = "identity") +
geom_text(aes(label = aminoacidPercentage), vjust = 1, hjust=2, colour = "white") +
theme(legend.position = "bottom") +
theme(axis.title=element_text(size=10,face="bold")) +
coord_flip();
graphic4b = ggplot(similarity[similarity$originVirus==originVirus[2] & similarity$gene=='S', ], aes(x = finalVirus, y = aminoacidPercentage, fill = type)) +
ggtitle(paste("Porcentaje de similitud del gen S de ", originVirus[2], '.', sep = '', collapse = '')) +
geom_bar(stat = "identity") +
geom_text(aes(label = aminoacidPercentage), vjust = 1, hjust=2, colour = "white") +
theme(legend.position = "bottom") +
theme(axis.title=element_text(size=10,face="bold")) +
coord_flip();
graphic5b = ggplot(similarity[similarity$originVirus==originVirus[2] & similarity$gene=='N', ], aes(x = finalVirus, y = aminoacidPercentage, fill = type)) +
ggtitle(paste("Porcentaje de similitud del gen N de ", originVirus[2], '.', sep = '', collapse = '')) +
geom_bar(stat = "identity") +
geom_text(aes(label = aminoacidPercentage), vjust = 1, hjust=2, colour = "white") +
theme(legend.position = "bottom") +
theme(axis.title=element_text(size=10,face="bold")) +
coord_flip();
graphic6b = ggplot(similarity[similarity$originVirus==originVirus[2] & similarity$gene=='M', ], aes(x = finalVirus, y = aminoacidPercentage, fill = type)) +
ggtitle(paste("Porcentaje de similitud del gen M de ", originVirus[2], '.', sep = '', collapse = '')) +
geom_bar(stat = "identity") +
geom_text(aes(label = aminoacidPercentage), vjust = 1, hjust=2, colour = "white") +
theme(legend.position = "bottom") +
theme(axis.title=element_text(size=10,face="bold")) +
coord_flip();
graphic7b = ggplot(similarity[similarity$originVirus==originVirus[2] & similarity$gene=='E', ], aes(x = finalVirus, y = aminoacidPercentage, fill = type)) +
ggtitle(paste("Porcentaje de similitud del gen E de ", originVirus[2], '.', sep = '', collapse = '')) +
geom_bar(stat = "identity") +
geom_text(aes(label = aminoacidPercentage), vjust = 1, hjust=2, colour = "white") +
theme(legend.position = "bottom") +
theme(axis.title=element_text(size=10,face="bold")) +
coord_flip();
save.image("analysis.RData");