-
Notifications
You must be signed in to change notification settings - Fork 0
/
sarcoRmarkdown.Rmd
193 lines (192 loc) · 8.68 KB
/
sarcoRmarkdown.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
---
title: "Sarcoscypha Phylogeny"
author: "Juliana Leshchenko"
date: "2/17/2022"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
Read the data
```{r}
library(ape) # Analysis of phylogenetics and evolution
library(hierfstat) # Hierarchical F-statistics
library(corrplot) # Visualization of correlation matrix
sarco.dna<-read.dna(file="sarco_seq.fasta", format = "fasta")
sarco.dna
class(sarco.dna)
```
## Multiple sequence alignment
```{r}
library(ips) #MAFFT is available here
#Requires path to MAFFT binary - set it according to your installation
sarco.mafft <- mafft(x=sarco.dna, method="localpair", maxiterate=100, options="--adjustdirection", exec="C:/Users/Mycolab-2017/Desktop/mafft-win/mafft")
sarco.mafft
### Delete all columns containing at least 25% of gaps
sarco.mafft.ng <- deleteGaps(x=sarco.mafft,gap.max=nrow(sarco.mafft)/4)
### Delete every line (sample) containing at least 20% of missing data
sarco.mafft.ng <- del.rowgapsonly(x=sarco.mafft.ng, threshold=0.2, freq.only=FALSE)
# Delete every alignment position having at least 20% of missing data
sarco.mafft.ng <- del.colgapsonly(x=sarco.mafft.ng, threshold=0.2, freq.only=FALSE)
sarco.mafft.ng
class(sarco.mafft.ng)
image.DNAbin(sarco.mafft.ng) # Plot the alignment
# Check the alignment
checkAlignment(x=sarco.mafft.ng, check.gaps=TRUE, plot=TRUE, what=1:4)
library(adegenet)
```
### Checking SNPs
```{r}
# Position of polymorphism within alignment - snpposi.plot() requires input data in form of matrix
snpposi.plot(x=as.matrix(sarco.mafft.ng), codon=FALSE)
# Position of polymorphism within alignment - differentiating codons
snpposi.plot(as.matrix(sarco.mafft.ng))
# When converting to genind object, only polymorphic loci are kept - threshold for polymorphism can be arbitrary (polyThres=...)
sarco.genind <- DNAbin2genind(x=sarco.mafft.ng, polyThres=0.01)
sarco.genind # See it
# Check sequences
# Nucleotide diversity
pegas::nuc.div(x=sarco.mafft.ng)
# Base frequencies
ape::base.freq(x=sarco.mafft.ng)
# GC content
ape::GC.content(x=sarco.mafft.ng)
# Number of times any dimer/trimer/etc oligomers occur in a sequence
seqinr::count(seq=as.character.DNAbin(sarco.dna[["MZ227236_Sarcoscypha_coccinea_Chernivtsi_UA"]]), wordsize=3)
```
### Distance-based phylogenies
#### Model selection
```{r}
library(phangorn)
# Conversion to phyDat for phangorn
sarco.phydat <- as.phyDat(sarco.mafft.ng) # Prepare starting tree
modelTest(object=as.phyDat(sarco.mafft.ng), tree=nj(dist.dna(x=sarco.mafft.ng,model="raw")))
```
```{r}
# Create the distance matrix
sarco.dist <- dist.dna(x=sarco.mafft.ng, model="F81")
```
sarco.dist is an object of class dist which contains the distances between every pairs of sequences.
Check the resulting distance matrix
```{r}
sarco.dist
class(sarco.dist)
dim(as.matrix(sarco.dist))
```
Now that genetic distances between samples have been computed, we need to visualize this information.
There are n(n − 1)/2 distances for n sequences, and most of the time summarizing this information is not entirely
trivial. The simplest approach is plotting directly the matrix of pairwise
distances:
```{r}
library(ade4) #Analysis of ecological data, multivariate methods
par(mfrow=c(1,1))
temp <- as.data.frame(as.matrix(sarco.dist))
table.paint(temp, cleg = 0, clabel.row = 0.5, clabel.col = 0.5)
# Same visualization, colored
heatmap(x=as.matrix(sarco.dist), Rowv=NA, Colv=NA, symm=TRUE)
```
Dendrogram
```{r}
#This is very basic function to make dendrogram
plot(hclust(d=sarco.dist, method="complete")) #hierarchical clustering
```
##Building trees. UPGMA
```{r}
#Saving as phylo object (and not hclust) gives more possibilities for further plotting and manipulations
sarco.upgma <- as.phylo(hclust(d=sarco.dist, method="average"))
plot.phylo(x=sarco.upgma, cex=0.75)
title("UPGMA tree") #looks ok
```
## Building trees. Neighbor-Joining
```{r}
sarco.nj <- nj(sarco.dist)
class(sarco.nj)
# Plot a basic tree
plot.phylo(x=sarco.nj, type="phylogram")
sarco.tree <- nj(X=sarco.dist)
plot.phylo(x=sarco.tree, type="unrooted")
title("Unrooted NJ tree")
# an improved version of Neighbor-Joining
sarco.bionj <- bionj(sarco.dist)
# Plot a basic tree
plot.phylo(x=sarco.nj, type="phylogram")
sarco.tree <- nj(X=sarco.dist)
plot.phylo(x=sarco.tree, type="unrooted")
title("Unrooted NJ tree")
```
## Assessing the quality of a phylogeny
```{r}
#Test quality - tests correlation of original distance in the matrix and reconstructed distance from hclust object
plot(x=as.vector(sarco.dist), y=as.vector(as.dist(
cophenetic(sarco.upgma))), xlab="Original pairwise distances",
ylab="Pairwise distances on the tree", main="Is UPGMA appropriate?", pch=20, col=transp(col="black",
alpha=0.1), cex=2)
abline(lm(as.vector(as.dist(cophenetic(sarco.upgma)))~as.vector(sarco.dist)), col="red")
cor.test(x=as.vector(sarco.dist), y=as.vector(as.dist(cophenetic(sarco.upgma))), alternative="two.sided") # Testing the correlation
# For NJ
plot(x=as.vector(sarco.dist), y=as.vector(as.dist(
cophenetic(sarco.nj))), xlab="Original pairwise distances",
ylab="Pairwise distances on the tree", main="Is simple NJ appropriate?", pch=20, col=transp(col="black",
alpha=0.1), cex=2)
abline(lm(as.vector(as.dist(cophenetic(sarco.nj)))~as.vector(sarco.dist)), col="red")
cor.test(x=as.vector(sarco.dist), y=as.vector(as.dist(cophenetic(sarco.nj))), alternative="two.sided")
# For bioNJ
plot(x=as.vector(sarco.dist), y=as.vector(as.dist(
cophenetic(sarco.bionj))), xlab="Original pairwise distances",
ylab="Pairwise distances on the tree", main="Is the improved version of NJ appropriate?", pch=20, col=transp(col="black",alpha=0.1), cex=2)
abline(lm(as.vector(as.dist(cophenetic(sarco.bionj)))~as.vector(sarco.dist)), col="red")
cor.test(x=as.vector(sarco.dist), y=as.vector(as.dist(cophenetic(sarco.bionj))), alternative="two.sided")
```
In this case, UPGMA is a poor choice. Why is this? UPGMA forces ultrametry (all the tips are equidistant to the root). The underlying assumption is that all lineages have undergone the same amount of evolution, but we see that the species evolved at different speeds. Improved version of Neighbor-Joining tree (sarco.bionj) is a the best representation of the chosen genetic distances.
```{r}
# Linear model for above graph
summary(lm(as.vector(sarco.dist) ~
as.vector(as.dist(cophenetic(sarco.bionj))))) # Prints summary text
```
## Calculate bootstrap
```{r}
sarco.tree1 <- bionj(X=sarco.dist)
fit = pml(sarco.tree1, data=sarco.phydat)
fit
methods(class="pml")
fitJC <- optim.pml(fit, TRUE)
logLik(fitJC)
bs = bootstrap.pml(fitJC, bs=1000, optNni=TRUE,
control = pml.control(trace = 0))
plotBS(midpoint(fitJC$tree), bs, p = 50, type="p")
title("bioNJ tree + bootstrap values")
```
The output gives the number of times each node was identified in bootstrapped analyses (the order is the same as in the original object).All major branches have bootstrap support > 70%.
## Building trees. Maximum parsimony
```{r}
parsimony(sarco.upgma, sarco.phydat)
parsimony(sarco.nj, sarco.phydat)
#returns the parsimony score, that is the number of changes which are at least necessary to describe the data for a given tree
#The tree rearrangement implemented are nearest-neighbor interchanges (NNI) and subtree pruning and regrafting (SPR).
treePars <- optim.parsimony(sarco.upgma, sarco.phydat) #performs tree rearrangements to find trees with a lower parsimony score
treeRatchet <- pratchet(sarco.phydat, trace = 0) #parsimony ratchet (Nixon 1999) implemented
parsimony(c(treePars, treeRatchet), sarco.phydat)
treeRatchet <- acctran(treeRatchet, sarco.phydat) #assign branch length to the tree. The branch length are proportional to the number of substitutions / site.
plotBS(midpoint(treeRatchet), type="phylogram")
#Maximum parsimony 2
tre.ini <- nj(dist.dna(sarco.mafft.ng,model="raw")) #reconstruct a tree
tre.ini
parsimony(tre.ini, sarco.phydat) #measure this tree’s parsimony
sarco.pars <- optim.parsimony(tre.ini, sarco.phydat) #the most parsimonious possible tree
sarco.pars
parsimony(sarco.pars, sarco.phydat)
library(ape)
plot(sarco.pars, type="unr", edge.width=2)
title("Maximum-parsimony tree")
```
## Visualize the trees
```{r}
library(ggtree)
library(TDbook)
p <- ggtree(sarco.upgma) + geom_tiplab(size=3) #upgma
msaplot(p, sarco.mafft.ng, offset=3, width=15)
p <- ggtree(sarco.nj) + geom_tiplab(size=3) #nj
msaplot(p, sarco.mafft.ng, offset=3, width=15)
p <- ggtree(treeRatchet) + geom_tiplab(size=3) #Maximum parsimony
msaplot(p, sarco.mafft.ng, offset=3, width=1)
```