-
Notifications
You must be signed in to change notification settings - Fork 0
/
tree_props.R
173 lines (142 loc) · 6.16 KB
/
tree_props.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#CALCULATE TREE AND ALIGNMENT PROPERTIES
#Taken from M. Borowiec GitHub (https://github.com/marekborowiec/good_genes)
#Modified for HybPhyloMaker
#--------------------------------------------------------------------------
# disable scientific notation
options(scipen=999)
# install required libraries
# uncomment if you don't have these installed
#install.packages("ape")
#install.packages("seqinr")
#install.packages("data.table")
# load needed libraries
library("ape")
library("seqinr")
library("data.table")
### SET WORKING DIRECTORY ###
#all trees must be in dir 'trees'
trees_dir <- file.path("./trees/")
#all alignments must be in dir 'alignments'
aln_dir <- file.path("./alignments/")
#names of the trees must be 'something_LOCUSNR_somethingelse.tre'
trees_files <- dir(path=trees_dir, pattern="*tre")
#alignment files must be 'something.fas'
aln_files <- dir(path=aln_dir, pattern="*fas")
### AVERAGE BOOTSTRAP SUPPORT ###
# This code should work with any Newick tree files
# that have some measure of support at nodes,
# including PP from Bayesian analysis.
# define a function to calculate average support
Avg_support <- function(file) {
# get Assembly number from filename (second part of file name separated by '_')
locus_no <- strsplit(file, "_")
locus_no <- locus_no[[1]][2]
# read in tree
tree <- read.tree(paste(trees_dir,file, sep="/"))
# store support values in a vector
support <- c(as.numeric(tree$node.label))
# calculate average support
avg_supp <- mean(support, na.rm=T)
return(c(locus_no,avg_supp))
}
# loop over all files
average_bootstrap <- lapply(trees_files, Avg_support)
average_bootstrap <- data.frame(matrix(unlist(average_bootstrap), nrow=(length(average_bootstrap)), byrow=T))
colnames(average_bootstrap) <- c("Locus", "Average_bootstrap")
### AVERAGE BRANCH LENGTHS ###
# This takes a Newick tree with branch lengths
# and returns the number of tips for each tree,
# and calculates average branch length.
Br_length.trees <- function(file) {
# reads the phylogenetic tree
tree <- read.tree(paste(trees_dir,file, sep="/"))
# gets number of tips
no_tips <- length(tree$tip.label)
# calculate avg branch length
avg_br_length <- mean(tree$edge.length)
# get Assembly number from filename (second part of file name separated by '_')
locus_no <- strsplit(file, "_")
locus_no <- locus_no[[1]][2]
return(c(locus_no,avg_br_length))
}
# loop over all files
br_lengths <- lapply(trees_files, Br_length.trees)
br_lengths <- data.frame(matrix(unlist(br_lengths), nrow=(length(br_lengths)), byrow=T))
colnames(br_lengths) <- c("Locus", "Average_branch_length")
### SATURATION ###
# The code below takes as input FASTA alignments and tree files with branch lengths
# and calculates regression slope and r-squared for each locus
# also saving regression plots for each locus
# in a newly created 'Saturation_plots' folder
# define function to perform regression,
# calculate R-squared, and save saturation plots
# needs fasta alignments and
# corresponding raxml's tree files
Saturation <- function(seq, tree, locus_no) {
# read in alignment
alignment <- read.alignment(file=seq, format="fasta")
# read in tree
tree <- read.tree(tree)
# get locus number from filename
# this has to be different regex from other functions used here
# because file names here include directory
locus_no <- strsplit(locus_no, "_")
locus_no <- locus_no[[1]][2]
# matrix with pairwise identity
mat <- dist.alignment(alignment, matrix="identity")
# matrix with uncorrected p-distances
p_mat <- mat*mat
# mean p-distance
avg_p_mat <- mean(p_mat, na.rm=TRUE)
# make matrix of pairwise distances in branch lengths from the tree
cophentr <- cophenetic(tree)
# store as matrix objects
mat_mat <- as.matrix(mat)
mat_p_mat <- as.matrix(p_mat)
# order p-distance matrix by names
mat_p_mat <- mat_p_mat[order(row.names(mat_p_mat)),order(row.names(mat_p_mat))]
mat_co <- as.matrix(cophentr)
# order pairwise distances matrix by names
mat_co <- mat_co[order(row.names(mat_co)),order(row.names(mat_co))]
# get lower triangulars of both matrices
branch_dist <- mat_co[lower.tri(mat_co)]
p_dist <- mat_p_mat[lower.tri(mat_p_mat)]
# perform simple linear regression
regress <- lm(p_dist ~ branch_dist)
# get slope
slope <- coef(regress)[2]
# get r-squared
Rsquared <- summary(regress)$r.squared
# plot branch length pairwise distances on x
# and uncorrected p-distances on y
# open png file
png(file=paste(sat_dir, paste(locus_no, "-saturation.png", sep=""), sep="/"), width=600, height=600)
plot(branch_dist, p_dist)
# add simple linear regression line
abline(lm(p_dist ~ branch_dist), col="red")
# give title as locus number and subtitle as tree length
title(main=locus_no,sub=paste("Slope: ", round(slope, digits=3), " R-squared: ", round(Rsquared, digits=3), sep=""), cex.sub=1.25)
# close png file
dev.off()
return(list(locus_no, avg_p_mat, slope, Rsquared))
}
# get current working directory
work_dir <- getwd()
# create a folder for saturation plots
dir.create("./saturation_plots")
sat_dir <- file.path(work_dir, "saturation_plots/")
# create a table with file names and column for filtering locus name
files_table <- as.data.frame(cbind(paste(aln_dir, aln_files, sep="/"), paste(trees_dir, trees_files, sep="/"), trees_files))
saturation_table <- t(mapply(Saturation, as.matrix(files_table$V1), as.matrix(files_table$V2), as.matrix(files_table$trees_files)))
saturation_table <- as.data.frame(saturation_table)
colnames(saturation_table) <- c("Locus","Avg_p_dist","Slope","R_squared")
row.names(saturation_table) <- NULL
# putting together all the data
dfs <- list(average_bootstrap, br_lengths, saturation_table)
Multmerge <- function(dfs){
datalist <- lapply(dfs, function(x){data.frame(x)})
Reduce(function(x,y) {merge(x,y)}, datalist)
}
all_loci_stats <- Multmerge(dfs)
all_loci_stats <- data.frame(lapply(all_loci_stats, as.character), stringsAsFactors=FALSE)
write.csv(all_loci_stats, file="tree_stats_table.csv", quote=FALSE, row.names=FALSE)