-
Notifications
You must be signed in to change notification settings - Fork 1
/
lm_analysis.R
224 lines (196 loc) · 9.18 KB
/
lm_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# Cacao Landmark Analysis for Wolcott et al. 2023, APPS
# Last updated on Apr 15 by K Wolcott
# If on Mac, first install XQuartz 2.7.11
## https://www.xquartz.org/releases/XQuartz-2.7.11.html
# Open terminal, cd to lm directory (one folder per specimen with multiple lm files), and remove lms only used for measurements, not gmm
# LM names to remove: OvaryH, StigLen, StigToStam, StyleH, ovary_style_stam, ovary_perimeter
#find . -name '*Ovary*' #-delete
#find . -name '*ovary*' #-delete
#find . -name '*Stig*' #-delete
#find . -name '*Style*' #-delete
# Load packages using pacman
#install.packages("pacman")
pacman::p_load(devtools, geomorph, rgl, purrr, dplyr, glue, abind, stringr,
SlicerMorphR, ggplot2, ggforce, concaveman, Rcpp)
# Set working directory (define your data_wd below)
data_wd = "/Users/katherinewolcott/Documents/r/cacao"
lms_wd = paste(data_wd, "/lms", sep="")
setwd(lms_wd)
# Build df of all landmark files for each specimen
## Get paths for all specimens (1 folder each containing several landmark files)
dirs = list.dirs(".", full.names=FALSE, recursive=FALSE)
## Populate df with all landmark files per specimen
all_lms = lapply(dirs, function(y) {
glue("Fetching landmark files for specimen: {y}")
path = paste(y, '/lms/', sep='')
files <- dir(path, pattern = "*.json")
glue("Compiling landmark files: {files}")
lms <- files %>% map_df(~data.frame(read.markups.json(
file.path(path, .))) %>% mutate_if(is.numeric, as.character))
rownames(lms) = NULL
lms["spec_id"] = y
rbind(lms)
})
## Optional: Check df to make sure no specimens are missing any lms
#for(i in 1:length(all_lms)) {
#print(i)
#print(nrow(all_lms[[i]][1]))
#}
## Export to file
df = bind_rows(all_lms) # Normalize dataframe with landmarks as rows, coords, spec_id, and lm_id as cols
write.table(df, "all_lms_wout-names.tsv", sep = "\t", row.names = FALSE)
# Add specimen and landmark names to all_lms dataframe
## Make list of specimen names
fpath_idx = lapply(dirs, function(y) {
path = paste(y, '/lms/', sep='')
files <- dir(path, pattern = "*.json")
fpath_idx = list(path)
})
fpath_idx = unlist(fpath_idx)
write.table(fpath_idx, "specimen_idx.tsv", sep = "\t")
## Add specimen and landmark names to all_lms dataframe
master_lm_df = read.table("all_lms_wout-names.tsv", sep = "\t", header = TRUE)
## Make list of landmark names (lm curve name, number of lm points)
## hacky step, but avoids file name irregularities resulting from duplicate naming in 3D slicer
lm_names = c(rep("fil_bot", 15), rep("fil_top", 10), rep("ligule_length", 20),
rep("ligule_width", 15), rep("petal_curveL", 20), rep("petal_curveR", 20),
rep("sepal_length", 20), rep("sepal_width", 10), rep("staminode_height", 5))
master_lm_df["lm_name"] = rep(lm_names, length(unique(df$spec_id)))
## Add numeric labels for 135 landmarks for each specimen
master_lm_df["lm_id"] = rep(c(1:135), length(all_lms))
master_lm_df = master_lm_df[, c(4, 5, 6, 1, 2, 3)] # re-order columns for easier reading
## Optional: Check if there are any NA's
#master_lm_df[!complete.cases(master_lm_df),]
## Write to file File S2 - all_lms
write.table(master_lm_df, "all_lms.tsv", sep = "\t", row.names = FALSE)
# Format landmark coordinates as an array for geomorph
df_lms = df[c('X', 'Y', 'Z')] # Take only coodinates from all_lms df
df_lms_numeric = mutate_all(df_lms, function(x) as.numeric(as.character(x))) # Force as numeric
m = data.matrix(df_lms_numeric) # Make into an array
a = arrayspecs(A=m, p=135, k=3, sep=NULL) # Define array shapes
# Inspect data
a[,,1] # Inspect one specimen
a[1,,] # Inspect one landmark for all specimens
a[,1,] # Inspect landmark coord 1 of 3 for all specimens
# Define sliding landmarks for curves of flower
setwd(data_wd)
sliders = read.table("lm_curves.tsv", header=TRUE)
sliders = sliders[4:6]
# Make specimen info df to sort lms by groups (location, month, self compat, etc)
## Load in specimen id mapping file with names, collection info
spec_ids = read.table("map_specimen_ids.tsv", header=T, sep="\t")
spec_ids$spec_id<-tolower(spec_ids$spec_id)
## Load in specimen idx file to add specimen names to numbers for interpreting lm analysis
setwd(lms_wd)
spec_nums = read.table("specimen_idx.tsv", sep = "\t", row.names=1)
spec_nums["spec_num"] = row.names(spec_nums)
spec_nums["spec_id"] = spec_nums$x
spec_nums["spec_id"] = str_replace(spec_nums[,3], '/lms/', '')
spec_nums$spec_id<-tolower(spec_nums$spec_id)
## Combine files
spec_info <- merge(spec_ids, spec_nums)
colnames(spec_info)[7] <- 'tree_id'
## Set variables of interest as factors
spec_info$col_month = as.factor(spec_info$col_month) # collection month
spec_info$selfCompatible = as.factor(spec_info$selfCompatible) # self compatible
spec_info$col_loc = as.factor(spec_info$col_loc) # collection location
spec_info$tree_id = as.factor(spec_info$tree_id) # tree id
# Run GPA to align and scale specimens use Procrustes Distance for sliding
Y.gpa <- gpagen(a, curves = sliders, ProcD = TRUE)
summary(Y.gpa)
plot(Y.gpa)
# Get mean shape coordinates (reference shape) after GPA alignment
ref = mshape(Y.gpa$coords, na.action = 3)
plot(ref)
# Compare a specimen to mean consensus shape
plotRefToTarget(ref,Y.gpa$coords[,,1],method="points")
plotRefToTarget(ref,Y.gpa$coords[,,1],method="TPS")
# Plot outliers
outliers = plotOutliers(Y.gpa$coords)
# Find the specimen with the closest values to the mean shape
meanspec = findMeanSpec(Y.gpa$coords)
# Compare an individual specimen to the mean shape using different plotting methods
target = Y.gpa$coords[,,1] # Normal specimen
## Thin plate spline
plotRefToTarget(ref, target, method="TPS")
## Lollipops
plotRefToTarget(ref, target, method="vector")
## Points
plotRefToTarget(ref, target, method="points")
# PCA, all specimens
pca = gm.prcomp(Y.gpa$coords)
sum = summary(pca)
prop_var = sum$PC.summary[2,1:2]
# Plot PCA in ggplot
dtp = data.frame('tree_id'=spec_info$tree_id, # Take first 2 components of PCA
'PC1'=pca$x[,1], 'PC2'=pca$x[,2])
## Plot parameters
p = ggplot(data = dtp, aes(x = PC1, y = PC2, col = tree_id)) +
# Add polygons around each tree
geom_mark_hull(concavity = 5, expand=0, radius=0, aes(fill=tree_id))+
geom_point() +
theme_minimal()
## Add labels
p + labs(x=glue("PC1: {round(prop_var[1], 4)*100}%"),
y=glue("PC2: {round(prop_var[2], 4)*100}%"))
# Make deformation meshes to manually add to PCA plots outside of R
## Compare min and max from PCA to mean consensus shape
ref<-mshape(Y.gpa$coords) # mean shape
## Component 1
plotRefToTarget(ref, pca$shapes$shapes.comp1$min) # min shape to mean
plotRefToTarget(ref, pca$shapes$shapes.comp1$max) # max shape to mean
## Component 2
plotRefToTarget(ref, pca$shapes$shapes.comp2$min) # min shape to mean
plotRefToTarget(ref, pca$shapes$shapes.comp2$max) # max shape to mean
# Procrustes Anova
## Make a geomorph dataframe
gdf = geomorph.data.frame(Y.gpa, tree=spec_info$tree_id,
col_loc=spec_info$col_loc, col_month=spec_info$col_month,
self_compat=spec_info$selfCompatible)
attributes(gdf)
## Run anova
### Null fit model (no group effects)
fit = procD.lm(coords ~ log(Csize), data = gdf)
aov = anova(fit)
summary(aov)
### Common fit model (group effects of tree_id, col_month)
fit.common = procD.lm(coords ~ log(Csize) + tree + col_month, data = gdf, SS.type = "III")
aov.common = anova(fit.common)
summary(aov.common)
### Common fit model sc (group effects of tree_id, self_compat, col_month)
fit.common.sc = procD.lm(coords ~ log(Csize) + self_compat + tree + col_month, data = gdf, SS.type = "III")
aov.common.sc = anova(fit.common.sc)
summary(aov.common.sc)
### Common fit model sc (group effects of tree_id, self_compat, col_month)
fit.common.sc = procD.lm(coords ~ log(Csize) + self_compat + tree + col_month, data = gdf, SS.type = "III")
aov.common.sc = anova(fit.common.sc)
summary(aov.common.sc)
# Compare fit models
anova(fit, fit.common, fit.common.sc, SS.type = "III")
# Pairwise test of unique vs unique sc
PW = pairwise(fit = fit.common.sc, fit.null = fit, groups = gdf$self_compat, covariate=gdf$Csize)
summary(PW, confidence = 0.95)
PW = pairwise(fit = fit.common.sc, fit.null = fit, groups = interaction(gdf$self_compat, gdf$tree))
summary(PW, confidence = 0.95)
## Export results
capture.output(summary(aov),file="anova.txt")
# Generate descriptive statistics for Table 1 - flower measurements
setwd(data_wd)
fpath = "measurements_raw.tsv"
meas = read.table(fpath, sep="\t", header=TRUE)
## Inspect raw data
head(meas)
str(meas)
## Drop columns with qualifying variables, keep only measurements
meas_cols = na.omit(meas[,4:18])
## Calculate min, mean, med, max, and sd
meas_stats = data.frame(lapply(meas_cols, min), row.names = "min")
meas_stats = rbind(meas_stats, data.frame(lapply(meas_cols, mean), row.names = "mean"))
meas_stats = rbind(meas_stats, data.frame(lapply(meas_cols, median), row.names = "median"))
meas_stats = rbind(meas_stats, data.frame(lapply(meas_cols, max), row.names = "max"))
meas_stats = rbind(meas_stats, data.frame(lapply(meas_cols, sd), row.names = "sd"))
## Calculate rsd (mean/sd)
t = data.frame(t(meas_stats))
t["rsd"] = t[,5]/t[,2]
## Export to file - Table 1
write.table(t, "Table_1-measurements.tsv", sep="\t")