forked from LeifAnderssonLab/2023_Horse_mackerel_popgen
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path09-IBD.R
340 lines (264 loc) · 14.5 KB
/
09-IBD.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
###########################################################################
# Test for Isolation-by-Distance (IBD) and Isolation-by-Environment (IBE) #
###########################################################################
## Clean environment space.
rm(list=ls())
working_dir <- "/Users/angfu103/Dropbox/PostDoc_UU/Projects/HorseMackerel/Pool-seq/analysis/09-GEA/IBD"
coord_txt_file <- '~/Dropbox/PostDoc_UU/Projects/HorseMackerel/Pool-seq/analysis/09-GEA/data/HOM_12pools_coordinates.txt'
coord_csv_file <- '~/Dropbox/PostDoc_UU/Projects/HorseMackerel/Pool-seq/analysis/09-GEA/coordinates_HOM_9pools.csv'
FST_file <- "/Users/angfu103/Dropbox/PostDoc_UU/Projects/HorseMackerel/Pool-seq/analysis/07-1-popgen-stats/pairwiseFST/PairwiseFST_HOM_12_pools.UG.SNPs.hf.DP20-300.GQ20.mono.miss20.mac3.AD.Neff.AF_Feb12011_renamed.txt"
setwd(working_dir) # Set working directory.
####################################################
# Obtain pairwise least-coast distance (Km) matrix #
####################################################
## Taken from CartDist, function CartDistFunction.R, lines 1 to 138.
#coordinates - path to a csv file with the coordinates and site names. This must have three columns the first being the site name, second the longitude and third the latitude.
#min.depth - minimum depth that can be considered passable in the least-cost analysis. Note that depths are negative. Default is NA or 0.
#max.depth - maximum depth that can be considred passable in the least-cost analysis. Note that depths are negative. Default is NA or NULL meaning the analysis depths will permit movement between the min.depth and the maximum depth of the derived bathymetric layer.
#trans - is a transition object (marmap) that might have been calcualted in a previous analysis. Note that each run will return the transition object. This object can called from the workspace into the fuction.
#gridres - resolution (mins) of the bathymetric grid used for least coast path (default = 2)
#directory - if specified this is where marmap will save and or look for the output for the bathy object created by marmap.
#outpath - this is the filepath for the output from the function a .Rdata file. Note this must be a full file path ending in .RData. If no path is provided a 'Output' .RData file will be created in the current directory.
coordinates=coord_csv_file
min.depth=0
max.depth=NA
trans=NA
gridres=2
#gridres=4
directory=working_dir
outpath=paste0(working_dir,"/LeastCoastDistanceMatrix_HOM_9pools.RData")
#Libraries ----------
#Check to make sure the packages required are there and install missing
writeLines("\nChecking on package availability.\n")
packages <- c("gdistance", "ggplot2", "marmap","vegan")
if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
writeLines(paste("Installing missing packages: ",paste(setdiff(packages, rownames(installed.packages())),collapse=", "),sep=" "))
install.packages(setdiff(packages, rownames(installed.packages())))
}
#load each library
require(gdistance)
require(marmap)
require(vegan)
require(ggplot2)
## check the outpath
if(!is.na(outpath) & substring(outpath,nchar(outpath)-5,nchar(outpath))!= ".RData"){
stop("\nParamter outpath must be a full path ending in .RData. Please fix and try again.\n")
}
# Plotting settings for water bathymetry fill
blues <- c("lightsteelblue4", "lightsteelblue3",
"lightsteelblue2", "lightsteelblue1")
greys <- c(grey(0.6), grey(0.93), grey(0.99))
## read in coordinates and check format
coords <- read.csv(coordinates, header = TRUE, stringsAsFactors = FALSE)
writeLines("\nEnsure that your data is set up as three columns with the first column having the sample location name, the second with the longitude and the third with the latitude. \n")
if (length(colnames(coords))<3 | !is.numeric(coords[,2]) | !is.numeric(coords[,3])){
stop("\nCheck coordinate input, there is a problem here.\n")
}
# Clean up column names
colnames(coords)[which(sapply(coords,is.character))[1]] <- "Code"
colnames(coords)[which(sapply(coords,is.numeric))[1]] <- "Long"
colnames(coords)[which(sapply(coords,is.numeric))[2]] <- "Lat"
## Set map limits------adding and subtracting 2 degrees to make the lc.dists function work better
Lat.lim = c(min(coords$Lat)-2, max(coords$Lat)+2)
Long.lim = c(min(coords$Long)-2, max(coords$Long)+2)
#Get the bathydata and keep it
holddir <- getwd() # grab working directory
# directory is specified.
if(!is.na(directory)){
setwd(directory) # switch to the output directory
writeLines("\nGetting bathymetry data from NOAA database\n")
bathydata <- marmap::getNOAA.bathy(lon1 = Long.lim[1], lon2 = Long.lim[2], lat1 = Lat.lim[1], lat2 = Lat.lim[2],
resolution = gridres,keep=TRUE)
setwd(holddir)
} # set back to the working directory
# directory is not specified.
if(is.na(directory)){
writeLines("\nGetting bathymetry data from NOAA database\n")
bathydata <- marmap::getNOAA.bathy(lon1 = Long.lim[1], lon2 = Long.lim[2], lat1 = Lat.lim[1], lat2 = Lat.lim[2],
resolution = gridres,keep=FALSE)
}
### check depths to ensure they are all in water (e.g., no positive 'land' depths)
# Get depths and plot. If any depths > 0 we will not proceed
depths <- marmap::get.depth(bathydata, x=coords$Long, y=coords$Lat, locator=F)
colnames(depths) <- c("Long","Lat","depth")
coords <- merge(coords, depths, by=c("Long","Lat"))
#colours to assign those locations which are in water "green" and on land "red".
coords$col <- "green"
coords[coords$depth>=0,"col"] <- "red"
#coords[is.na(coords$depth),"col"] <- "red"
if(length(which(depths$depth>0))!=0){
writeLines("\nSome of your coordinates appear to have a positive depth. Refer to map and bump coordinates for those points marked as 'red' off of land. Suggest moving points farther off land for this analysis.\n")
marmap::plot.bathy(bathydata,image = TRUE, land = T, lwd = 0.03,
bpal = list(c(0, max(bathydata), greys),
c(min(bathydata), 0, blues)),deep=0,shallow=0)
marmap::plot.bathy(bathydata, lwd = 1, deep = 0, shallow = 0, step = 0, add = TRUE)
legend("bottomright",
legend = c("Water","Land"),
col=c("green","red"),
pch=19,
pt.cex=1.5,
bg="white")
points(coords$Long, coords$Lat, pch=19, cex=2, col=coords$col)
print(coords[coords$depth>0,c("Code","Long","Lat","depth")])
stop("\nFix and re-run funciton.\n")
}
# min and maximum depth for transition object
if(is.na(min.depth)){ min.depth <- 0
writeLines("\n No min.depth specified, defaulting to 0 m depth.\n")
}
if(is.na(max.depth)){ max.depth <- NULL
writeLines("\n No max.depth specified, defaulting to maximum depth of bathymetry object from marmap.\n")
}
#Make the trans mat object then do the lc dist calculation
if(is.na(trans)){
writeLines("\nCalculating transition object for least-cost analysis.\n")
trans <- marmap::trans.mat(bathydata, min.depth = min.depth, max.depth = max.depth)
}
sites <- coords[, c("Long","Lat")]
rownames(sites) <- coords[, "Code"]
sites <- sites[order(sites$Lat, decreasing = TRUE), ]
writeLines("\nCalculating least cost distances. This will probably can take a few minutes depending on resolution. If insufficient memory error is returned try adjusting the gridres argument (default = 2) to a higher number. gridres refers to the resolution of the bathymetric grid in minutes.\n")
lc.dists <- marmap::lc.dist(trans, sites, res="dist")
lc.dists
## Transform the dist class object into a matrix for subsequent analysis.
lcDist <- as.matrix(lc.dists)
lcDist
# Remove Mediterranean sample
tmp <- as.data.frame(lcDist)
tmp$MED <- NULL
tmp <- tmp[!rownames(tmp) == "MED", ]
tmp <- as.matrix(tmp)
lcDist <- tmp
# Rename rows and columns (I got the pop order from excel using the original coordinate locations).
#rownames(lcDist) <- c("Tri", "NDS", "NDF", "Lab", "Bla", "Ste", "BDO", "NsF", "Mus", "Mir", "NsS", "ScB", "InB", "7iA", "GeB", "M514")
#colnames(lcDist) <- c("Tri", "NDS", "NDF", "Lab", "Bla", "Ste", "BDO", "NsF", "Mus", "Mir", "NsS", "ScB", "InB", "7iA", "GeB", "M514")
lcDist
## Remove teo pops not used in the final analysis.
#Toremove <- c("InB", "NsF")
#lcDist <- lcDist[!(rownames(lcDist) %in% Toremove), !(colnames(lcDist) %in% Toremove)]
#lcDist
## Save the matrix with least coast distances (Km) as a CSV file.
write.table(lcDist, file = "LeastCoastDistanceMatrix_HOM_9pops.csv", row.names = TRUE, col.names = NA, sep = ',', quote = FALSE)
# Load data ---------------------------------------------------------------
# Load libraries
library(data.table)
library(tidyverse)
library(tools)
# Load the previously created lcDist file
lcDist <- read.table("LeastCoastDistanceMatrix_HOM_9pops.csv", header = TRUE, row.names = 1, sep = ',')
# Load pairwise FST matrix
pFst <- read.table(FST_file, header = TRUE, sep = "\t", row.names = 1, stringsAsFactors = FALSE) # Modify the separator for the one used in your file.
# Exclude some samples to only retain the northern samples
to_include <- c("NOS1", "NOS2", "WIE1", "NES", "NPT1")
#to_include <- c("NOS1", "NOS2", "WIE1", "WIE3", "NES", "NPT1")
#to_include <- c("WIE1", "WIE3", "NES", "NPT1")
pFst <- pFst[rownames(pFst) %in% to_include, ]
pFst <- pFst[, colnames(pFst) %in% to_include]
lcDist <- lcDist[rownames(lcDist) %in% to_include, ]
lcDist <- lcDist[, colnames(lcDist) %in% to_include]
# Reorder matrix to match the rownames and colnames of the distance matrix
pFst <- as.matrix(pFst)
pFst <- pFst[rownames(lcDist), colnames(lcDist)]
pFst
# Verify rownames are euqal between GenDit and GeoDist matrixes
all.equal(rownames(pFst), rownames(lcDist))
# Linearize FST values applying the formula FST/(1-FST)
ln_pFst <- apply(pFst, 2, function(x) x/(1-x))
ln_pFst
## Save the matrix with linerialized FSTs as a CSV file
#write.table(pFst, file="Linearized_pairwiseFST_Herring_14pops_reordered.csv", row.names=TRUE, sep=',', quote=FALSE)
# Run the isolaion-by-distance test ---------------------------------------
# Readings: http://dr-k-lo.blogspot.com/2012/04/how-to-do-mantel-test-in-r.html,
# https://stats.idre.ucla.edu/r/faq/how-can-i-perform-a-mantel-test-in-r/
# The test consists of calculating the correlation of the entries in the matrices,
# then permuting the matrices and calculating the same test statistic under each permutation and
# comparing the original test statistic to the distribution of test statistics from the permutations
# to generate a p-value. The number of permutations defines the precision with which the p-value can be calculated.
# The function to perform the Mantel test is mantel.rtest and the required arguments are the two distance matrices.
# The number of permutations can also be specified by the user, but is by default 99.
# Load library
library(ade4)
# Convert the distance matrices to dist class objects
GEOdist <- as.dist(lcDist)
GENdist <- as.dist(ln_pFst) # linearized FST
# Run the Mantel test
set.seed(777)
mantel.rtest(GEOdist, GENdist, nrepet = 9999)
#Monte-Carlo test
#Call: mantelnoneuclid(m1 = m1, m2 = m2, nrepet = nrepet)
#Observation: 0.8917953
#Based on 9999 replicates
#Simulated p-value: 0.0153
#Alternative hypothesis: greater
#Std.Obs Expectation Variance
#2.678299239 -0.002933674 0.111600163
# Accept Ho: there is correlation between genetic distance and geographic distance
# Plot the result
set.seed(777)
ibd <- mantel.rtest(GEOdist, GENdist, nrepet = 9999)
plot(ibd)
# To get the slope you can run a linear model:
summary(lm(GENdist~GEOdist))
lm_2M <- lm(GENdist~GEOdist) # Save lm for plotting.
# Convert dist objects into matrixes for plotting.
geN <- as.matrix(GENdist)
geO <- as.matrix(GEOdist)
# "Melt" the matrices to a three column format and save them in a dataframe
library(reshape2)
split_geN <- setNames(reshape2::melt(geN), c('rows', 'vars', 'FstDist'))
split_geO <- setNames(reshape2::melt(geO), c('rows', 'vars', 'GeoDist'))
df_geOgeN <- cbind.data.frame(split_geN, split_geO, stringsAsFactors=FALSE)
# Get the equation, R-squared adjusted, and significance values as characters
# Source: https://stackoverflow.com/questions/7549694/add-regression-line-equation-and-r2-on-graph
lm_eqn <- function(df, y, x){
formula = as.formula(sprintf('%s ~ %s', y, x))
m <- lm(formula, data = df);
# formating the values into a summary string to print out
# ~ give some space, but equal size and comma need to be quoted
eq <- substitute(italic(r)^2~"="~r2*","~~italic(p)~"="~italic(pvalue),
#italic(target) == a + b %.% italic(input)*","~~italic(r)^2~"="~r2*","~~italic(p)~"="~italic(pvalue),
list(#target = y,
input = x,
a = format(as.vector(coef(m)[1]), digits = 2),
b = format(as.vector(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3),
# getting the pvalue is painful
pvalue = format(summary(m)$coefficients[2,'Pr(>|t|)'], digits = 2)
)
)
as.character(as.expression(eq));
}
# Extract and remane columns
df <- df_geOgeN[, c("GeoDist", "FstDist")]
colnames(df) <- c("x", "y")
# Make the plot
p <- ggplot(df, aes(x = x, y = y)) +
geom_point() +
geom_smooth(method = 'lm', formula = y~x, linetype = "dashed", size = 0.5, color = "black") +
geom_text(x = 1450, y = 0, label = lm_eqn(df, 'y','x'), parse = TRUE, size = 6) +
xlab("Geographic distance (km)") +
#ylab("Genetic distance (FST/1-FST)") +
ylab(expression(paste("Genetic distance (", italic("F")[ST]/1-italic("F")[ST],")"))) +
theme_bw() +
theme(legend.position = "right",
panel.grid = element_blank(),
text = element_text(size = 20, colour = "black"),
panel.border = element_rect(colour = "black", fill = NA, size = 1))
#p
# Make a scatter plot to show the relationship between genetic and geographic distance
pdf("Isolation_by_distance_test_HOM_Northern_pops_NOS1-NOS2-WIE1-NES-NPT1.pdf", width = 5.95, height = 4.95)
p
dev.off()
# Reading on IBD: https://www.molecularecologist.com/2012/09/04/isolating-isolation-by-distance/
# test for IBD within population clusters identified by AMOVA or a clustering algorithm
# partial Mantel test can be used to test whether geography (and thereby IBD) contributes to apparent clustering—by
# testing to see whether an association between a matrix of cluster assignments and genetic distances disappears
# when controlled for geographic distance.
GENdist
geN
GEOdist
geO
euclidean_distance <- geO
library(vegan)
eu_dist <- vegdist(geO, method="euclidean")
mantel.partial(geN, geO, geO, method="pearson", permutations=999)