-
Notifications
You must be signed in to change notification settings - Fork 0
/
0_basic_processing.R
83 lines (62 loc) · 3.43 KB
/
0_basic_processing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# Heli Juottonen 2019-2022 heli.juottonen@alumni.helsinki.fi
# R scripts for Elsi Hietaranta's willow manuscript
# https://github.com/helijuottonen/elsiwillow
# 0: Basic processing: formatting input files for later analyses, subsampling OTU tables
# bacterial and fungal OTU tables have been produced in mothur (see manuscript for details)
# Disclaimer: I'm a self-taught R user. I try my very best to make sure that the scripts work, but I do not
# guarantee that they are correct, make sense or follow any kind of best practices. Use at your own risk!
# loading necessary packages
library(vegan) #v. 2.5-7
# reading in OTU tables (files as they come from mothur):
# bacteria
elotub <- read.table("data/pajut_bothruns.unique.good.filter.unique.precluster.pick.pick.pick.opti_mcc.shared", header=T)
# fungi
elotuf <- read.table("data/pajubothx.fasta.ITS2.precluster.pick.pick.agc.shared", header=T)
# reading in the metadata file:
elmeta <- read.csv2("data/Elsi_paju_metadata.csv", header=T, row.names=1)
# converting otu tables into dataframes
elotub.df <- as.data.frame(elotub)
elotuf.df <- as.data.frame(elotuf)
# insert sample names in the OTU tables as row names (needed in later commands):
row.names(elotub.df) <- elotub.df$Group
row.names(elotuf.df) <- elotuf.df$Group
# remove unnecessary columns in the OTU table:
# done because the first three columns (=1:3) of the shared files contain unnecessary values
# subset and -c means we are keeping else except the first 3 columns
elotub.df <- subset(elotub.df, select = -c(1:3))
elotuf.df <- subset(elotuf.df, select = -c(1:3))
# making sure the otu table and metadata contain the same samples
# this is done based on row names so otu table and metadata file must have identical formats of row names
elmetab <- elmeta[row.names(elmeta) %in% row.names(elotub.df),]
elmetaf <- elmeta[row.names(elmeta) %in% row.names(elotuf.df),]
elotub.df2 <- elotub.df[row.names(elotub.df) %in% row.names(elmetab),]
elotuf.df2 <- elotuf.df[row.names(elotuf.df) %in% row.names(elmetaf),]
# ordering otu table and metadata based on row names to make sure the order is the same
elotub.df2 <- elotub.df2[order(row.names(elotub.df2)),]
elotuf.df2 <- elotuf.df2[order(row.names(elotuf.df2)),]
elmetab <- elmetab[order(row.names(elmetab)),]
elmetaf <- elmetaf[order(row.names(elmetaf)),]
# subsampling the data (rarefying):
# defining a function for subsampling based on medians:
# picking the same number of reads from all samples: median of read numbers (=row sums)
# except samples that have less reads than the median: all reads are taken
# note:for comparing alpha diversity among samples it would be better to have the exact same number of reads
# this is a compromise because some samples have so few reads
median_raref <- function(x) {
x_sums <- rowSums(x)
x_sums2 <- replace(x_sums, x_sums>(median(x_sums)), median(x_sums))
set.seed(1732)
x.r <- rrarefy(x, x_sums2)
x.rdf <- as.data.frame(x.r)
return(x.rdf)
}
# sumsampling to median number of reads:
elotub.r.all <- median_raref(elotub.df2)
elotuf.r.all <- median_raref(elotuf.df2)
# removing OTUs with less than 5 reads overall
elotub.r <- elotub.r.all[,colSums(elotub.r.all) > 5]
elotuf.r <- elotuf.r.all[,colSums(elotuf.r.all) > 5]
# converting to relative abundances to even out the effects of median rarefaction, removal of rare OTUs
elotub.st <- decostand(elotub.r, "normalize")
elotuf.st <- decostand(elotuf.r, "normalize")
# OTU tables, metadata ready for further analyses