-
Notifications
You must be signed in to change notification settings - Fork 0
/
script.r
194 lines (156 loc) · 6.97 KB
/
script.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#NanoASV phylosequisation
#Arthur Cousson - 2023
#Contact : arthur.cousson@ird.fr
#Workaround conflicting host vs local libraries
.libPaths(c("/usr/local/lib/R/site-library","/usr/lib/R/site-library","/usr/lib/R/library"))
args <- commandArgs(trailingOnly = TRUE)
DIR <- args[1]
OUTPWD <- args[2]
R_CLEANING <- args[3]
TREE <- args[4]
METADATA <- args[5]
#Phylosequization -----
library(phyloseq)
library(dplyr) #For mget() function
library(ape) #To handle trees
metadata <- read.csv(paste0(METADATA,"/metadata.csv"), row.names = 1, header = TRUE, check.names = FALSE)
barcodes <- rownames(metadata)
ASV.tree <- read.tree(paste0(OUTPWD, "/Results/Phylogeny/ASV.tree"))
##Unknown OTUs ----
if (file.exists(paste0(OUTPWD,"/Results/Unknown_clusters/unknown_clusters.tsv"))) {
U_OTU <- read.csv(paste0(OUTPWD,"/Results/Unknown_clusters/unknown_clusters.tsv"), sep = "\t", header = T, row.names = 1)
U_OTU <- U_OTU[rowSums(U_OTU) > 5,] #Remove clusters with total abundance inferior to five
unknown_taxonomy <- rep("Unknown", times = 7) #Adapt taxonomy dataframe so it fits later on with bad entries for cleaning
###inpu %>% read_csv() %>% filter(rowSums()>5) -> U_OTU
###Unknown taxonomy ----
#Taking care of unknown taxonomy table
U_TAX <- data.frame(matrix(nrow = nrow(U_OTU), ncol = 7))
rownames(U_TAX) <- rownames(U_OTU)
colnames(U_TAX) <- c("Kingdom", "Phylum", "Class",
"Order", "Family", "Genus", "Species")
U_TAX[,] <- unknown_taxonomy
U_TAX <- data.frame(U_TAX,
other1 = NA,
other2 = NA,
other3 = NA,
other4 = NA,
other5 = NA,
other6 = NA,
others7 = NA,
others8 = NA,
others9 = NA,
others10 = NA,
others11 = NA,
others12 = NA,
others13 = NA,
others14 = NA,
others15 = NA,
others16 = NA,
others17 = NA,
others19 = NA)
}
##ASV tables ----
#Individuals ASV tables loading
temp_ASV = list.files(path = paste0(OUTPWD,"/Results/ASV/"), pattern = "*.tsv")
for (i in 1:length(temp_ASV)) {
if (file.size(paste(OUTPWD,"/Results/ASV/", temp_ASV[i], sep = "")) == 0) {
assign(temp_ASV[i], data.frame())
} else {
assign(temp_ASV[i], read.csv(file = paste(OUTPWD,"/Results/ASV/", temp_ASV[i], sep = ""),
sep = " ",
row.names = 2, header = FALSE))
}
}
temp_ASV <- mget(temp_ASV) #To get files as objects from names
names(temp_ASV) <- lapply(strsplit(names(temp_ASV), split = "_"), function(x) x[[1]])
barcodes <- names(temp_ASV)
if(file.exists(paste0(OUTPWD,"/Results/Unknown_clusters/unknown_clusters.tsv"))){
for (i in 1:ncol(U_OTU)){
for (j in 1:length(temp_ASV)){
if (colnames(U_OTU)[i] == names(temp_ASV[j])) {
colnames(temp_ASV[[j]]) <- barcodes[j]
temp_ASV[[j]] <- rbind(data.frame(temp_ASV[j]), U_OTU[i])
}
}
}
}
#The following function will deal with empty barcodes (like blanks)
for(i in 1:length(temp_ASV)) {
if (lapply(temp_ASV[i], function(df) nrow(df)) == 0) {
temp_ASV[i] <- list(data.frame(V1 = rep(0, times = 1)))
rownames(temp_ASV[[i]]) <- rownames(temp_ASV[[i-1]])[nrow(temp_ASV[[i-1]])]
}
}
for (i in 1:length(temp_ASV)){
if (rownames(temp_ASV[[i]])[1] == "*"){
rn <- rownames(temp_ASV[[i]])[-1]
temp_ASV[[i]] <- data.frame(temp_ASV[[i]][-1,], check.rows = F, check.names = F)
rownames(temp_ASV[[i]]) <- rn
}
}
for (i in 1:length(temp_ASV)) colnames(temp_ASV[[i]]) <- barcodes[i]
names(temp_ASV) <- barcodes[1:length(temp_ASV)]
##ASV Taxonomy ----
#Individual taxonomy tables loading
temp_TAX = list.files(path = paste0(OUTPWD,"/Results/Tax/"), pattern="*.csv")
for (i in 1:length(temp_TAX)) assign(temp_TAX[i], data.frame(read.csv2(file = paste(OUTPWD,"/Results/Tax/",temp_TAX[i], sep = ""), sep = ";", header = F, check.names = F, fill = TRUE,
col.names = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus",
"Species", "other1", "other2", "other3", "other4", "other5", "other6",
"others7", "others8", "others9", "others10", "others11", "others12", "others13", "others14", "others15", "others16", "others17", "others19"))))
#The reason we allow so many fields is that Chloroplasts and Mitochondria (mainly), have non-coherent taxonomy sizes compared to Bacterial full taxonomy.
#To avoid any "Out of field" errors or alike, we allow them to exist in a first time
temp_TAX <- mget(temp_TAX)
for(i in 1: length(temp_TAX)){
if (lapply(temp_TAX[i], function(df) nrow(df)) != 0) {#This step allows to correct for Kingdom, which is currently merged with SILVA ID
rownames(temp_TAX[[i]]) <- sapply(strsplit(temp_TAX[[i]][,1], split = " "), "[[", 1)
temp_TAX[[i]][,1] <- sapply(strsplit(temp_TAX[[i]][,1], split = " "), "[", 2)
if(file.exists(paste0(OUTPWD,"/Results/Unknown_clusters/unknown_clusters.tsv"))){
temp_TAX[[i]] <- rbind(temp_TAX[[i]], U_TAX)
}
}
}
for(i in 1: length(temp_TAX)) {
if (nrow(temp_TAX[[i]]) == 0) {
temp_TAX[[i]][1,] <- temp_TAX[[i-1]][1,]
rownames(temp_TAX[[i]]) <- rownames(temp_ASV[[i-1]])[length(rownames(temp_ASV[[i-1]]))]
}
}
temp_phyloseq <- barcodes
##Phyloseq objects ----
physeq_list <- list()
for (i in 1:length(temp_ASV)) {
# Create the phyloseq object
physeq_object <- phyloseq(otu_table(temp_ASV[[i]], taxa_are_rows = TRUE),
tax_table(as.matrix(temp_TAX[[i]])),
sample_data(metadata))
# Get the name from the barcodes vector
barcode_name <- barcodes[i]
# Assign the phyloseq object to the list with the dynamic name
physeq_list[[barcode_name]] <- physeq_object
}
i<-1 #Reset the incrementation
#Initialize the phyloseq object
NanoASV <- merge_phyloseq(physeq_list[[i]], physeq_list[[i + 1]])
# If more than 2 samples, then, adding them all together
if (length(physeq_list) > 2) {
for (i in 3:length(physeq_list)) {
NanoASV <- merge_phyloseq(NanoASV, physeq_list[[i]])
}
}
##Phylogeny ----
if(TREE == 1){
phy_tree(NanoASV) <- phy_tree(ASV.tree)
}
##Dataset cleaning ----
#Delete bad entries such as Eukaryota, Cyanobacteria and Archea if any
if(R_CLEANING == 1){
NanoASV <- subset_taxa(NanoASV, Kingdom != "Eukaryota")
NanoASV <- subset_taxa(NanoASV, Family != "Mitochondria")
NanoASV <- subset_taxa(NanoASV, Order != "Chloroplast")
}
##Taxonomy cleaning ----
#After those functions, there is no more taxa with fucked up names so we can remove supp fields of taxa table
tax_table(NanoASV) <- tax_table(NanoASV)[,1:7]
#Phyloseq export ----
print("Exporting phyloseq object")
save(NanoASV, file = paste0(OUTPWD,"/Results/Rdata/NanoASV.rdata"))