-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathampliconCov.Rmd
executable file
·191 lines (173 loc) · 9.99 KB
/
ampliconCov.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
---
title: "Amplicon Coverage"
params:
analysisDirFP: null
runID: null
seqDirFP: null
pacbamDir: "pacbam"
pacbamFileSuffix: ".primertrim.sorted.pileup"
ampBED: null
ampMinCov: 30
ampMeanCov: 100
ampMaxNFail: 5
ampMinNPass: 93
---
```{r ampliconCov-comments, include=FALSE}
# There's still a lot of clean up and optimization to be done on this file. Big loops need to be broken up, repeated blocks put into functions, etc.
# Note that turning on cache in block amp_cov is useful when testing on single run, but you have to turn it off when running operationally and delete any _cache directories.
```
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(formattable)
library(DT)
# stop summarise messages https://rstats-tips.net/2020/07/31/get-rid-of-info-of-dplyr-when-grouping-summarise-regrouping-output-by-species-override-with-groups-argument/
options(dplyr.summarise.inform = FALSE)
# Note that Rmd files use the location of the Rmd file as their working directory during knitting. This causes problems when trying to unambiguously locate external dependencies that aren't in the same directory. Setting working directory below to address.
knitr::opts_knit$set(root.dir = params$analysisDirFP)
```
#### Per Amplicon Coverage Summary
The table below lists the number of amplicons per sample or control with coverage statistics passing or failing coverage thresholds. To pass, amplicons must have an average depth of coverage of >= `r params$ampMeanCov`x across their length and a minimum coverage of >= `r params$ampMinCov`x at all nucleotide positions. Samples are listed in the first table and controls in the second. Desirable values are highlighted in green (>= `r params$ampMinNPass`/98 amplicons passing for samples, <= `r params$ampMaxNFail`/98 amplicons passing for controls) and undesirable values in bold red (> `r params$ampMaxNFail`/98 amplicons failing for samples, < `r params$ampMinNPass`/95 amplicons failing for controls). The report column links to detailed plots of the coverage for each amplicon for a given sample or control.
```{r read_bed, include=FALSE}
# read in the amplicon bed file to get the amplicon coordinates
ampInfo <- read_tsv(params$ampBED, col_names = FALSE)
colnames(ampInfo) <- c("REF", "START", "END", "AMP_ID", "POOL", "TRASH")
ampInfo <- select(ampInfo, -TRASH, -REF)
```
```{r amp_cov, include=FALSE}
# Read in the names of all of the files; note cache=TRUE for this block because it takes a long time to read everything in.
pbFiles <- list.files(path = params$pacbamDir,
full.names = TRUE,
recursive = TRUE)
# Now filter it down to just the pileups and sort by even or odd pools
pbFilesPU_even <- vector()
pbFilesPU_odd <- vector()
for (f in 1:length(pbFiles)) {
if (str_detect(pbFiles[f], "pileup") == TRUE && str_detect(pbFiles[f], "even") == TRUE) {
pbFilesPU_even <- append(pbFilesPU_even, pbFiles[f])
} else {
if (str_detect(pbFiles[f], "pileup") == TRUE && str_detect(pbFiles[f], "odd") == TRUE) {
pbFilesPU_odd <- append(pbFilesPU_odd, pbFiles[f])
}
}
}
# Read it all in and format
keepers <- tibble()
colnames(keepers) <- c("pos", "cov", "SampleID", "SampleGroup", "Amplicon", "UnifiedScale")
for (r in 1:nrow(ampInfo)) { # this gives row index for ampInfo, i.e. a single amplicon
# Store some info from ampInfo in convenience variables
start <- ampInfo[r,]$START
end <- ampInfo[r,]$END
pool <- ampInfo[r,]$POOL
ampID <- ampInfo[r,]$AMP_ID
# Loop over even and odd pools separately. At some point make a proper function instead of repeating code blocks.
if (pool == 1) {
for (f in 1:length(pbFilesPU_odd)) { # this gives the index of a file in the odd list
tempPU <- read_tsv(pbFilesPU_odd[f]) %>%
dplyr::filter(pos %in% seq(start, end)) %>%
dplyr::select(pos, cov) %>%
dplyr::mutate(SampleID = str_replace(pbFilesPU_odd[f], "^.*/odd/", "")) %>%
dplyr::mutate(SampleID = str_replace(SampleID, params$pacbamFileSuffix, "")) %>%
dplyr::mutate(SampleGroup = stringr::str_replace(SampleID, "-A(1|2|3|4|5|6|7|8)$", "")) %>%
dplyr::mutate(Amplicon = ampID) %>%
dplyr::arrange(pos)
tempPU <- dplyr::bind_cols(tempPU, seq(1, nrow(tempPU)))
colnames(tempPU) <- c("pos", "cov", "SampleID", "SampleGroup", "Amplicon", "UnifiedScale")
keepers <- bind_rows(keepers, tempPU)
}
} else {
if (pool == 2) {
for (f in 1:length(pbFilesPU_even)) { # this gives the index of a file in the even list
tempPU <- read_tsv(pbFilesPU_even[f]) %>%
dplyr::filter(pos %in% seq(start, end)) %>%
dplyr::select(pos, cov) %>%
dplyr::mutate(SampleID = str_replace(pbFilesPU_even[f], "^.*/even/", "")) %>%
dplyr::mutate(SampleID = str_replace(SampleID, params$pacbamFileSuffix, "")) %>%
dplyr::mutate(SampleGroup = stringr::str_replace(SampleID, "-A(1|2|3|4|5|6|7|8)$", "")) %>%
dplyr::mutate(Amplicon = ampID) %>%
dplyr::arrange(pos)
tempPU <- dplyr::bind_cols(tempPU, seq(1, nrow(tempPU)))
colnames(tempPU) <- c("pos", "cov", "SampleID", "SampleGroup", "Amplicon", "UnifiedScale")
keepers <- bind_rows(keepers, tempPU)
}
}
}
}
```
```{r cov_summary, echo=FALSE, warning=FALSE, message=FALSE}
# Summarize first by SampleID and Amplicon
keepers_sum <- group_by(keepers, SampleID, Amplicon) %>%
summarise(Coverage.Mean = mean(cov),
Coverage.Min = min(cov),
Coverage.Max = max(cov)) %>%
mutate(Mean.Pass = if_else(Coverage.Mean >= params$ampMeanCov, "PASS", "FAIL"),
Min.Pass = if_else(Coverage.Min >= params$ampMinCov, "PASS", "FAIL"))
# Now summarize above summary by SampleID
cov_sum <- group_by(keepers_sum, SampleID) %>%
summarise('Count Amplicons Passing' = sum(Mean.Pass == "PASS" & Min.Pass == "PASS"),
'Count Amplicons Failing' = sum(Mean.Pass == "FAIL" | Min.Pass == "FAIL"))
# Add a column with hyperlinks to the detailed report pages.
cov_sum <- mutate(cov_sum,
Report = paste0('<a href="subpages/',
SampleID,
'-amplicon-detail.html">report</a>'))
# Divide up the samples and controls
cov_sum_samples <- filter(cov_sum, !str_detect(SampleID, "NC-"))
cov_sum_controls <- filter(cov_sum, str_detect(SampleID, "NC-"))
# Make the summary table pretty
highgood_formatter <- formatter("span",
style = x ~ style("color" = ifelse(x >= params$ampMinNPass,
"green",
ifelse(x < params$ampMinNPass,
"red",
"gray")),
"font-weight" = ifelse(x >= params$ampMinNPass,
"plain",
ifelse(x < params$ampMinNPass,
"bold",
"italic"))
))
lowgood_formatter <- formatter("span",
style = x ~ style("color" = ifelse(x <= params$ampMaxNFail,
"green",
ifelse(x > params$ampMaxNFail,
"red",
"gray")),
"font-weight" = ifelse(x <= params$ampMaxNFail,
"plain",
ifelse(x > params$ampMaxNFail,
"bold",
"italic"))
))
formattable(cov_sum_samples, list(
'Count Amplicons Passing' = highgood_formatter,
'Count Amplicons Failing' = lowgood_formatter
))
formattable(cov_sum_controls, list(
'Count Amplicons Passing' = lowgood_formatter,
'Count Amplicons Failing' = highgood_formatter
))
```
```{r spawn, include=FALSE}
#echo=FALSE, results='asis', message=FALSE, warning=FALSE}
for (s in 1:nrow(cov_sum)) {
rmarkdown::render(input = "report/temp/ampliconDetailTemplate.Rmd",
output_file = paste0(cov_sum[s,]$SampleID, "-amplicon-detail.html"),
output_dir = file.path("report", "subpages"),
params = list(covTable = filter(keepers,
SampleID == cov_sum[s,]$SampleID),
sampleID = cov_sum[s,]$SampleID,
passfailTab = filter(keepers_sum,
SampleID == cov_sum[s,]$SampleID)),
envir = new.env())
}
# for testing
# rmarkdown::render(input = "ref_docs/ampliconDetailTemplate.Rmd",
# output_file = paste0("3002228466-ZZYGJK71-A6", "-amplicon-detail.html"),
# params = list(covTable = filter(keepers,
# SampleID == "3002228466-ZZYGJK71-A6"),
# sampleID = "3002228466-ZZYGJK71-A6",
# passfailTab = filter(keepers_sum,
# SampleID == "3002228466-ZZYGJK71-A6")),
# envir = new.env())
```