-
Notifications
You must be signed in to change notification settings - Fork 1
/
sum_38targets.Rmd
141 lines (116 loc) · 4.41 KB
/
sum_38targets.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
---
title: "Summarize Methylation Panel for Cancer Research"
author: "Kim Siegmund"
date: "10/10/2019"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(GenomicAlignments)
library(Rsamtools)
library(tidyverse)
library(writexl)
if(!require("BSgenome.Hsapiens.UCSC.hg19"))
{BiocManager::install("BSgenome.Hsapiens.UCSC.hg19")}
library(BSgenome.Hsapiens.UCSC.hg19)
```
# 1. Read in the .bed file for 38 Amplicons
Convert the .bed file to a GRanges object.
```{r readBedfile}
targets <- read.delim("data/Methylation_Panel_for_Cancer_Research.Designed.bed",skip=1, header=FALSE)
colnames(targets) <- c("chr","start","end","unkn","score","strand","mis","name")
targets <- targets[targets$chr!="Lambda",]
WC_strand <- substr(targets$name,11,11)
table(WC_strand)
# replace strand with WC_strand
#targets$strand <- ifelse(WC_strand=="W","+","-")
#table(WC_strand,targets$strand)
ir <- IRanges(start=targets$start,
end=targets$end)
targetgr <- GRanges(seqnames=targets$chr,strand=targets$strand,
ranges=ir)
values(targetgr) <- DataFrame(WC_strand=WC_strand)
save(targetgr,file="data/targetgr.rda")
```
# 2. Find sequences for Amplicons
Grab the reference sequences for the amplicons from hg19. Then let's report for each amplicon the number of CpGs and the GC-content. To count the CGs, I need to use the sequences + 1 base 3' of amplicon (for possible G 3' of C in end position).
```{r countcgs}
vi = suppressWarnings(Views(Hsapiens,targetgr))
targetseqs <- as(vi,"DNAStringSet")
#compute GC content from this
basefreq=oligonucleotideFrequency(targetseqs,width=1)
head(basefreq)
gcratio <- rowSums(basefreq[,c("C","G")])/width(targetseqs)
#to count CpGs, I need to include 1 base 3' of amplicon.
targetgrplus1 <- targetgr
end(targetgrplus1) <- end(targetgr) + 1
head(cbind(end(targetgr),end(targetgrplus1)))
vi = suppressWarnings(Views(Hsapiens,targetgrplus1))
targetseqsplus1 <- as(vi,"DNAStringSet")
chartable <- cbind.data.frame(region = 1:38,
seqnames = as.vector(seqnames(targetgr)),
start = start(ranges(targetgr)),
end = end(ranges(targetgr)),
width = width(ranges(targetgr)),
nCpGs = dinucleotideFrequency(targetseqsplus1)[,'CG'],
gcratio = round(gcratio*100,1),
ampliconseq = targetseqs,
info = str_trim(targets$name)
)
#chartable
```
Let's save the summary data on these regions in an Excel file, and the target sequences for calling haplotypes later.
```{r savefiles}
write_xlsx(chartable,path ="data/summary-38-amplicons.xlsx")
save(targetseqs,file="data/targetseqs.rda")
```
# 3. Find the positions of CpGs in each amplicon
I'll save these to a list object for later use.
```{r cgfind}
cgpos <- vector(mode = "list", length = 38)
for (i in 1:38)
{ts <- targetseqsplus1[i]
dn <- BStringSet(toString(ts), end=2:width(ts), width=2)
cgpos[[i]] <- which(dn=="CG")
}
save(cgpos,file="data/cgpos.rda")
```
# 4. Find the positions of CpHs in each amplicon
I'll save these to a list object for later use.
```{r chfind}
Ws <- which(values(targetgrplus1)$WC_strand=="W")
Cs <- which(values(targetgrplus1)$WC_strand=="C")
# These I use for clipping and I apply this to all targets, both W and C.
cposWS <- vector(mode = "list", length = 38)
for (i in 1:38)
{ts <- targetseqsplus1[i]
dn <- BStringSet(toString(ts), end=2:width(ts), width=2)
cposWS[[i]] <- c(which(dn=="CA"),which(dn=="CC"),which(dn=="CT"),which(dn=="CG"))
cposWS[[i]] <- sort(cposWS[[i]])
}
chposWS <- vector(mode = "list", length = length(Ws))
for (i in 1:length(Ws))
{ts <- targetseqsplus1[Ws[i]]
dn <- BStringSet(toString(ts), end=2:width(ts), width=2)
chposWS[[i]] <- c(which(dn=="CA"),which(dn=="CC"),which(dn=="CT"))
chposWS[[i]] <- sort(chposWS[[i]])
}
chposCS <- vector(mode = "list", length = length(Cs))
for (i in 1:length(Cs))
{ts <- targetseqsplus1[Cs[i]]
dn <- BStringSet(toString(ts), end=2:width(ts), width=2)
chposCS[[i]] <- c(which(dn=="AG"),which(dn=="GG"),which(dn=="TG")) + 1
chposCS[[i]] <- sort(as.integer(chposCS[[i]]))
}
cxpos <- list(cgpos,cposWS,chposWS,chposCS)
names(cxpos) <- c("cgpos","cposWS","chposWS","chposCS")
rm(cgpos,cposWS,chposWS,chposCS)
save(cxpos,file="data/cxpos.rda")
```
# 5. Number of amplicons per chromosome
```{r regions}
table(as.vector(seqnames(targetgr)))
```
```{r sI}
sessionInfo()
```