sum_38targets.Rmd

---
title: "Summarize Methylation Panel for Cancer Research"
author: "Kim Siegmund"
date: "10/10/2019"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(GenomicAlignments)
library(Rsamtools)
library(tidyverse)
library(writexl)
if(!require("BSgenome.Hsapiens.UCSC.hg19"))
    {BiocManager::install("BSgenome.Hsapiens.UCSC.hg19")} 
library(BSgenome.Hsapiens.UCSC.hg19)
```

# 1. Read in the .bed file for 38 Amplicons

Convert the .bed file to a GRanges object.
```{r readBedfile}
targets <- read.delim("data/Methylation_Panel_for_Cancer_Research.Designed.bed",skip=1, header=FALSE)
colnames(targets) <- c("chr","start","end","unkn","score","strand","mis","name")
targets <- targets[targets$chr!="Lambda",]

WC_strand <- substr(targets$name,11,11)
table(WC_strand)
# replace strand with WC_strand
#targets$strand <- ifelse(WC_strand=="W","+","-")
#table(WC_strand,targets$strand)

ir <- IRanges(start=targets$start,
                end=targets$end)
targetgr <- GRanges(seqnames=targets$chr,strand=targets$strand,
                        ranges=ir)
values(targetgr) <- DataFrame(WC_strand=WC_strand)
save(targetgr,file="data/targetgr.rda")
```

# 2. Find sequences for Amplicons

Grab the reference sequences for the amplicons from hg19. Then let's report for each amplicon the number of CpGs and the GC-content. To count the CGs, I need to use the sequences + 1 base 3' of amplicon (for possible G 3' of C in end position). 
```{r countcgs}
vi = suppressWarnings(Views(Hsapiens,targetgr))
targetseqs <- as(vi,"DNAStringSet")

#compute GC content from this
basefreq=oligonucleotideFrequency(targetseqs,width=1)
head(basefreq)
gcratio <- rowSums(basefreq[,c("C","G")])/width(targetseqs)

#to count CpGs, I need to include 1 base 3' of amplicon.
targetgrplus1 <- targetgr
end(targetgrplus1) <- end(targetgr) + 1
head(cbind(end(targetgr),end(targetgrplus1)))
vi = suppressWarnings(Views(Hsapiens,targetgrplus1))
targetseqsplus1 <- as(vi,"DNAStringSet")

chartable <- cbind.data.frame(region = 1:38,
                  seqnames = as.vector(seqnames(targetgr)),
                  start = start(ranges(targetgr)),
                  end = end(ranges(targetgr)),
                  width = width(ranges(targetgr)),
                  nCpGs = dinucleotideFrequency(targetseqsplus1)[,'CG'],
                  gcratio = round(gcratio*100,1),
                  ampliconseq = targetseqs,
                  info = str_trim(targets$name)
                  )
#chartable
```

Let's save the summary data on these regions in an Excel file, and the target sequences for calling haplotypes later.
```{r savefiles}
write_xlsx(chartable,path ="data/summary-38-amplicons.xlsx")
save(targetseqs,file="data/targetseqs.rda")
```

# 3. Find the positions of CpGs in each amplicon

I'll save these to a list object for later use.
```{r cgfind}
cgpos <- vector(mode = "list", length = 38)
for (i in 1:38) 
  {ts <- targetseqsplus1[i]
  dn <- BStringSet(toString(ts), end=2:width(ts), width=2)
  cgpos[[i]] <- which(dn=="CG")
}
save(cgpos,file="data/cgpos.rda")
```

# 4. Find the positions of CpHs in each amplicon

I'll save these to a list object for later use.
```{r chfind}
Ws <- which(values(targetgrplus1)$WC_strand=="W")
Cs <- which(values(targetgrplus1)$WC_strand=="C")

# These I use for clipping and I apply this to all targets, both W and C.
cposWS <- vector(mode = "list", length = 38)
for (i in 1:38) 
  {ts <- targetseqsplus1[i]
  dn <- BStringSet(toString(ts), end=2:width(ts), width=2)
  cposWS[[i]] <- c(which(dn=="CA"),which(dn=="CC"),which(dn=="CT"),which(dn=="CG"))
  cposWS[[i]] <- sort(cposWS[[i]])
}

chposWS <- vector(mode = "list", length = length(Ws))
for (i in 1:length(Ws)) 
  {ts <- targetseqsplus1[Ws[i]]
  dn <- BStringSet(toString(ts), end=2:width(ts), width=2)
  chposWS[[i]] <- c(which(dn=="CA"),which(dn=="CC"),which(dn=="CT"))
  chposWS[[i]] <- sort(chposWS[[i]])
}

chposCS <- vector(mode = "list", length = length(Cs))
for (i in 1:length(Cs)) 
  {ts <- targetseqsplus1[Cs[i]]
  dn <- BStringSet(toString(ts), end=2:width(ts), width=2)
  chposCS[[i]] <- c(which(dn=="AG"),which(dn=="GG"),which(dn=="TG")) + 1
  chposCS[[i]] <- sort(as.integer(chposCS[[i]]))
}

cxpos <- list(cgpos,cposWS,chposWS,chposCS)
names(cxpos) <- c("cgpos","cposWS","chposWS","chposCS")
rm(cgpos,cposWS,chposWS,chposCS)
save(cxpos,file="data/cxpos.rda")
```


# 5. Number of amplicons per chromosome

```{r regions}
table(as.vector(seqnames(targetgr)))
```


```{r sI}
sessionInfo()
```