-
Notifications
You must be signed in to change notification settings - Fork 0
/
DESeq.Rmd
125 lines (95 loc) · 3.51 KB
/
DESeq.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
---
title: "DESeq"
output: html_document
---
## Loading packages
```{r}
#if installation is need, uncomment the following
#if (!require("DESeq2")) install.packages("DESeq2"); library(DESeq2)
#if (!require("apeglm")) install.packages("apeglm"); library(apeglm)
#if (!require("ggplot2")) install.packages("ggplot2"); library(ggplot2)
#if (!require("pheatmap")) install.packages("pheatmap"); library(pheatmap)
library(DESeq2)
library(apeglm)
library(ggplot2)
library(pheatmap)
```
## Loading in the files
```{r}
#load count matrix and reference
#tximport summerization of counts = 'intALL.csv'
#'ref.txt' is the reference file needed by deseq
#data <- read.csv("intAll.csv", header=T, row.names = 1)
#info <- read.table("ref.txt", header = T, sep ="\t")
data <- read.csv("roundedMergedCountFile.csv", header=T, row.names = 1)
info <- read.table("refFile.txt", header = T, sep ="\t")
head(data)
```
## Running DESeq
```{r}
de <- DESeqDataSetFromMatrix(data, info, ~diet)
```
## Editing DESeq output
```{r}
#removes low expressed genes
keep <- rowSums(counts(de)) >= 100
de <- de[keep,]
deSeqData <- DESeq(de)
###un-comment to export normalized read count
normCounts <- counts(deSeqData, normalized = T)
write.csv(normCounts, "normal.allCounts.txm.csv")
#p value less than .05 is diff. exp.
result <- results(deSeqData, alpha = 0.05)
# order based on p adjusted value
resOrdered <- result[order(result$padj),]
###un-comment to export ordered file
##write.csv(resOrdered, "deSeq.order.csv")
```
## Editing data for plots
```{r}
#files generated above
normCount <- read.csv("normal.allCounts.txm.csv", row.names = 1)
deSeqRes <- read.csv("deSeq.order.csv", row.names = 1)
#if padj is <= .05, the value is significant
deSeqRes$Significant <- ifelse(deSeqRes$padj <= 0.05, "Yes", "No")
#taking out any na values
deSeqRes <- na.omit(deSeqRes)
head(deSeqRes)
```
##Creating Volcano plot
```{r}
#baseMean is the normalized count values, dividing by size factor, taken over all samples.. use log bc it needs to be put on a log scale
#log2foldchage is the effective size estimate
#ggplot(deSeqRes, aes( x = log10(baseMean), y = log2FoldChange, color = Significant)) + geom_point()
#volcano plot..
```
```{r}
ggplot(deSeqRes, aes(x = log2FoldChange, y = -log10(padj), color = Significant)) + geom_point() #+ scale_color_brewer(palette = "YlGn")
```
## Creating heatmap
```{r}
#editing data for heat map
signi <- subset(deSeqRes, padj <= 0.05)
allSig <- merge(normCount, signi, by = 0)
sigCounts <- allSig[,2:13]
row.names(sigCounts) <- allSig$Row.names
#creating heatmap
#log2 looks at exponents instead of raw numbers, is used to normalize the data .. if there is a 0 it wont work so you have to do +1
#scale compares expression within a col/row... finds median read count of a row/col.. doesn't look at raw numbers
pheatmap(log2(sigCounts + 1), scale = "row", show_rownames = F, treeheight_row = 0, treeheight_col = 50) #, color = hcl.colors(50, "BluYl"))
```
## Looking at the outliers from Volcano plot
```{r}
#adding in column to show -log10(padj) values that are seen on the volcano plot above
deSeqRes$neglog10 <- (-log10(deSeqRes$padj))
##neglog10 values above 10 are observed to be outliers according to the volcano plot
head(deSeqRes)
```
## Plotting just the outliers
```{r}
signi <- subset(deSeqRes, neglog10 >= 10)
allSig <- merge(normCount, signi, by = 0)
sigCounts <- allSig[,2:13]
row.names(sigCounts) <- allSig$Row.names
pheatmap(log2(sigCounts + 1), scale = "row", show_rownames = F,treeheight_row = 0) #,color = hcl.colors(50, "YlGn"))
```