-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathnormalization.DESeq.R
110 lines (79 loc) · 3.72 KB
/
normalization.DESeq.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# This script is useful for:
# normalizing samples by row scaling
##------------
## LIBRARIES
##------------
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(DESeq))
suppressPackageStartupMessages(library("optparse"))
options(stringsAsFactors=F)
pseudocount = 1e-05
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
##################
# OPTION PARSING
##################
option_list <- list(
make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze"),
make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"),
make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount),
make_option(c("-m", "--metadata"), help="tsv file with the metadata"),
make_option(c("-s", "--scale_by"), help="choose one or multiple attributes you want to scale by. Null for normalize the whole matrix")
#make_option(c("-r", "--row_first"), action="store_true", help="scale first by rows then by columns", default=FALSE),
#make_option(c("-n", "--n_iter"), type='integer', help="how many times to iterate [default=%default]", default=20)
)
parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
arguments <- parse_args(parser, positional_arguments = TRUE)
opt <- arguments$options
print(opt)
###############
# BEGIN
###############
# read options
m <- read.table(opt$input_matrix, h=T)
mdata <- read.table(opt$metadata, h=T, row.names=NULL, sep="\t")
mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(",", ".", x))
char_cols <- which(sapply(m, class) == 'character')
sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols)
if (length(char_cols) == 0) {genes = rownames(m)}
if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]}
m = (apply(m, 2, as.integer))
# apply the log if required
if (opt$log) { m = log2(replace(m, is.na(m), 0) + pseudocount) }
mdata = subset(mdata, labExpId %in% colnames(m))
# store the plot of the distribution with input matrix
gp1 = ggplot(melt(m), aes(x=value)) + geom_density() + facet_wrap(~variable)
##########
# EQUILIBRATION
##########
DESeq_norm = function(matr) {
condition = rep('group',ncol(matr))
cds = newCountDataSet(matr, condition)
cds = estimateSizeFactors(cds)
return(counts(cds, normalized=T))
}
new_m = m
# scale the whole matrix if no value is provided
if (is.null(opt$scale_by)) {
new_m = DESeq_norm(new_m)
}
# scale the sub-matrices defined the scale_by option
if (!is.null(opt$scale_by)) {
scale_by <- strsplit(opt$scale_by, ",")[[1]]
if (length(scale_by) != 1){
ids = apply(data.frame(unique(mdata[, scale_by])), 1, function(x) unique(merge(t(as.data.frame(x)), mdata, by=scale_by)$labExpId ))}
if (length(scale_by) == 1){
ids = sapply(unique(mdata[, scale_by]), function(x) unique(mdata[ mdata[,scale_by] == x,]$labExpId))}
# apply normalization
if (length(scale_by) != 1){for (i in 1:length(ids)) { new_m[, ids[[i]]] <- DESeq_norm(new_m[,ids[[i]]])} }
if (length(scale_by) == 1){for (i in 1:ncol(ids)) { new_m[, ids[[i]]] <- DESeq_norm(new_m[,ids[[i]]])} }
}
if (is.null(opt$scale_by)) {gp2 = ggplot(melt(new_m), aes(x=value)) + geom_density() + facet_wrap(~Var2)}else{
gp2 = ggplot(melt(new_m), aes(x=value)) + geom_density() + facet_wrap(~variable)}
if (length(char_cols) != 0) {new_m <- cbind(genes, new_m)}
# print output
#--------------
output = sprintf('DESeq_norm.log_%s.pscnt_%s', opt$log, opt$pseudocount)
write.table(new_m, sprintf('%s.tsv',output), quote=F, sep='\t', row.names=F)
pdf(sprintf("%s.pdf", output)); gp1; gp2; dev.off()
q(save='no')