-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvar_genes.plot.r~
50 lines (42 loc) · 1.6 KB
/
var_genes.plot.r~
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
library(MASS)
# Get command line arguments
if (interactive()) {
counts = 'human.dge.txt.gz'
prefix='test'
} else {
args = commandArgs(trailingOnly=T)
counts = args[[1]]
prefix = args[[2]]
}
# Read data
count.data = read.table(counts, sep='\t', header=T, row.names=1)
# Empirical mean, var and CV
mean_emp = apply(count.data, 1, mean)
var_emp = apply(count.data, 1, var)
cv_emp = sqrt(var_emp) / mean_emp
# NB sampling
a=colSums(count.data)
size_factor = a/ mean(a)
fit=fitdistr(size_factor, "Gamma")
# Plot results
pdf(paste(prefix, '.var_genes.pdf'), height=10, width=10)
par(mfrow=c(2,2))
hist(size_factor, 50, probability=TRUE, xlab="N_UMI/<N_UMI>")
curve(dgamma(x, shape=fit$estimate[1], rate=fit$estimate[2]),from=0, to=quantile(size_factor, 0.999), add=TRUE, col="red", main="Gamma dist fit for size factor")
text(5,0.6, paste("shape = ", round(fit$estimate[1],2)))
text(5,0.5, paste("rate = ", round(fit$estimate[2],2)))
# Gamma distributions of individual genes are just scaled versions. If X ~ Gamma(a,b)
# then cX ~ Gamma(a, b/c)
a_i = rep(fit$estimate[1], length(mean_emp)); names(a_i) = names(mean_emp)
b_i = fit$estimate[2] / mean_emp; names(b_i) = names(mean_emp)
mean_NB = a_i / b_i; var_NB = a_i*(1+b_i) / (b_i^2)
cv_NB = sqrt(var_NB)/mean_NB
# Plot CV vs mean
plot(mean_emp, cv_emp, pch=16, cex=0.5, col="black", xlab="Mean Counts", ylab="CV (counts)", log="xy")
curve(sqrt(1/x), add=TRUE, col="red", log="xy", lty=2, lwd=2)
or = order(mean_NB)
lines(mean_NB[or], cv_NB[or], col="magenta", lwd=2)
# Plot diffCV
diffCV = log(cv_emp) - log(cv_NB)
hist(diffCV, 50, xlab='diffCV')
dev.off()