var_genes.plot.r~

library(MASS)

# Get command line arguments
if (interactive()) {
    counts = 'human.dge.txt.gz'
    prefix='test'
} else {
    args = commandArgs(trailingOnly=T)
    counts = args[[1]]
    prefix = args[[2]]
}

# Read data
count.data = read.table(counts, sep='\t', header=T, row.names=1)

# Empirical mean, var and CV
mean_emp = apply(count.data, 1, mean)
var_emp = apply(count.data, 1, var)
cv_emp = sqrt(var_emp) / mean_emp

# NB sampling
a=colSums(count.data)
size_factor =  a/ mean(a)
fit=fitdistr(size_factor, "Gamma")

# Plot results
pdf(paste(prefix, '.var_genes.pdf'), height=10, width=10)
par(mfrow=c(2,2))
hist(size_factor, 50, probability=TRUE, xlab="N_UMI/<N_UMI>")
curve(dgamma(x, shape=fit$estimate[1], rate=fit$estimate[2]),from=0, to=quantile(size_factor, 0.999), add=TRUE, col="red", main="Gamma dist fit for size factor")
text(5,0.6, paste("shape = ", round(fit$estimate[1],2)))
text(5,0.5, paste("rate = ", round(fit$estimate[2],2)))
    
# Gamma distributions of individual genes are just scaled versions. If X ~ Gamma(a,b)
# then cX ~ Gamma(a, b/c)
a_i = rep(fit$estimate[1], length(mean_emp)); names(a_i) = names(mean_emp)
b_i = fit$estimate[2] / mean_emp; names(b_i) = names(mean_emp)
mean_NB = a_i / b_i; var_NB = a_i*(1+b_i) / (b_i^2)
cv_NB = sqrt(var_NB)/mean_NB

# Plot CV vs mean    
plot(mean_emp, cv_emp, pch=16, cex=0.5, col="black", xlab="Mean Counts", ylab="CV (counts)", log="xy")
curve(sqrt(1/x), add=TRUE, col="red", log="xy", lty=2, lwd=2)
or = order(mean_NB)
lines(mean_NB[or], cv_NB[or], col="magenta", lwd=2)

# Plot diffCV
diffCV = log(cv_emp) - log(cv_NB)
hist(diffCV, 50, xlab='diffCV')
dev.off()