-
Notifications
You must be signed in to change notification settings - Fork 3
/
Splatter_Functions.R
143 lines (128 loc) · 5.41 KB
/
Splatter_Functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
Generate_param_table <- function() {
set.seed(20194)
nrep=10;
my_seeds <- round(runif(nrep*2)*10000)
my_seeds <- unique(my_seeds)
my_seeds <- my_seeds[1:nrep]
sim_params <- list(dropouts=c(NA, 1, 2, 3, 4),
propDE=c(0.01, 0.1, 0.3),
method=c("groups"),
ngroups=c(2,5,10),
ngenes=c(1000, 2000, 5000),
seeds=my_seeds[1:4])
param_table <- data.frame(
dropouts=rep(sim_params$dropouts, times=length(sim_params$ngroups)*length(sim_params$method)*length(sim_params$seeds)),
ngroups=rep(sim_params$ngroups, times=length(sim_params$dropouts)*length(sim_params$method)*length(sim_params$seeds)),
method=rep(sim_params$method, times=length(sim_params$dropouts)*length(sim_params$ngroups)*length(sim_params$seeds)),
seed=rep(sim_params$seeds, each=length(sim_params$dropouts)*length(sim_params$method)*length(sim_params$ngroups))
)
set.seed(3819)
param_table$propDE <- sample(sim_params$propDE, nrow(param_table), replace=TRUE)
param_table$ngenes <- sample(sim_params$ngenes, nrow(param_table), replace=TRUE)
param_table$ncells <- rep(1000, times=nrow(param_table));
return(param_table)
}
# Fixed an error in this function 6 Dec 2018
Sim_w_splatter <- function(ngenes=1000, ncells=1000, ngroups=2, seed=101, dropouts=3, propDE=0.1, method="groups", nbatchs=1) {
require("splatter")
require("scater")
if (is.null(dropouts) | is.na(dropouts)) {
sim <- splatSimulate( nGenes = ngenes,
batchCells=ncells,
group.prob=rep(1/ngroups, times=ngroups),
method=method, seed=seed,
de.prob=propDE/ngroups)
} else {
sim <- splatSimulate(nGenes = ngenes,
batchCells=ncells,
group.prob=rep(1/ngroups, times=ngroups),
method=method, seed=seed,
dropout.present=TRUE, dropout.mid=dropouts,
de.prob=propDE/ngroups)
}
sim <- normalise(sim)
return(sim);
}
check_multiDE_accuracy_splatter <- function(sim, mat_name="logcounts", mag_centile=NULL, norm=FALSE) {
require("Hmisc")
source("/nfs/users/nfs_t/ta6/MAGIC/CTP_Functions.R")
require("scater")
require("Matrix")
# Ground Truth
de_cols <- grep("DEFac", colnames(rowData(sim)))
de_tab <- rowData(sim)[,de_cols]
high <- apply(de_tab, 1, max)
low <- apply(de_tab, 1, min)
is.de <- (high-low) != 0
colnames(de_tab) <- sub("DEFac", "", colnames(de_tab))
de_rows <- which(is.de)
# make paths more divergent by only considering the tips
if (grepl("Path",sim$Group[1])){
sim <- sim[,sim$Step > 50]
}
this_mat <- assays(sim)[[mat_name]]
if (norm) {
sf <- Matrix::colSums(this_mat)
this_mat <- t( t(this_mat)/sf*median(sf) )
}
# Non-parametric DE using Kruskal-Wallis test
p_values <- apply(this_mat, 1, function(x) { kruskal.test(x, factor(sim$Group))$p.value})
p_values[is.na(p_values)] <- 1;
# FDR multiple testing correction
q_values <- p.adjust(p_values, method="fdr")
# Observed FC
lvls <- my_row_mean_aggregate(this_mat, factor(sim$Group)) # Obs mean expression by group
mag <- apply(lvls, 1, max) - apply(lvls, 1, min)
# Check direction
# Truth
de_up <- high > 1
top <- sapply(which(de_up), function(i){which(unlist(de_tab[i,]) == high[i])})
de_dn <- low < 1
bottom <- sapply(which(de_dn), function(i){which(unlist(de_tab[i,]) == low[i])})
# Observed
high_obs <- apply(lvls, 1, max)
low_obs <- apply(lvls, 1, min)
top_obs <- sapply(which(de_up), function(i){
out <- which(unlist(lvls[i,]) == high_obs[i])
if (length(out) > 1) { # If ties for highest obs group
if (top[i] %in% out) { return(top[i])}
else {return(out[1])}
} else { return(out)}
})
bottom_obs <- sapply(which(de_dn), function(i){
out <- which(unlist(lvls[i,]) == low_obs[i])
if (length(out) > 1) { # If ties for lowest obs group
if (bottom[i] %in% out) {return(bottom[i])}
else {return(out[1])}
} else {return(out)}
})
# Deal with if both up & down
dir.correct <- rep(TRUE, times=length(is.de))
dir.correct[de_up] <- dir.correct[de_up] & top == top_obs # Is the group with highest obs expression also the group with true highest expression?
dir.correct[de_dn] <- dir.correct[de_dn] & bottom == bottom_obs # Is the group with lowest obs expression also the group with true lowest expression?
thresh <- 0.05 #significance
mag_thresh <- 0 #magnitude
if (!is.null(mag_centile)) {
mag_thresh <- quantile(mag, probs=1-mag_centile)
}
TP <- sum(q_values < thresh & is.de & dir.correct & mag > mag_thresh, na.rm=T)
FP <- sum(q_values < thresh & mag > mag_thresh, na.rm=T) - TP
FN <- sum( (q_values >= thresh | mag <= mag_thresh) & is.de, na.rm=T)
TN <- sum( (q_values >= thresh | mag <= mag_thresh), na.rm=T) - FN
# Get examples of the false positives
FPs <- which( !is.de & q_values < thresh, arr.ind=T )
if ( sum(!is.de & q_values < thresh) > 100) {
FPs_p <- q_values[FPs]
FPs_r <- mag[FPs]
most_FP <- which(abs(FPs_r) >= quantile(abs(FPs_r), prob=1-100/length(FPs_r)))
FPs <- data.frame(gene=FPs, p=FPs_p, r=FPs_r)
FPs <- FPs[most_FP,]
}
wrong_dir <- which(q_values < thresh & is.de & ! dir.correct)
if (length(wrong_dir) > 0) {
wrong_dir <- data.frame(wrong_dir, q_values[wrong_dir], mag[wrong_dir])
}
# Collect results for making an ROC curve
ROC_info <- data.frame(q.values=q_values, magnitude=mag, dir.correct=dir.correct, truth=is.de)
return(list(stats=c(TP, FP, TN, FN), wrongway=wrong_dir, mostfp=FPs, ROC.info=ROC_info))
}