-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path01-simulation.R
184 lines (148 loc) · 5.96 KB
/
01-simulation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#########################
## STAT 8053 - Model-X ##
## Simulation script ##
## Linh Nguyen ##
## 2021-10-20 ##
#########################
#### ---- Set up ---- ####
# load libraries
libraries <- c("knockoff", # model-X knockoff
"doMC", # suggest for knockoff
"Matrix", # required for knockoff
"glmnet", # required for knockoff
"ridge") # ridge regression in n > p cases
lapply(libraries, library, character = TRUE)
#### ---- Function definitions ---- ####
# function for Benjamini-Hochberg
bh <- function(X, Y, fdr, beta) {
# arguments:
# X = nxp matrix of covariates
# Y = nx1 vector of responses
# fdr = numeric false discovery rate
# beta = px1 vector of true beta coefficients in linear model
# if n > p
if (ncol(X) < nrow(X)) {
# fit multiple regression and extract p-val
mod <- lm(Y ~ X + 0)
p.val <- summary(mod)$coefficient[, 4]
} else {
# fit ridge regression and extract p-val
mod <- summary(linearRidge(Y ~ X + 0))
mod.chosen <- mod$chosen.nPCs
mod.chosen <- mod$summaries[mod.chosen][[1]]
p.val <- mod.chosen$coefficients[, 5]
}
# B-H hits
bh_selected <- which(p.adjust(p.val, method = "BH") <= fdr)
# FDR for B-H
bh_fdr <- sum(beta[bh_selected] == 0) / max(1, length(bh_selected))
# return list of hits and fdr
return(list(bh_selected = bh_selected,
bh_fdr = bh_fdr))
}
# function for knock off procedures
ko_gaussian <- function(ns, ps,
k, amp, fdr,
reps, seed) {
# arguments:
# ns = vector of different sample sizes
# ps = vector of different numbers of covariates
# rhos = vector of different off-diag entries for Sigma
# k = numeric proportion of covariates with non-zero coefficient
# amp = numeric signal amplitude for noise = 1
# fdr = numeric false discovery rate
# reps = number of replications
# seed = seed
# returns:
# list of length (simulation times)
# results of knock-off procedures and Benjamini-Hochberg procedures
# including selected variables, fdr, run time
# using same simulated data set
# set seed
set.seed(seed)
# initialize return list
results <- vector(mode = "list", length = length(ns)*length(ps))
# create gaussian function
gaussian_knockoffs <- function(X) create.second_order(X)
# initialize simulation count and result list
i <- 0
# looping through all simulation conditions
for (n in ns) {
for (p in ps) {
# simulation count
i = i + 1
ko_time <- bh_time <- vector(mode = "numeric", length = reps)
ko_fdr <- bh_fdr <- vector(mode = "numeric", length = reps)
ko_miss <- bh_miss <- vector(mode = "numeric", length = reps)
# progress bar
pb <- txtProgressBar(max = reps,
char = paste0("sim", i),
style = 3)
for (rep in seq_len(reps)){
### SIMULATE DATA ###
# progress bar
setTxtProgressBar(pb = pb,
value = rep)
# simulate covariates
X <- matrix(rnorm(n*p), nrow = n)
# simulate responses from linear model
hit <- sample(x = p, size = p*k)
beta <- amp * (1:p %in% hit)/sqrt(n)
Y <- X %*% beta + rnorm(n) # linear model + error
### MODEL-X KNOCKOFF PROCEDURES ###
ko_start <- Sys.time()
ko_selected <- knockoff.filter(X = X, y = Y, fdr = fdr,
knockoff = gaussian_knockoffs,
offset = 0)$selected
ko_end <- Sys.time()
ko_fdr[rep] <- sum(beta[ko_selected]==0)/max(1, length(ko_selected))
ko_miss[rep] <- (length(hit) - sum(hit %in% ko_selected))/length(hit)
ko_time[rep] <- ko_end - ko_start
### BENJAMINI-HOCHBERG PROCEDURES ###
bh_start <- Sys.time()
bh_results <- bh(X = X, Y = Y, fdr = fdr, beta = beta)
bh_end <- Sys.time()
bh_selected <- bh_results$bh_selected
bh_fdr[rep] <- bh_results$bh_fdr
bh_miss[rep] <- (length(hit) - sum(hit %in% bh_selected))/length(hit)
bh_time[rep] <- bh_end - bh_start
}
# return results -> CHANGE TO AVERAGE ACROSS REPS
results[[i]] <- list(n = n,
p = p,
k = k,
amp = amp,
fdr = fdr,
ko_fdr = ko_fdr,
bh_fdr = bh_fdr,
ko_miss = ko_miss,
bh_miss = bh_miss,
ko_time = ko_time,
bh_time = bh_time)
}
} # END for ns, ps, rhos LOOPS
# return full simulation results
return(results)
}
#### ---- Simulation p > n ---- ####
# simulation conditions
ns <- c(400) # sample size
ps <- c(1000) # number of covariates
k <- 0.1 # % of covar with non-zero coef/total covar
amp <- 5 # signal amplitude
fdr <- 0.1 # false discovery rate
# simulation results
sim_results_hidim <- ko_gaussian(ns = ns, ps = ps,
k = k, amp = amp, fdr = fdr,
reps = 100, seed = 8053)[[1]]
#### ---- Simulation n > p ---- ####
# simulation conditions
ns <- c(1000) # sample size
ps <- c(400) # number of covariates
k <- 0.1 # % of covar with non-zero coef/total covar
amp <- 5 # signal amplitude
fdr <- 0.1 # false discovery rate
# simulation results
sim_results_fullrank <- ko_gaussian(ns = ns, ps = ps,
k = k, amp = amp, fdr = fdr,
reps = 100, seed = 8053)[[1]]