#' @title Fit a multi-response pliable lasso model over a path of regularization values
#' @description This function fits a multi-response pliable lasso model over a path of regularization values.
#' @param X N by p matrix of predictors
#' @param Z N by K matrix of modifying variables. The elements of Z may represent quantitative or categorical variables, or a mixture of the two.
#' Categorical variables should be coded by 0-1 dummy variables: for a k-level variable, one can use either k or k-1 dummy variables.
#' @param y N by D matrix of responses. The X and Z variables are centered in the function. We recommend that X and Z also be standardized before the call.
#' @param maxgrid number of lambda_3 values desired (default 50)
#' @param nlambda number of lambda_3 values desired (default 50). Similar to maxgrid but can have a value less than or equal to maxgrid.
#' @param alpha mixing parameter (default 0.5). When the goal is to include more interactions, alpha should be very small, and vice versa.
#' @param max_it maximum number of iterations in the ADMM algorithm for one lambda. Default 50000
#' @param rho the augmented Lagrangian parameter for the ADMM (default 5). This value is updated during the ADMM call based on a certain condition.
#' @param e.abs absolute error for the ADMM. Default is 1E-3
#' @param e.rel relative error for the ADMM. Default is 1E-3
#' @param gg penalty term for the tree structure. This is a 2x2 matrix, with the first row giving the maximum and minimum values for lambda_1 and the second row giving the maximum and minimum values for lambda_2. In the current setting, we set the maximum and the minimum to be the same because cross-validation is not carried out across lambda_1 and lambda_2. However, setting different values will work during the model fit.
#' @param my_lambda user specified lambda_3 values. Default NULL
#' @param lambda_min the smallest value for lambda_3, as a fraction of max(lambda_3), the data-derived (lammax) entry value (i.e. the smallest value for which all coefficients are zero). Default is 0.001 if N > p, and 0.01 if N < p.
#' @param my_print Should information from each ADMM iteration be printed along the way? Default FALSE. This prints the dual and primal residuals.
#' @param alph an overrelaxation parameter in \[1, 1.8\]. Default 1.8. The implementation is borrowed from Stephen Boyd's \href{https://stanford.edu/~boyd/papers/admm/lasso/lasso.html}{MATLAB code}
#' @param tree The results from the hierarchical clustering of the response matrix. The easiest way to obtain this is by using the function (tree_parms), which gives a default clustering. However, the user can decide on a specific structure and then input a tree that follows such a structure.
#' @param parallel should parallel processing be used? Default TRUE. If set to TRUE, pal should be set to 0.
#' @param pal Should the lapply function be applied for an alternative, quicker optimization when no parallel package is available? Default is 0.
#' @param tol threshold for the non-zero coefficients. Default 1E-4
#' @param cl The number of CPUs to be used for parallel processing. Default 4
#' @param legacy If \code{TRUE}, use the R version of the algorithm. Defaults to
#' C++.
#' @return An object of class MADMMplasso with the following components:
#' path: a table containing the summary of the model for each lambda_3.
#'
#' beta0: a list (length = nlambda) of estimated beta_0 coefficients, each of dimension 1 by ncol(y)
#'
#' beta: a list (length = nlambda) of estimated beta coefficients, each a matrix of dimension ncol(X) by ncol(y)
#'
#' BETA_hat: a list (length = nlambda) of estimated beta and theta coefficients, each a matrix of dimension (ncol(X) + ncol(X)*ncol(Z)) by ncol(y)
#'
#' theta0: a list (length = nlambda) of estimated theta_0 coefficients, each a matrix of dimension ncol(Z) by ncol(y)
#'
#' theta: a list (length = nlambda) of estimated theta coefficients, each an array of dimension ncol(X) by ncol(Z) by ncol(y)
#'
#' Lambdas: values of lambda_3 used
#'
#' non_zero: number of nonzero betas for each model in path
#'
#' LOSS: sum of squared errors for each model in path
#'
#' Y_HAT: a list (length = nlambda) of predicted responses, each of dimension nrow(X) by ncol(y)
#'
#' gg: penalty terms for the tree structure (lambda_1 and lambda_2) for each lambda_3, an nlambda by 2 matrix
#' @example inst/examples/MADMMplasso_example.R
#' @export
MADMMplasso <- function(X, Z, y, alpha, my_lambda = NULL, lambda_min = 0.001, max_it = 50000, e.abs = 1E-3, e.rel = 1E-3, maxgrid, nlambda, rho = 5, my_print = FALSE, alph = 1.8, tree, parallel = TRUE, pal = 0, gg = NULL, tol = 1E-4, cl = 4, legacy = FALSE) {
N <- nrow(X)
p <- ncol(X)
K <- ncol(Z)
D <- dim(y)[2]
TT <- tree
C <- TT$Tree
CW <- TT$Tw
BETA0 <- lapply(
seq_len(nlambda),
function(j) matrix(0, nrow = D)
)
BETA <- lapply(
seq_len(nlambda),
function(j) as(matrix(0, nrow = p, ncol = D), "sparseMatrix")
)
BETA_hat <- lapply(
seq_len(nlambda),
function(j) as(matrix(0, nrow = p + p * K, ncol = D), "sparseMatrix")
)
THETA0 <- lapply(
seq_len(nlambda),
function(j) matrix(0, nrow = K, ncol = D)
)
THETA <- lapply(
seq_len(nlambda),
function(j) as.sparse3Darray(array(0, c(p, K, D)))
)
Y_HAT <- lapply(
seq_len(nlambda),
function(j) matrix(0, nrow = N, ncol = D)
)
rat <- lambda_min
if (is.null(my_lambda)) {
r <- y
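# Data-derived entry value (lammax) for each response: the smallest lambda_3
# at which all coefficients are zero, rescaled by the mixing parameter alpha
# and the tree penalties (gg, CW).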
lammax <- lapply(
seq_len(dim(y)[2]),
function(g) {
l_max <- max(abs(t(X) %*% (r - colMeans(r))) / length(r[, 1])) / ((1 - alpha) + (max(gg[1, ]) * max(CW) + max(gg[2, ])))
return(l_max)
}
)
big_lambda <- lammax
lambda_i <- lapply(
seq_len(dim(y)[2]),
function(g) {
lam_i <- exp(seq(log(big_lambda[[g]]), log(big_lambda[[g]] * rat), length = maxgrid))
return(lam_i)
}
)
gg1 <- seq((gg[1, 1]), (gg[1, 2]), length = maxgrid)
gg2 <- seq((gg[2, 1]), (gg[2, 2]), length = maxgrid)
gg3 <- matrix(0, maxgrid, 2)
gg3[, 1] <- gg1
gg3[, 2] <- gg2
gg <- gg3
} else {
lambda_i <- my_lambda
}
lam_list <- list()
beta_0_list <- list()
theta_0_list <- list()
beta_list <- list()
theta_list <- list()
obj <- c()
n_main_terms <- c()
non_zero_theta <- c()
my_obj <- list()
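# Build the overlapped design matrix W (main-effect columns of X plus the
# X-Z interaction columns, N by (p + p*K)) and cache its SVD for the ADMM
# updates.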
my_W_hat <- generate_my_w(X = X, Z = Z)
svd.w <- svd(my_W_hat)
svd.w$tu <- t(svd.w$u)
svd.w$tv <- t(svd.w$v)
rho1 <- rho
D <- dim(y)[2]
### for response groups ###############################################################
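# Expand the tree into the block matrix I: the c-th D by D diagonal block
# carries the membership vector of tree node c, C[c, ], weighted by CW[c].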
input <- 1:(dim(y)[2] * nrow(C))
multiple_of_D <- (input %% dim(y)[2]) == 0
I <- matrix(0, nrow = nrow(C) * dim(y)[2], ncol = dim(y)[2])
II <- input[multiple_of_D]
diag(I[c(1:dim(y)[2]), ]) <- C[1, ] * (CW[1])
c_count <- 2
for (e in II[-length(II)]) {
diag(I[c((e + 1):(c_count * dim(y)[2])), ]) <- C[c_count, ] * (CW[c_count])
c_count <- 1 + c_count
}
new_I <- diag(t(I) %*% I)
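# Per-coefficient scaling for the ADMM updates: main effects (the first p
# entries) get weight 2 * rho, interaction terms get 3 * rho.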
new_G <- matrix(0, (p + p * K))
new_G[c(1:p)] <- 1
new_G[-c(1:p)] <- 2
new_G[c(1:p)] <- rho * (1 + new_G[c(1:p)])
new_G[-c(1:p)] <- rho * (1 + new_G[-c(1:p)])
invmat <- list() # denominator of the beta estimates
for (rr in 1:D) {
DD1 <- rho1 * (new_I[rr] + 1)
DD2 <- new_G + DD1
invmat[[rr]] <- DD2
}
beta0 <- matrix(0, 1, D)
theta0 <- matrix(0, K, D)
beta <- (matrix(0, p, D))
beta_hat <- (matrix(0, p + p * (K), D))
# auxiliary variables for the L1 norm####
theta <- (array(0, c(p, K, D)))
if (is.null(my_lambda)) {
lam <- matrix(0, nlambda, dim(y)[2])
for (i in 1:dim(y)[2]) {
lam[, i] <- lambda_i[[i]]
}
} else {
lam <- lambda_i
}
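# Warm start: fit the intercepts by regressing y on Z alone, then precompute
# W^T times the centered response for the ADMM updates.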
r_current <- y
b <- reg(r_current, Z)
beta0 <- b$beta0
theta0 <- b$theta0
new_y <- y - (matrix(1, N) %*% beta0 + Z %*% ((theta0)))
XtY <- crossprod((my_W_hat), (new_y))
cl1 <- cl
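# NOTE: a FORK cluster is only available on Unix-alikes; on Windows a PSOCK
# cluster would be needed instead.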
if (parallel) {
cl <- makeCluster(cl1, type = "FORK")
doParallel::registerDoParallel(cl = cl)
foreach::getDoParRegistered()
my_values <- foreach(i = 1:nlambda, .packages = "MADMMplasso", .combine = rbind) %dopar% {
admm_MADMMplasso(
beta0, theta0, beta, beta_hat, theta, rho1, X, Z, max_it, my_W_hat, XtY,
y, N, e.abs, e.rel, alpha, lam[i, ], alph, svd.w, tree, my_print,
invmat, gg[i, ], legacy
)
}
parallel::stopCluster(cl)
} else if (!parallel && pal == 0) {
my_values <- lapply(
seq_len(nlambda),
function(g) {
admm_MADMMplasso(
beta0, theta0, beta, beta_hat, theta, rho1, X, Z, max_it, my_W_hat,
XtY, y, N, e.abs, e.rel, alpha, lam[g, ], alph, svd.w, tree, my_print,
invmat, gg[g, ], legacy
)
}
)
}
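# Walk down the lambda_3 path: extract (or, when pal == 1, compute) the ADMM
# solution for each lambda, then threshold and store the estimates.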
repeat_loop <- 0
hh <- 1
while (hh <= nlambda) {
res_dual <- 0 # dual residual
res_pri <- 0 # primal residual
lambda <- lam[hh, ]
start_time <- Sys.time()
if (pal == 1) {
my_values <- admm_MADMMplasso(
beta0, theta0, beta, beta_hat, theta, rho1, X, Z, max_it, my_W_hat, XtY,
y, N, e.abs, e.rel, alpha, lambda, alph, svd.w, tree, my_print, invmat,
gg[hh, ], legacy
)
beta <- my_values$beta
theta <- my_values$theta
converge <- my_values$converge
my_obj[[hh]] <- list(my_values$obj)
beta0 <- my_values$beta0
theta0 <- my_values$theta0 ### iteration
beta_hat <- my_values$beta_hat
y_hat <- my_values$y_hat
}
cost_time <- Sys.time() - start_time
print(cost_time)
if (parallel && pal == 0) {
beta <- my_values[hh, ]$beta
theta <- my_values[hh, ]$theta
converge <- my_values[hh, ]$converge
my_obj[[hh]] <- list(my_values[hh, ]$obj)
beta0 <- my_values[hh, ]$beta0
theta0 <- my_values[hh, ]$theta0 ### iteration
beta_hat <- my_values[hh, ]$beta_hat
y_hat <- my_values[hh, ]$y_hat
} else if (!parallel && pal == 0) {
beta <- my_values[[hh]]$beta
theta <- my_values[[hh]]$theta
converge <- my_values[[hh]]$converge
my_obj[[hh]] <- list(my_values[[hh]]$obj)
beta0 <- my_values[[hh]]$beta0
theta0 <- my_values[[hh]]$theta0 ### iteration
beta_hat <- my_values[[hh]]$beta_hat
y_hat <- my_values[[hh]]$y_hat
}
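# Sparsify: zero out estimates whose magnitude falls below tol and store
# them in sparse form.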
beta1 <- as(beta * (abs(beta) > tol), "sparseMatrix")
theta1 <- as.sparse3Darray(theta * (abs(theta) > tol))
beta_hat1 <- as(beta_hat * (abs(beta_hat) > tol), "sparseMatrix")
n_interaction_terms <- count_nonzero_a((theta1))
n_main_terms <- (c(n_main_terms, count_nonzero_a((beta1))))
obj1 <- (sum(as.vector((y - y_hat)^2))) / (D * N)
obj <- c(obj, obj1)
non_zero_theta <- (c(non_zero_theta, n_interaction_terms))
lam_list <- (c(lam_list, lambda))
BETA0[[hh]] <- beta0
THETA0[[hh]] <- theta0
BETA[[hh]] <- as(beta1, "sparseMatrix")
BETA_hat[[hh]] <- as(beta_hat1, "sparseMatrix")
Y_HAT[[hh]] <- y_hat
THETA[[hh]] <- as.sparse3Darray(theta1)
if (hh == 1) {
print(c(hh, (n_main_terms[hh]), non_zero_theta[hh], obj1))
} else {
print(c(hh, (n_main_terms[hh]), non_zero_theta[hh], obj[hh - 1], obj1))
}
hh <- hh + 1
} ### lambda
remove(invmat)
remove(my_values)
remove(my_W_hat)
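# The first objective value is overwritten with the second (the first fit
# along the path can be unreliable).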
obj[1] <- obj[2]
pred <- data.frame(Lambda = lam, nzero = n_main_terms, nzero_inter = non_zero_theta, OBJ_main = obj)
out <- list(beta0 = BETA0, beta = BETA, BETA_hat = BETA_hat, theta0 = THETA0, theta = THETA, path = pred, Lambdas = lam, non_zero = n_main_terms, LOSS = obj, Y_HAT = Y_HAT, gg = gg)
class(out) <- "MADMMplasso"
# Return results
return(out)
}
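
# A minimal usage sketch (not run), with small simulated data. The gg values
# below are illustrative, not tuned, and tree_parms() is used to obtain the
# default response clustering described in the documentation above.
# N <- 100; p <- 10; K <- 4; D <- 3
# X <- matrix(rnorm(N * p), N, p)
# Z <- matrix(rnorm(N * K), N, K)
# y <- matrix(rnorm(N * D), N, D)
# tree <- tree_parms(y)
# gg <- rbind(c(0.02, 0.02), c(0.2, 0.2)) # rows: lambda_1 and lambda_2 bounds
# fit <- MADMMplasso(
#   X, Z, y, alpha = 0.5, maxgrid = 50, nlambda = 50,
#   tree = tree, gg = gg, parallel = FALSE, pal = 1
# )
# fit$path # summary of the fitted models over the lambda_3 path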