-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
First version of the VLMC for collections of time series (issue #30).
- Loading branch information
1 parent
53a9ccc
commit c408db4
Showing
9 changed files
with
499 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
#' @inherit loglikelihood | ||
#' @param newdata either a discrete time series or a list of discrete time | ||
#' series | ||
#' @description This function evaluates the log-likelihood of a VLMC fitted on a | ||
#' collection of discrete time series. | ||
#' | ||
#' @section Limitation: | ||
#' | ||
#' VLMC fitted via [multi_vlmc()] on a collection discrete time series do not | ||
#' support likelihood calculation with `newdata`. | ||
#' | ||
#' @seealso [multi_vlmc()] | ||
#' @export | ||
loglikelihood.multi_vlmc <- function(vlmc, newdata, initial = c("truncated", "specific", "extended"), | ||
ignore, ...) { | ||
assertthat::assert_that(!missing(newdata)) | ||
if (!is.list(newdata)) { | ||
NextMethod() | ||
} else { | ||
initial <- match.arg(initial) | ||
ll <- 0 | ||
nb_obs <- 0L | ||
for (k in seq_along(newdata)) { | ||
if (missing(ignore)) { | ||
the_ll <- loglikelihood(vlmc, newdata[[k]], initial, ...) | ||
} else { | ||
the_ll <- loglikelihood(vlmc, newdata[[k]], initial, ignore, ...) | ||
} | ||
ll <- ll + as.numeric(the_ll) | ||
nb_obs <- nb_obs + attr(the_ll, "nobs") | ||
} | ||
df <- attr(the_ll, "df") | ||
attr(ll, "df") <- df | ||
attr(ll, "nobs") <- nb_obs | ||
attr(ll, "initial") <- initial | ||
structure(ll, class = c("logLikMixVLMC", "logLik")) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
## VLMC for multiple series | ||
|
||
#' Fit a Variable Length Markov Chain (VLMC) to a collection of time series | ||
#' | ||
#' This function fits a Variable Length Markov Chain (VLMC) to a collection of | ||
#' discrete time series. | ||
#' | ||
#' Owing to the iterative nature of the construction, this function may use a large | ||
#' quantity of memory as pruning infrequent contexts is only done after | ||
#' computing all of them. It is therefore recommend to avoid large depths and | ||
#' the default value of `max_depth` is smaller than in the single time series | ||
#' function [vlmc()]. | ||
#' | ||
#' @param xs list of discrete times series | ||
#' @param alpha number in (0,1] (default: 0.05) cut off value in quantile scale | ||
#' in the pruning phase. | ||
#' @param cutoff non negative number: cut off value in native (likelihood ratio) | ||
#' scale in the pruning phase. Defaults to the value obtained from `alpha`. | ||
#' Takes precedence over `alpha` is specified. | ||
#' @param min_size integer >= 1 (default: 2). Minimum number of observations for | ||
#' a context in the growing phase of the context tree. | ||
#' @param max_depth integer >= 1 (default: 25). Longest context considered in | ||
#' growing phase of the context tree. | ||
#' @param prune logical: specify whether the context tree should be pruned | ||
#' (default behaviour). | ||
#' @param keep_match logical: specify whether to keep the context matches | ||
#' (default to FALSE) | ||
#' @returns a fitted vlmc model (of class `multi_vlmc`) | ||
#' @examples | ||
#' pc <- powerconsumption[powerconsumption$week %in% 5:8, ] | ||
#' powerlevels <- quantile(pc$active_power, probs = c(0.25, 0.5, 0.75, 1)) | ||
#' dts <- lapply( | ||
#' 5:8, | ||
#' function(x) { | ||
#' cut(pc$active_power[pc$week == x], | ||
#' breaks = c(0, powerlevels) | ||
#' ) | ||
#' } | ||
#' ) | ||
#' model <- multi_vlmc(dts, max_depth = 3) | ||
#' draw(model) | ||
#' depth(model) | ||
#' @export | ||
#' @seealso [multi_ctx_tree()], [vlmc()] | ||
multi_vlmc <- function(xs, alpha = 0.05, cutoff = NULL, min_size = 2L, max_depth = 25L, | ||
prune = TRUE, keep_match = FALSE) { | ||
## keep_match=TRUE is currently not supported | ||
assertthat::assert_that(!keep_match) | ||
assertthat::assert_that(is.list(xs)) | ||
ctx_tree <- multi_ctx_tree(xs, | ||
min_size = min_size, max_depth = max_depth, | ||
keep_position = keep_match | ||
) | ||
if (is.null(cutoff)) { | ||
if (is.null(alpha) || !is.numeric(alpha) || alpha <= 0 || alpha > 1) { | ||
stop("the alpha parameter must be in (0, 1]") | ||
} | ||
cutoff <- to_native(alpha, length(ctx_tree$vals)) | ||
} else { | ||
## cutoff takes precedence | ||
if (!is.numeric(cutoff) || cutoff < 0) { | ||
stop("the cutoff parameter must be a non negative number") | ||
} | ||
alpha <- to_quantile(cutoff, length(ctx_tree$vals)) | ||
} | ||
if (prune) { | ||
result <- prune_ctx_tree(ctx_tree, alpha = alpha, cutoff = cutoff) | ||
class(result) <- c("multi_vlmc", class(result)) | ||
} else { | ||
result <- new_ctx_tree(ctx_tree$vals, ctx_tree, class = c("multi_vlmc", "vlmc")) | ||
} | ||
result$alpha <- alpha | ||
result$cutoff <- cutoff | ||
result$keep_match <- keep_match | ||
result$data_size <- sum(lengths(xs, use.names = FALSE)) | ||
result$pruned <- prune | ||
result | ||
} | ||
|
||
#' @rdname prune | ||
#' @export | ||
prune.multi_vlmc <- function(vlmc, alpha = 0.05, cutoff = NULL, ...) { | ||
if (is.null(cutoff)) { | ||
if (is.null(alpha) || !is.numeric(alpha) || alpha <= 0 || alpha > 1) { | ||
stop("the alpha parameter must be in (0, 1]") | ||
} | ||
} | ||
result <- prune_ctx_tree(vlmc, | ||
alpha = alpha, cutoff = cutoff, | ||
class = c("multi_vlmc", "vlmc") | ||
) | ||
if (is.null(cutoff)) { | ||
cutoff <- to_native(alpha, length(vlmc$vals)) | ||
} else { | ||
## cutoff takes precedence | ||
alpha <- to_quantile(cutoff, length(vlmc$vals)) | ||
} | ||
result$alpha <- alpha | ||
result$cutoff <- cutoff | ||
result$data_size <- vlmc$data_size | ||
result$keep_match <- vlmc$keep_match | ||
## preserve the construction information | ||
result$max_depth <- vlmc$max_depth | ||
result$pruned <- TRUE | ||
result | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
tune_multi_vlmc <- function(xs, criterion = c("BIC", "AIC"), | ||
initial = c("truncated", "specific", "extended"), | ||
alpha_init = NULL, cutoff_init = NULL, | ||
min_size = 2L, max_depth = 10L, | ||
verbose = 0, | ||
save = c("best", "initial", "all")) { | ||
criterion <- match.arg(criterion) | ||
initial <- match.arg(initial) | ||
save <- match.arg(save) | ||
data_sizes <- lengths(xs) | ||
if (is.null(alpha_init) && is.null(cutoff_init)) { | ||
if (criterion == "BIC") { | ||
cutoff <- 0.25 * log(sum(data_sizes)) | ||
f_criterion <- stats::BIC | ||
} else { | ||
cutoff <- 1 | ||
f_criterion <- stats::AIC | ||
} | ||
} else { | ||
if (is.null(cutoff_init)) { | ||
if (is.null(alpha_init) || !is.numeric(alpha_init) || alpha_init <= 0 || alpha_init > 1) { | ||
stop("the alpha_init parameter must be in (0, 1]") | ||
} | ||
## we need to compute the state model | ||
nx <- to_dts(xs[[1]]) | ||
cutoff <- to_native(alpha_init, length(nx$vals)) | ||
} else { | ||
## cutoff takes precedence | ||
if (!is.numeric(cutoff_init) || cutoff_init < 0) { | ||
stop("the cutoff_init parameter must be a non negative number") | ||
} | ||
cutoff <- cutoff_init | ||
} | ||
} | ||
if (criterion == "BIC") { | ||
f_criterion <- stats::BIC | ||
} else { | ||
f_criterion <- stats::AIC | ||
} | ||
if (verbose > 0) { | ||
cat("Fitting a vlmc with max_depth=", max_depth, "and cutoff=", cutoff, "\n") | ||
} | ||
saved_models <- list() | ||
base_model <- multi_vlmc(xs, | ||
cutoff = cutoff, min_size = min_size, | ||
max_depth = max_depth | ||
) | ||
while (base_model$max_depth) { | ||
n_max_depth <- min(2 * max_depth, min(data_sizes) - 1) | ||
if (n_max_depth > max_depth) { | ||
if (verbose > 0) { | ||
cat("Max depth reached, increasing it to", n_max_depth, "\n") | ||
} | ||
max_depth <- n_max_depth | ||
base_model <- multi_vlmc(xs, cutoff = cutoff, min_size = min_size, max_depth = max_depth) | ||
} else { | ||
warning("cannot find a suitable value for max_depth") | ||
break | ||
} | ||
} | ||
if (verbose > 0) { | ||
cat("Pruning phase\n") | ||
} | ||
cutoffs <- cutoff(base_model, scale = "native") | ||
results <- data.frame( | ||
cutoff = c(cutoff, cutoffs), | ||
alpha = to_quantile(c(cutoff, cutoffs), length(states(base_model))), | ||
depth = rep(NA, length(cutoffs) + 1), | ||
nb_contexts = rep(NA, length(cutoffs) + 1), | ||
loglikelihood = rep(NA, length(cutoffs) + 1), | ||
AIC = rep(NA, length(cutoffs) + 1), | ||
BIC = rep(NA, length(cutoffs) + 1) | ||
) | ||
k <- 1 | ||
model <- base_model | ||
best_crit <- Inf | ||
if (verbose > 0) { | ||
cat("Initial criterion =", best_crit, "\n") | ||
} | ||
if (save == "all") { | ||
all_models <- vector(mode = "list", length = length(cutoffs)) | ||
} | ||
max_order <- depth(model) | ||
repeat { | ||
if (verbose > 0) { | ||
cat("Computing loglikelihood\n") | ||
} | ||
if (initial == "truncated") { | ||
ll <- loglikelihood(model, initial = "truncated", newdata = xs, ignore = max_order) | ||
} else { | ||
ll <- loglikelihood(model, initial = initial, newdata = xs) | ||
} | ||
crit <- f_criterion(ll) | ||
if (crit <= best_crit) { | ||
best_crit <- crit | ||
best_model <- model | ||
if (verbose > 0) { | ||
cat( | ||
"Improving criterion =", best_crit, "likelihood =", ll, | ||
"df =", attr(ll, "df"), | ||
"nobs = ", attr(ll, "nobs"), "\n" | ||
) | ||
} | ||
} | ||
results$depth[k] <- depth(model) | ||
results$nb_contexts[k] <- context_number(model) | ||
results$loglikelihood[k] <- ll | ||
results$AIC[k] <- stats::AIC(ll) | ||
results$BIC[k] <- stats::BIC(ll) | ||
if (k <= length(cutoffs)) { | ||
if (verbose > 0) { | ||
cat("Pruning vlmc with cutoff =", cutoffs[k], "\n") | ||
} | ||
model <- prune(model, cutoff = cutoffs[k]) | ||
if (save == "all") { | ||
all_models[[k]] <- model | ||
} | ||
k <- k + 1 | ||
} else { | ||
break | ||
} | ||
} | ||
pre_result <- list( | ||
best_model = best_model, | ||
best_ll = loglikelihood(best_model, newdata = xs), | ||
criterion = criterion, | ||
initial = initial, | ||
results = results, | ||
cutoffs = c(cutoff, cutoffs) | ||
) | ||
if (save == "all") { | ||
pre_result[["saved_models"]] <- list(initial = base_model, all = all_models) | ||
} else if (save == "initial") { | ||
pre_result[["saved_models"]] <- list(initial = base_model) | ||
} | ||
structure(pre_result, class = "tune_vlmc") | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.