First version of the VLMC for collections of time series (issue #30).

fabrice-rossi · Feb 16, 2024 · 055148f · 055148f
1 parent cc00941
commit 055148f
Show file tree

Hide file tree

Showing 9 changed files with 499 additions and 1 deletion.
diff --git a/NAMESPACE b/NAMESPACE
@@ -69,6 +69,7 @@ S3method(logLik,covlmc)
 S3method(logLik,vlmc)
 S3method(logLik,vlmc_cpp)
 S3method(loglikelihood,covlmc)
+S3method(loglikelihood,multi_vlmc)
 S3method(loglikelihood,vlmc)
 S3method(loglikelihood,vlmc_cpp)
 S3method(metrics,covlmc)
@@ -111,6 +112,7 @@ S3method(print,vlmc_cpp)
 S3method(probs,ctx_node)
 S3method(probs,ctx_node_cpp)
 S3method(prune,covlmc)
+S3method(prune,multi_vlmc)
 S3method(prune,vlmc)
 S3method(prune,vlmc_cpp)
 S3method(restore_model,ctx_tree_cpp)
@@ -158,6 +160,7 @@ export(merged_with)
 export(metrics)
 export(model)
 export(multi_ctx_tree)
+export(multi_vlmc)
 export(parent)
 export(positions)
 export(prune)

diff --git a/R/vlmc_likelihood_multi.R b/R/vlmc_likelihood_multi.R
@@ -0,0 +1,38 @@
+#' @inherit loglikelihood
+#' @param newdata either a discrete time series or a list of discrete time
+#'   series
+#' @description This function evaluates the log-likelihood of a VLMC fitted on a
+#'   collection of discrete time series.
+#'
+#' @section Limitation:
+#'
+#'   VLMC fitted via [multi_vlmc()] on a collection discrete time series do not
+#'   support likelihood calculation with `newdata`.
+#'
+#' @seealso [multi_vlmc()]
+#' @export
+loglikelihood.multi_vlmc <- function(vlmc, newdata, initial = c("truncated", "specific", "extended"),
+                                     ignore, ...) {
+  assertthat::assert_that(!missing(newdata))
+  if (!is.list(newdata)) {
+    NextMethod()
+  } else {
+    initial <- match.arg(initial)
+    ll <- 0
+    nb_obs <- 0L
+    for (k in seq_along(newdata)) {
+      if (missing(ignore)) {
+        the_ll <- loglikelihood(vlmc, newdata[[k]], initial, ...)
+      } else {
+        the_ll <- loglikelihood(vlmc, newdata[[k]], initial, ignore, ...)
+      }
+      ll <- ll + as.numeric(the_ll)
+      nb_obs <- nb_obs + attr(the_ll, "nobs")
+    }
+    df <- attr(the_ll, "df")
+    attr(ll, "df") <- df
+    attr(ll, "nobs") <- nb_obs
+    attr(ll, "initial") <- initial
+    structure(ll, class = c("logLikMixVLMC", "logLik"))
+  }
+}
diff --git a/R/vlmc_multi.R b/R/vlmc_multi.R
@@ -0,0 +1,106 @@
+## VLMC for multiple series
+
+#' Fit a Variable Length Markov Chain (VLMC) to a collection of time series
+#'
+#' This function fits a  Variable Length Markov Chain (VLMC) to a collection of
+#' discrete time series.
+#'
+#' Owing to the iterative nature of the construction, this function may use a large
+#' quantity of memory as pruning infrequent contexts is only done after
+#' computing all of them. It is therefore recommend to avoid large depths and
+#' the default value of `max_depth` is smaller than in the single time series
+#' function [vlmc()].
+#'
+#' @param xs list of discrete times series
+#' @param alpha number in (0,1] (default: 0.05) cut off value in quantile scale
+#'   in the pruning phase.
+#' @param cutoff non negative number: cut off value in native (likelihood ratio)
+#'   scale in the pruning phase. Defaults to the value obtained from `alpha`.
+#'   Takes precedence over `alpha` is specified.
+#' @param min_size integer >= 1 (default: 2). Minimum number of observations for
+#'   a context in the growing phase of the context tree.
+#' @param max_depth integer >= 1 (default: 25). Longest context considered in
+#'   growing phase of the context tree.
+#' @param prune logical: specify whether the context tree should be pruned
+#'   (default behaviour).
+#' @param keep_match logical: specify whether to keep the context matches
+#'   (default to FALSE)
+#' @returns a fitted vlmc model (of class `multi_vlmc`)
+#' @examples
+#' pc <- powerconsumption[powerconsumption$week %in% 5:8, ]
+#' powerlevels <- quantile(pc$active_power, probs = c(0.25, 0.5, 0.75, 1))
+#' dts <- lapply(
+#'   5:8,
+#'   function(x) {
+#'     cut(pc$active_power[pc$week == x],
+#'       breaks = c(0, powerlevels)
+#'     )
+#'   }
+#' )
+#' model <- multi_vlmc(dts, max_depth = 3)
+#' draw(model)
+#' depth(model)
+#' @export
+#' @seealso [multi_ctx_tree()], [vlmc()]
+multi_vlmc <- function(xs, alpha = 0.05, cutoff = NULL, min_size = 2L, max_depth = 25L,
+                       prune = TRUE, keep_match = FALSE) {
+  ## keep_match=TRUE is currently not supported
+  assertthat::assert_that(!keep_match)
+  assertthat::assert_that(is.list(xs))
+  ctx_tree <- multi_ctx_tree(xs,
+    min_size = min_size, max_depth = max_depth,
+    keep_position = keep_match
+  )
+  if (is.null(cutoff)) {
+    if (is.null(alpha) || !is.numeric(alpha) || alpha <= 0 || alpha > 1) {
+      stop("the alpha parameter must be in (0, 1]")
+    }
+    cutoff <- to_native(alpha, length(ctx_tree$vals))
+  } else {
+    ## cutoff takes precedence
+    if (!is.numeric(cutoff) || cutoff < 0) {
+      stop("the cutoff parameter must be a non negative number")
+    }
+    alpha <- to_quantile(cutoff, length(ctx_tree$vals))
+  }
+  if (prune) {
+    result <- prune_ctx_tree(ctx_tree, alpha = alpha, cutoff = cutoff)
+    class(result) <- c("multi_vlmc", class(result))
+  } else {
+    result <- new_ctx_tree(ctx_tree$vals, ctx_tree, class = c("multi_vlmc", "vlmc"))
+  }
+  result$alpha <- alpha
+  result$cutoff <- cutoff
+  result$keep_match <- keep_match
+  result$data_size <- sum(lengths(xs, use.names = FALSE))
+  result$pruned <- prune
+  result
+}
+
+#' @rdname prune
+#' @export
+prune.multi_vlmc <- function(vlmc, alpha = 0.05, cutoff = NULL, ...) {
+  if (is.null(cutoff)) {
+    if (is.null(alpha) || !is.numeric(alpha) || alpha <= 0 || alpha > 1) {
+      stop("the alpha parameter must be in (0, 1]")
+    }
+  }
+  result <- prune_ctx_tree(vlmc,
+    alpha = alpha, cutoff = cutoff,
+    class = c("multi_vlmc", "vlmc")
+  )
+  if (is.null(cutoff)) {
+    cutoff <- to_native(alpha, length(vlmc$vals))
+  } else {
+    ## cutoff takes precedence
+    alpha <- to_quantile(cutoff, length(vlmc$vals))
+  }
+  result$alpha <- alpha
+  result$cutoff <- cutoff
+  result$data_size <- vlmc$data_size
+  result$keep_match <- vlmc$keep_match
+  ## preserve the construction information
+  result$max_depth <- vlmc$max_depth
+  result$pruned <- TRUE
+  result
+}
diff --git a/R/vlmc_tune_multi.R b/R/vlmc_tune_multi.R
@@ -0,0 +1,137 @@
+tune_multi_vlmc <- function(xs, criterion = c("BIC", "AIC"),
+                            initial = c("truncated", "specific", "extended"),
+                            alpha_init = NULL, cutoff_init = NULL,
+                            min_size = 2L, max_depth = 10L,
+                            verbose = 0,
+                            save = c("best", "initial", "all")) {
+  criterion <- match.arg(criterion)
+  initial <- match.arg(initial)
+  save <- match.arg(save)
+  data_sizes <- lengths(xs)
+  if (is.null(alpha_init) && is.null(cutoff_init)) {
+    if (criterion == "BIC") {
+      cutoff <- 0.25 * log(sum(data_sizes))
+      f_criterion <- stats::BIC
+    } else {
+      cutoff <- 1
+      f_criterion <- stats::AIC
+    }
+  } else {
+    if (is.null(cutoff_init)) {
+      if (is.null(alpha_init) || !is.numeric(alpha_init) || alpha_init <= 0 || alpha_init > 1) {
+        stop("the alpha_init parameter must be in (0, 1]")
+      }
+      ## we need to compute the state model
+      nx <- to_dts(xs[[1]])
+      cutoff <- to_native(alpha_init, length(nx$vals))
+    } else {
+      ## cutoff takes precedence
+      if (!is.numeric(cutoff_init) || cutoff_init < 0) {
+        stop("the cutoff_init parameter must be a non negative number")
+      }
+      cutoff <- cutoff_init
+    }
+  }
+  if (criterion == "BIC") {
+    f_criterion <- stats::BIC
+  } else {
+    f_criterion <- stats::AIC
+  }
+  if (verbose > 0) {
+    cat("Fitting a vlmc with max_depth=", max_depth, "and cutoff=", cutoff, "\n")
+  }
+  saved_models <- list()
+  base_model <- multi_vlmc(xs,
+    cutoff = cutoff, min_size = min_size,
+    max_depth = max_depth
+  )
+  while (base_model$max_depth) {
+    n_max_depth <- min(2 * max_depth, min(data_sizes) - 1)
+    if (n_max_depth > max_depth) {
+      if (verbose > 0) {
+        cat("Max depth reached, increasing it to", n_max_depth, "\n")
+      }
+      max_depth <- n_max_depth
+      base_model <- multi_vlmc(xs, cutoff = cutoff, min_size = min_size, max_depth = max_depth)
+    } else {
+      warning("cannot find a suitable value for max_depth")
+      break
+    }
+  }
+  if (verbose > 0) {
+    cat("Pruning phase\n")
+  }
+  cutoffs <- cutoff(base_model, scale = "native")
+  results <- data.frame(
+    cutoff = c(cutoff, cutoffs),
+    alpha = to_quantile(c(cutoff, cutoffs), length(states(base_model))),
+    depth = rep(NA, length(cutoffs) + 1),
+    nb_contexts = rep(NA, length(cutoffs) + 1),
+    loglikelihood = rep(NA, length(cutoffs) + 1),
+    AIC = rep(NA, length(cutoffs) + 1),
+    BIC = rep(NA, length(cutoffs) + 1)
+  )
+  k <- 1
+  model <- base_model
+  best_crit <- Inf
+  if (verbose > 0) {
+    cat("Initial criterion =", best_crit, "\n")
+  }
+  if (save == "all") {
+    all_models <- vector(mode = "list", length = length(cutoffs))
+  }
+  max_order <- depth(model)
+  repeat {
+    if (verbose > 0) {
+      cat("Computing loglikelihood\n")
+    }
+    if (initial == "truncated") {
+      ll <- loglikelihood(model, initial = "truncated", newdata = xs, ignore = max_order)
+    } else {
+      ll <- loglikelihood(model, initial = initial, newdata = xs)
+    }
+    crit <- f_criterion(ll)
+    if (crit <= best_crit) {
+      best_crit <- crit
+      best_model <- model
+      if (verbose > 0) {
+        cat(
+          "Improving criterion =", best_crit, "likelihood =", ll,
+          "df =", attr(ll, "df"),
+          "nobs = ", attr(ll, "nobs"), "\n"
+        )
+      }
+    }
+    results$depth[k] <- depth(model)
+    results$nb_contexts[k] <- context_number(model)
+    results$loglikelihood[k] <- ll
+    results$AIC[k] <- stats::AIC(ll)
+    results$BIC[k] <- stats::BIC(ll)
+    if (k <= length(cutoffs)) {
+      if (verbose > 0) {
+        cat("Pruning vlmc with cutoff =", cutoffs[k], "\n")
+      }
+      model <- prune(model, cutoff = cutoffs[k])
+      if (save == "all") {
+        all_models[[k]] <- model
+      }
+      k <- k + 1
+    } else {
+      break
+    }
+  }
+  pre_result <- list(
+    best_model = best_model,
+    best_ll = loglikelihood(best_model, newdata = xs),
+    criterion = criterion,
+    initial = initial,
+    results = results,
+    cutoffs = c(cutoff, cutoffs)
+  )
+  if (save == "all") {
+    pre_result[["saved_models"]] <- list(initial = base_model, all = all_models)
+  } else if (save == "initial") {
+    pre_result[["saved_models"]] <- list(initial = base_model)
+  }
+  structure(pre_result, class = "tune_vlmc")
+}
diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -111,6 +111,8 @@ reference:
   desc: Functions used to estimate a (CO)VLMC model on a collection of time series rather than on a single one
 - contents:
   - multi_ctx_tree
+  - multi_vlmc
+  - loglikelihood.multi_vlmc
 - title: Global options
 - contents:
   - mixvlmc-package