diff --git a/NAMESPACE b/NAMESPACE index 1e99f96..d0c142f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -82,6 +82,7 @@ S3method(logLik,covlmc) S3method(logLik,vlmc) S3method(logLik,vlmc_cpp) S3method(loglikelihood,covlmc) +S3method(loglikelihood,multi_vlmc) S3method(loglikelihood,vlmc) S3method(loglikelihood,vlmc_cpp) S3method(metrics,covlmc) @@ -125,6 +126,7 @@ S3method(print,vlmc_cpp) S3method(probs,ctx_node) S3method(probs,ctx_node_cpp) S3method(prune,covlmc) +S3method(prune,multi_vlmc) S3method(prune,vlmc) S3method(prune,vlmc_cpp) S3method(restore_model,ctx_tree_cpp) @@ -182,6 +184,7 @@ export(merged_with) export(metrics) export(model) export(multi_ctx_tree) +export(multi_vlmc) export(parent) export(positions) export(prune) diff --git a/R/vlmc_likelihood_multi.R b/R/vlmc_likelihood_multi.R new file mode 100644 index 0000000..d9c5a91 --- /dev/null +++ b/R/vlmc_likelihood_multi.R @@ -0,0 +1,38 @@ +#' @inherit loglikelihood +#' @param newdata either a discrete time series or a list of discrete time +#' series +#' @description This function evaluates the log-likelihood of a VLMC fitted on a +#' collection of discrete time series. +#' +#' @section Limitation: +#' +#' VLMC fitted via [multi_vlmc()] on a collection discrete time series do not +#' support likelihood calculation with `newdata`. +#' +#' @seealso [multi_vlmc()] +#' @export +loglikelihood.multi_vlmc <- function(vlmc, newdata, initial = c("truncated", "specific", "extended"), + ignore, ...) { + assertthat::assert_that(!missing(newdata)) + if (!is.list(newdata)) { + NextMethod() + } else { + initial <- match.arg(initial) + ll <- 0 + nb_obs <- 0L + for (k in seq_along(newdata)) { + if (missing(ignore)) { + the_ll <- loglikelihood(vlmc, newdata[[k]], initial, ...) + } else { + the_ll <- loglikelihood(vlmc, newdata[[k]], initial, ignore, ...) + } + ll <- ll + as.numeric(the_ll) + nb_obs <- nb_obs + attr(the_ll, "nobs") + } + df <- attr(the_ll, "df") + attr(ll, "df") <- df + attr(ll, "nobs") <- nb_obs + attr(ll, "initial") <- initial + structure(ll, class = c("logLikMixVLMC", "logLik")) + } +} diff --git a/R/vlmc_multi.R b/R/vlmc_multi.R new file mode 100644 index 0000000..048c2ad --- /dev/null +++ b/R/vlmc_multi.R @@ -0,0 +1,106 @@ +## VLMC for multiple series + +#' Fit a Variable Length Markov Chain (VLMC) to a collection of time series +#' +#' This function fits a Variable Length Markov Chain (VLMC) to a collection of +#' discrete time series. +#' +#' Owing to the iterative nature of the construction, this function may use a large +#' quantity of memory as pruning infrequent contexts is only done after +#' computing all of them. It is therefore recommend to avoid large depths and +#' the default value of `max_depth` is smaller than in the single time series +#' function [vlmc()]. +#' +#' @param xs list of discrete times series +#' @param alpha number in (0,1] (default: 0.05) cut off value in quantile scale +#' in the pruning phase. +#' @param cutoff non negative number: cut off value in native (likelihood ratio) +#' scale in the pruning phase. Defaults to the value obtained from `alpha`. +#' Takes precedence over `alpha` is specified. +#' @param min_size integer >= 1 (default: 2). Minimum number of observations for +#' a context in the growing phase of the context tree. +#' @param max_depth integer >= 1 (default: 25). Longest context considered in +#' growing phase of the context tree. +#' @param prune logical: specify whether the context tree should be pruned +#' (default behaviour). +#' @param keep_match logical: specify whether to keep the context matches +#' (default to FALSE) +#' @returns a fitted vlmc model (of class `multi_vlmc`) +#' @examples +#' pc <- powerconsumption[powerconsumption$week %in% 5:8, ] +#' powerlevels <- quantile(pc$active_power, probs = c(0.25, 0.5, 0.75, 1)) +#' dts <- lapply( +#' 5:8, +#' function(x) { +#' cut(pc$active_power[pc$week == x], +#' breaks = c(0, powerlevels) +#' ) +#' } +#' ) +#' model <- multi_vlmc(dts, max_depth = 3) +#' draw(model) +#' depth(model) +#' @export +#' @seealso [multi_ctx_tree()], [vlmc()] +multi_vlmc <- function(xs, alpha = 0.05, cutoff = NULL, min_size = 2L, max_depth = 25L, + prune = TRUE, keep_match = FALSE) { + ## keep_match=TRUE is currently not supported + assertthat::assert_that(!keep_match) + assertthat::assert_that(is.list(xs)) + ctx_tree <- multi_ctx_tree(xs, + min_size = min_size, max_depth = max_depth, + keep_position = keep_match + ) + if (is.null(cutoff)) { + if (is.null(alpha) || !is.numeric(alpha) || alpha <= 0 || alpha > 1) { + stop("the alpha parameter must be in (0, 1]") + } + cutoff <- to_native(alpha, length(ctx_tree$vals)) + } else { + ## cutoff takes precedence + if (!is.numeric(cutoff) || cutoff < 0) { + stop("the cutoff parameter must be a non negative number") + } + alpha <- to_quantile(cutoff, length(ctx_tree$vals)) + } + if (prune) { + result <- prune_ctx_tree(ctx_tree, alpha = alpha, cutoff = cutoff) + class(result) <- c("multi_vlmc", class(result)) + } else { + result <- new_ctx_tree(ctx_tree$vals, ctx_tree, class = c("multi_vlmc", "vlmc")) + } + result$alpha <- alpha + result$cutoff <- cutoff + result$keep_match <- keep_match + result$data_size <- sum(lengths(xs, use.names = FALSE)) + result$pruned <- prune + result +} + +#' @rdname prune +#' @export +prune.multi_vlmc <- function(vlmc, alpha = 0.05, cutoff = NULL, ...) { + if (is.null(cutoff)) { + if (is.null(alpha) || !is.numeric(alpha) || alpha <= 0 || alpha > 1) { + stop("the alpha parameter must be in (0, 1]") + } + } + result <- prune_ctx_tree(vlmc, + alpha = alpha, cutoff = cutoff, + class = c("multi_vlmc", "vlmc") + ) + if (is.null(cutoff)) { + cutoff <- to_native(alpha, length(vlmc$vals)) + } else { + ## cutoff takes precedence + alpha <- to_quantile(cutoff, length(vlmc$vals)) + } + result$alpha <- alpha + result$cutoff <- cutoff + result$data_size <- vlmc$data_size + result$keep_match <- vlmc$keep_match + ## preserve the construction information + result$max_depth <- vlmc$max_depth + result$pruned <- TRUE + result +} diff --git a/R/vlmc_tune_multi.R b/R/vlmc_tune_multi.R new file mode 100644 index 0000000..c9f4f36 --- /dev/null +++ b/R/vlmc_tune_multi.R @@ -0,0 +1,137 @@ +tune_multi_vlmc <- function(xs, criterion = c("BIC", "AIC"), + initial = c("truncated", "specific", "extended"), + alpha_init = NULL, cutoff_init = NULL, + min_size = 2L, max_depth = 10L, + verbose = 0, + save = c("best", "initial", "all")) { + criterion <- match.arg(criterion) + initial <- match.arg(initial) + save <- match.arg(save) + data_sizes <- lengths(xs) + if (is.null(alpha_init) && is.null(cutoff_init)) { + if (criterion == "BIC") { + cutoff <- 0.25 * log(sum(data_sizes)) + f_criterion <- stats::BIC + } else { + cutoff <- 1 + f_criterion <- stats::AIC + } + } else { + if (is.null(cutoff_init)) { + if (is.null(alpha_init) || !is.numeric(alpha_init) || alpha_init <= 0 || alpha_init > 1) { + stop("the alpha_init parameter must be in (0, 1]") + } + ## we need to compute the state model + nx <- to_dts(xs[[1]]) + cutoff <- to_native(alpha_init, length(nx$vals)) + } else { + ## cutoff takes precedence + if (!is.numeric(cutoff_init) || cutoff_init < 0) { + stop("the cutoff_init parameter must be a non negative number") + } + cutoff <- cutoff_init + } + } + if (criterion == "BIC") { + f_criterion <- stats::BIC + } else { + f_criterion <- stats::AIC + } + if (verbose > 0) { + cat("Fitting a vlmc with max_depth=", max_depth, "and cutoff=", cutoff, "\n") + } + saved_models <- list() + base_model <- multi_vlmc(xs, + cutoff = cutoff, min_size = min_size, + max_depth = max_depth + ) + while (base_model$max_depth) { + n_max_depth <- min(2 * max_depth, min(data_sizes) - 1) + if (n_max_depth > max_depth) { + if (verbose > 0) { + cat("Max depth reached, increasing it to", n_max_depth, "\n") + } + max_depth <- n_max_depth + base_model <- multi_vlmc(xs, cutoff = cutoff, min_size = min_size, max_depth = max_depth) + } else { + warning("cannot find a suitable value for max_depth") + break + } + } + if (verbose > 0) { + cat("Pruning phase\n") + } + cutoffs <- cutoff(base_model, scale = "native") + results <- data.frame( + cutoff = c(cutoff, cutoffs), + alpha = to_quantile(c(cutoff, cutoffs), length(states(base_model))), + depth = rep(NA, length(cutoffs) + 1), + nb_contexts = rep(NA, length(cutoffs) + 1), + loglikelihood = rep(NA, length(cutoffs) + 1), + AIC = rep(NA, length(cutoffs) + 1), + BIC = rep(NA, length(cutoffs) + 1) + ) + k <- 1 + model <- base_model + best_crit <- Inf + if (verbose > 0) { + cat("Initial criterion =", best_crit, "\n") + } + if (save == "all") { + all_models <- vector(mode = "list", length = length(cutoffs)) + } + max_order <- depth(model) + repeat { + if (verbose > 0) { + cat("Computing loglikelihood\n") + } + if (initial == "truncated") { + ll <- loglikelihood(model, initial = "truncated", newdata = xs, ignore = max_order) + } else { + ll <- loglikelihood(model, initial = initial, newdata = xs) + } + crit <- f_criterion(ll) + if (crit <= best_crit) { + best_crit <- crit + best_model <- model + if (verbose > 0) { + cat( + "Improving criterion =", best_crit, "likelihood =", ll, + "df =", attr(ll, "df"), + "nobs = ", attr(ll, "nobs"), "\n" + ) + } + } + results$depth[k] <- depth(model) + results$nb_contexts[k] <- context_number(model) + results$loglikelihood[k] <- ll + results$AIC[k] <- stats::AIC(ll) + results$BIC[k] <- stats::BIC(ll) + if (k <= length(cutoffs)) { + if (verbose > 0) { + cat("Pruning vlmc with cutoff =", cutoffs[k], "\n") + } + model <- prune(model, cutoff = cutoffs[k]) + if (save == "all") { + all_models[[k]] <- model + } + k <- k + 1 + } else { + break + } + } + pre_result <- list( + best_model = best_model, + best_ll = loglikelihood(best_model, newdata = xs), + criterion = criterion, + initial = initial, + results = results, + cutoffs = c(cutoff, cutoffs) + ) + if (save == "all") { + pre_result[["saved_models"]] <- list(initial = base_model, all = all_models) + } else if (save == "initial") { + pre_result[["saved_models"]] <- list(initial = base_model) + } + structure(pre_result, class = "tune_vlmc") +} diff --git a/_pkgdown.yml b/_pkgdown.yml index 41179b5..cb9bc10 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -122,6 +122,8 @@ reference: desc: Functions used to estimate a (CO)VLMC model on a collection of time series rather than on a single one - contents: - multi_ctx_tree + - multi_vlmc + - loglikelihood.multi_vlmc - title: Global options - contents: - mixvlmc-package diff --git a/man/loglikelihood.multi_vlmc.Rd b/man/loglikelihood.multi_vlmc.Rd new file mode 100644 index 0000000..a98806e --- /dev/null +++ b/man/loglikelihood.multi_vlmc.Rd @@ -0,0 +1,127 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/vlmc_likelihood_multi.R +\name{loglikelihood.multi_vlmc} +\alias{loglikelihood.multi_vlmc} +\title{Log-Likelihood of a VLMC} +\usage{ +\method{loglikelihood}{multi_vlmc}( + vlmc, + newdata, + initial = c("truncated", "specific", "extended"), + ignore, + ... +) +} +\arguments{ +\item{vlmc}{the vlmc representation.} + +\item{newdata}{either a discrete time series or a list of discrete time +series} + +\item{initial}{specifies the likelihood function, more precisely the way the +first few observations for which contexts cannot be calculated are integrated +in the likelihood. Defaults to \code{"truncated"}. See below for details.} + +\item{ignore}{specifies the number of initial values for which the loglikelihood +will not be computed. The minimal number depends on the likelihood function as +detailed below.} + +\item{...}{additional parameters for loglikelihood.} +} +\value{ +an object of class \code{logLikMixVLMC} and \code{logLik}. This is a number, +the log-likelihood of the (CO)VLMC with the following attributes: +\itemize{ +\item \code{df}: the number of parameters used by the VLMC for this likelihood calculation +\item \code{nobs}: the number of observations included in this likelihood calculation +\item \code{initial}: the value of the \code{initial} parameter used to compute this likelihood +} +} +\description{ +This function evaluates the log-likelihood of a VLMC fitted on a +collection of discrete time series. +} +\details{ +The definition of the likelihood function depends on the value of the +\code{initial} parameters, see the section below as well as the dedicated +vignette: \code{vignette("likelihood", package = "mixvlmc")}. + +For VLMC objects, the method \code{loglikelihood.vlmc} will be used. For VLMC with covariables, \code{loglikelihood.covlmc} +will instead be called. For more informations on \code{loglikelihood} methods, use \code{methods(loglikelihood)} and their associated documentation. +} +\section{Limitation}{ + + +VLMC fitted via \code{\link[=multi_vlmc]{multi_vlmc()}} on a collection discrete time series do not +support likelihood calculation with \code{newdata}. +} + +\section{likelihood calculation}{ + + +In a (CO)VLMC of \code{\link[=depth]{depth()}}=k, we need k past values in order to compute the +context of a given observation. As a consequence, in a time series \code{x}, the +contexts of \code{x[1]} to \code{x[k]} are unknown. Depending on the value of \code{initial} +different likelihood functions are used to tackle this difficulty: +\itemize{ +\item \code{initial=="truncated"}: the likelihood is computed using only +\code{x[(k+1):length(x)]} +\item \code{initial=="specific"}: the likelihood is computed on the full time series +using a specific context for the initial values, \code{x[1]} to \code{x[k]}. Each of +the specific context is unique, leading to a perfect likelihood of 1 (0 in +log scale). Thus the numerical value of the likelihood is identical as the +one obtained with \code{initial=="truncated"} but it is computed on \code{length(x)} +with a model with more parameters than in this previous case. +\item \code{initial=="extended"} (default): the likelihood is computed on the full time series +using an extended context matching for the initial values, \code{x[1]} to \code{x[k]}. +This can be seen as a compromised between the two other possibilities: +the relaxed context matching needs in general to turn internal nodes +of the context tree into actual context, increasing the number of parameters, +but not as much as with "specific". However, the likelihood of say \code{x[1]} +with an empty context is generally not 1 and thus the full likelihood is +smaller than the one computed with "specific". +} + +In all cases, the \code{ignore} first values of the time series are not included +in the computed likelihood, but still used to compute contexts. If \code{ignore} +is not specified, it is set to the minimal possible value, that is k for the +\code{truncated} likelihood and 0 for the other ones. If it is specified, it must +be larger or equal to k for \code{truncated}. + +See the dedicated vignette for a more mathematically oriented discussion: +\code{vignette("likelihood", package = "mixvlmc")}. +} + +\examples{ +## Likelihood for a fitted VLMC. +pc <- powerconsumption[powerconsumption$week == 5, ] +breaks <- c( + 0, + median(powerconsumption$active_power, na.rm = TRUE), + max(powerconsumption$active_power, na.rm = TRUE) +) +labels <- c(0, 1) +dts <- cut(pc$active_power, breaks = breaks, labels = labels) +m_nocovariate <- vlmc(dts) +ll <- loglikelihood(m_nocovariate) +ll +attr(ll, "nobs") +attr(ll, "df") + +## Likelihood for a new time series with previously fitted VLMC. +pc_new <- powerconsumption[powerconsumption$week == 11, ] +dts_new <- cut(pc_new$active_power, breaks = breaks, labels = labels) +ll_new <- loglikelihood(m_nocovariate, newdata = dts_new) +ll_new +attributes(ll_new) +ll_new_specific <- loglikelihood(m_nocovariate, initial = "specific", newdata = dts_new) +ll_new_specific +attributes(ll_new_specific) +ll_new_extended <- loglikelihood(m_nocovariate, initial = "extended", newdata = dts_new) +ll_new_extended +attributes(ll_new_extended) + +} +\seealso{ +\code{\link[=multi_vlmc]{multi_vlmc()}} +} diff --git a/man/multi_vlmc.Rd b/man/multi_vlmc.Rd new file mode 100644 index 0000000..beb0f3c --- /dev/null +++ b/man/multi_vlmc.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/vlmc_multi.R +\name{multi_vlmc} +\alias{multi_vlmc} +\title{Fit a Variable Length Markov Chain (VLMC) to a collection of time series} +\usage{ +multi_vlmc( + xs, + alpha = 0.05, + cutoff = NULL, + min_size = 2L, + max_depth = 25L, + prune = TRUE, + keep_match = FALSE +) +} +\arguments{ +\item{xs}{list of discrete times series} + +\item{alpha}{number in (0,1] (default: 0.05) cut off value in quantile scale +in the pruning phase.} + +\item{cutoff}{non negative number: cut off value in native (likelihood ratio) +scale in the pruning phase. Defaults to the value obtained from \code{alpha}. +Takes precedence over \code{alpha} is specified.} + +\item{min_size}{integer >= 1 (default: 2). Minimum number of observations for +a context in the growing phase of the context tree.} + +\item{max_depth}{integer >= 1 (default: 25). Longest context considered in +growing phase of the context tree.} + +\item{prune}{logical: specify whether the context tree should be pruned +(default behaviour).} + +\item{keep_match}{logical: specify whether to keep the context matches +(default to FALSE)} +} +\value{ +a fitted vlmc model (of class \code{multi_vlmc}) +} +\description{ +This function fits a Variable Length Markov Chain (VLMC) to a collection of +discrete time series. +} +\details{ +Owing to the iterative nature of the construction, this function may use a large +quantity of memory as pruning infrequent contexts is only done after +computing all of them. It is therefore recommend to avoid large depths and +the default value of \code{max_depth} is smaller than in the single time series +function \code{\link[=vlmc]{vlmc()}}. +} +\examples{ +pc <- powerconsumption[powerconsumption$week \%in\% 5:8, ] +powerlevels <- quantile(pc$active_power, probs = c(0.25, 0.5, 0.75, 1)) +dts <- lapply( + 5:8, + function(x) { + cut(pc$active_power[pc$week == x], + breaks = c(0, powerlevels) + ) + } +) +model <- multi_vlmc(dts, max_depth = 3) +draw(model) +depth(model) +} +\seealso{ +\code{\link[=multi_ctx_tree]{multi_ctx_tree()}}, \code{\link[=vlmc]{vlmc()}} +} diff --git a/man/prune.Rd b/man/prune.Rd index db055c5..85ba28d 100644 --- a/man/prune.Rd +++ b/man/prune.Rd @@ -1,9 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/vlmc.R, R/vlmc_cpp.R +% Please edit documentation in R/vlmc.R, R/vlmc_cpp.R, R/vlmc_multi.R \name{prune} \alias{prune} \alias{prune.vlmc} \alias{prune.vlmc_cpp} +\alias{prune.multi_vlmc} \title{Prune a Variable Length Markov Chain (VLMC)} \usage{ prune(vlmc, alpha = 0.05, cutoff = NULL, ...) @@ -11,6 +12,8 @@ prune(vlmc, alpha = 0.05, cutoff = NULL, ...) \method{prune}{vlmc}(vlmc, alpha = 0.05, cutoff = NULL, ...) \method{prune}{vlmc_cpp}(vlmc, alpha = 0.05, cutoff = NULL, ...) + +\method{prune}{multi_vlmc}(vlmc, alpha = 0.05, cutoff = NULL, ...) } \arguments{ \item{vlmc}{a fitted VLMC model.} diff --git a/tests/testthat/test-vlmc_multi.R b/tests/testthat/test-vlmc_multi.R new file mode 100644 index 0000000..92b6295 --- /dev/null +++ b/tests/testthat/test-vlmc_multi.R @@ -0,0 +1,12 @@ +test_that("multi_vlmc results are identical to vlmc ones for a single dts", { + withr::local_seed(0) + for (k in 1:9) { + x <- sample(0:k, 1000 + 100 * k, replace = TRUE) + svlmc <- vlmc(x, alpha = 0.1, max_depth = 15) + mvlmc <- multi_vlmc(list(x), alpha = 0.1, max_depth = 15) + expect_true(compare_ctx( + contexts(mvlmc), + contexts(svlmc) + )) + } +})