diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index fee07312..c96815cf 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -1,10 +1,6 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: - push: - branches: [master] - pull_request: - branches: [master] release: types: [published] workflow_dispatch: diff --git a/DESCRIPTION b/DESCRIPTION index 7a00b5a5..a492f8fd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: BayesMallows Type: Package Title: Bayesian Preference Learning with the Mallows Rank Model -Version: 2.0.1 +Version: 2.0.1.9003 Authors@R: c(person("Oystein", "Sorensen", email = "oystein.sorensen.1985@gmail.com", role = c("aut", "cre"), @@ -46,7 +46,7 @@ BugReports: https://github.com/ocbe-uio/BayesMallows/issues License: GPL-3 Encoding: UTF-8 LazyData: true -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Depends: R (>= 3.5.0) Imports: Rcpp (>= 1.0.0), ggplot2 (>= 3.1.0), diff --git a/NAMESPACE b/NAMESPACE index 6ca2947d..a5ede157 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,12 +6,15 @@ S3method(compute_consensus,BayesMallows) S3method(compute_consensus,SMCMallows) S3method(compute_posterior_intervals,BayesMallows) S3method(compute_posterior_intervals,SMCMallows) +S3method(generate_initial_ranking,BayesMallowsIntransitive) +S3method(generate_initial_ranking,BayesMallowsTransitiveClosure) S3method(plot,BayesMallows) S3method(plot,SMCMallows) S3method(print,BayesMallows) S3method(print,BayesMallowsMixtures) S3method(print,SMCMallows) S3method(update_mallows,BayesMallows) +S3method(update_mallows,BayesMallowsPriorSamples) S3method(update_mallows,SMCMallows) export(assess_convergence) export(assign_cluster) @@ -33,6 +36,7 @@ export(plot_elbow) export(plot_top_k) export(predict_top_k) export(sample_mallows) +export(sample_prior) export(set_compute_options) export(set_initial_values) export(set_model_options) diff --git a/NEWS.md b/NEWS.md index 3e6c6f7a..15f7b791 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,26 @@ +# BayesMallows (development versions) + +* Fixed a bug in heat_plot() when the model has been estimated with + rho_thinning > 1, causing the probabilities to be unnormalized. Issue #381. + Thanks to Marta Crispino for discovering the bug. +* Added stratified, systematic, and residual resampling to the sequential + Monte Carlo algorithm. These distributions should in general be preferred to + multinomial resampling, which was the only available option until now. +* The move step of the SMC algorithm now allows a user-defined lag for the + sampling of latent ranks, specified in the "latent_sampling_lag" argument + to set_smc_options(). +* Prior for precision parameter alpha is now a gamma distribution. Until now + an exponential distribution has been assumed. Since the exponential is a special + case of the gamma with shape parameter equal to 1 (the default), this is not + a breaking change. However, it adds flexibility when it comes to specifying the prior. +* setup_rank_data() now accepts a single vector of rankings, silently converting a to matrix with a single row. +* Sequential Monte Carlo algorithm can now start from a sample from the prior + distribution, see the sample_prior() function for an example. +* Added support for parallelism under-the-hood with oneTBB. + # BayesMallows 2.0.1 +* Edits to C++ code fixing memory leaks. * Edits to unit tests which caused issues on CRAN. # BayesMallows 2.0.0 diff --git a/R/compute_mallows.R b/R/compute_mallows.R index 1b4e84f0..4c53fa99 100644 --- a/R/compute_mallows.R +++ b/R/compute_mallows.R @@ -79,21 +79,10 @@ compute_mallows <- function( validate_class(initial_values, "BayesMallowsInitialValues") validate_rankings(data) validate_preferences(data, model_options) + validate_rankings(data) validate_initial_values(initial_values, data) - pfun_values <- tryCatch( - prepare_partition_function(model_options$metric, data$n_items), - error = function(e) { - if (is.null(pfun_estimate)) { - stop( - "Exact partition function not known. Please provide an ", - "estimate in argument pfun_estimate." - ) - } else { - return(NULL) - } - } - ) + pfun_values <- extract_pfun_values(model_options, data, pfun_estimate) if (is.null(cl)) { lapplyfun <- lapply diff --git a/R/estimate_partition_function.R b/R/estimate_partition_function.R index de98d281..465f9dca 100644 --- a/R/estimate_partition_function.R +++ b/R/estimate_partition_function.R @@ -112,6 +112,22 @@ estimate_partition_function <- function( matrix(c(power, stats::lm(form, data = estimate)$coefficients), ncol = 2) } +extract_pfun_values <- function(model_options, data, pfun_estimate) { + tryCatch( + prepare_partition_function(model_options$metric, data$n_items), + error = function(e) { + if (is.null(pfun_estimate)) { + stop( + "Exact partition function not known. Please provide an ", + "estimate in argument pfun_estimate." + ) + } else { + return(NULL) + } + } + ) +} + prepare_partition_function <- function(metric, n_items) { if (metric %in% c("cayley", "hamming", "kendall")) { return(NULL) diff --git a/R/generate_initial_ranking.R b/R/generate_initial_ranking.R index a890e9e6..431d99ad 100644 --- a/R/generate_initial_ranking.R +++ b/R/generate_initial_ranking.R @@ -4,6 +4,7 @@ generate_initial_ranking <- function( UseMethod("generate_initial_ranking") } +#' @export generate_initial_ranking.BayesMallowsTransitiveClosure <- function( preferences, cl = NULL, shuffle_unranked = FALSE, random = FALSE, random_limit = 8L) { @@ -34,12 +35,7 @@ generate_initial_ranking.BayesMallowsTransitiveClosure <- function( } } - -.S3method( - "generate_initial_ranking", "BayesMallowsTransitiveClosure", - generate_initial_ranking.BayesMallowsTransitiveClosure -) - +#' @export generate_initial_ranking.BayesMallowsIntransitive <- function( preferences, cl = NULL, shuffle_unranked = FALSE, random = FALSE, random_limit = 8L) { diff --git a/R/heat_plot.R b/R/heat_plot.R index 6ff13fb0..ab017781 100644 --- a/R/heat_plot.R +++ b/R/heat_plot.R @@ -36,14 +36,14 @@ heat_plot <- function(model_fit, burnin = model_fit$burnin, ...) { drop = FALSE ] posterior_ranks$probability <- 1 - posterior_ranks$iteration <- NULL + heatplot_data <- aggregate(posterior_ranks[, "probability", drop = FALSE], by = list( cluster = posterior_ranks$cluster, item = posterior_ranks$item, value = posterior_ranks$value ), - FUN = function(x) sum(x) / (model_fit$nmc - burnin) + FUN = function(x) sum(x) / length(unique(posterior_ranks$iteration)) ) heatplot_data$item <- factor(heatplot_data$item, levels = item_order) diff --git a/R/sample_prior.R b/R/sample_prior.R new file mode 100644 index 00000000..b8e94c06 --- /dev/null +++ b/R/sample_prior.R @@ -0,0 +1,30 @@ +#' Sample from prior distribution +#' +#' Function to obtain samples from the prior distributions of the Bayesian +#' Mallows model. Intended to be given to [update_mallows()]. +#' +#' @param n An integer specifying the number of samples to take. +#' @param n_items An integer specifying the number of items to be ranked. +#' @param priors An object of class "BayesMallowsPriors" returned from +#' [set_priors()]. +#' +#' @return An object of class "BayesMallowsPriorSample", containing `n` +#' independent samples of \eqn{\alpha} and \eqn{\rho}. +#' +#' @export +#' +#' @family modeling +#' @example /inst/examples/sample_prior_example.R +sample_prior <- function(n, n_items, priors = set_priors()) { + validate_positive(n) + validate_positive(n_items) + ret <- list( + alpha = stats::rgamma(n, shape = priors$gamma, rate = priors$lambda), + rho = replicate(n, sample(n_items, n_items)), + priors = priors, + n_items = n_items, + items = seq_len(n_items) + ) + class(ret) <- "BayesMallowsPriorSamples" + ret +} diff --git a/R/set_priors.R b/R/set_priors.R index ea7c3a1e..76e95888 100644 --- a/R/set_priors.R +++ b/R/set_priors.R @@ -3,8 +3,13 @@ #' @description Set values related to the prior distributions for the Bayesian #' Mallows model. #' +#' @param gamma Strictly positive numeric value specifying the shape parameter +#' of the gamma prior distribution of \eqn{\alpha}. Defaults to `1`, thus +#' recovering the exponential prior distribution used by +#' \insertCite{vitelli2018}{BayesMallows}. +#' #' @param lambda Strictly positive numeric value specifying the rate parameter -#' of the truncated exponential prior distribution of \eqn{\alpha}. Defaults +#' of the gamma prior distribution of \eqn{\alpha}. Defaults #' to `0.001`. When `n_cluster > 1`, each mixture component \eqn{\alpha_{c}} #' has the same prior distribution. #' @@ -26,10 +31,13 @@ #' [update_mallows()]. #' @export #' +#' @references \insertAllCited{} +#' #' @family preprocessing #' -set_priors <- function(lambda = 0.001, psi = 10, kappa = c(1, 3)) { +set_priors <- function(gamma = 1, lambda = 0.001, psi = 10, kappa = c(1, 3)) { stopifnot(length(kappa) == 2) + validate_positive(gamma) validate_positive(lambda) validate_integer(psi) validate_positive(psi) diff --git a/R/set_smc_options.R b/R/set_smc_options.R index 81e952c2..7e23af9c 100644 --- a/R/set_smc_options.R +++ b/R/set_smc_options.R @@ -1,20 +1,73 @@ #' @title Set SMC compute options #' -#' @description Sets the SMC compute options to be used in [update_mallows.BayesMallows()]. +#' @description Sets the SMC compute options to be used in +#' [update_mallows.BayesMallows()]. #' #' @param n_particles Integer specifying the number of particles. #' @param mcmc_steps Number of MCMC steps to be applied in the resample-move #' step. +#' @param resampler Character string defining the resampling method to use. One +#' of "stratified", "systematic", "residual", and "multinomial". Defaults to +#' "stratified". While multinomial resampling was used in +#' \insertCite{steinSequentialInferenceMallows2023;textual}{BayesMallows}, +#' stratified, systematic, or residual resampling typically give lower Monte +#' Carlo error \insertCite{Douc2005,Hol2006,Naesseth2019}{BayesMallows}. +#' @param latent_sampling_lag Parameter specifying the number of timesteps to go +#' back when resampling the latent ranks in the move step. See Section 6.2.3 +#' of \insertCite{Kantas2015}{BayesMallows} for details. The \eqn{L} in their +#' notation corresponds to `latent_sampling_lag`. See more under Details. +#' Defaults to `NA`, which means that all latent ranks from previous timesteps +#' are resampled. If set to `0`, no move step is applied to the latent ranks. #' #' @return An object of class "SMCOptions". +#' +#' +#' @section Lag parameter in move step: +#' +#' The parameter `latent_sampling_lag` corresponds to \eqn{L} in +#' \insertCite{Kantas2015}{BayesMallows}. Its use in this package is can be +#' explained in terms of Algorithm 12 in +#' \insertCite{steinSequentialInferenceMallows2023}{BayesMallows}. The +#' relevant line of the algorithm is: +#' +#' **for** \eqn{j = 1 : M_{t}} **do** \cr +#' **M-H step:** update \eqn{\tilde{\mathbf{R}}_{j}^{(i)}} with proposal +#' \eqn{\tilde{\mathbf{R}}_{j}' \sim q(\tilde{\mathbf{R}}_{j}^{(i)} | +#' \mathbf{R}_{j}, \boldsymbol{\rho}_{t}^{(i)}, \alpha_{t}^{(i)})}.\cr +#' **end** +#' +#' Let \eqn{L} denote the value of `latent_sampling_lag`. With this parameter, +#' we modify for algorithm so it becomes +#' +#' **for** \eqn{j = M_{t-L+1} : M_{t}} **do** \cr +#' **M-H step:** update \eqn{\tilde{\mathbf{R}}_{j}^{(i)}} with proposal +#' \eqn{\tilde{\mathbf{R}}_{j}' \sim q(\tilde{\mathbf{R}}_{j}^{(i)} | +#' \mathbf{R}_{j}, \boldsymbol{\rho}_{t}^{(i)}, \alpha_{t}^{(i)})}.\cr +#' **end** +#' +#' This means that with \eqn{L=0} no move step is performed on any latent +#' ranks, whereas \eqn{L=1} means that the move step is only applied to the +#' parameters entering at the given timestep. The default, +#' `latent_sampling_lag = NA` means that \eqn{L=t} at each timestep, and hence +#' all latent ranks are part of the move step at each timestep. +#' +#' #' @export +#' @references \insertAllCited{} #' #' @family preprocessing set_smc_options <- function( n_particles = 1000, - mcmc_steps = 5) { + mcmc_steps = 5, + resampler = c("stratified", "systematic", "residual", "multinomial"), + latent_sampling_lag = NA_integer_) { validate_integer(n_particles) validate_integer(mcmc_steps) + if (!is.na(latent_sampling_lag)) validate_integer(latent_sampling_lag) + resampler <- match.arg( + resampler, + c("stratified", "systematic", "residual", "multinomial") + ) ret <- as.list(environment()) class(ret) <- "SMCOptions" diff --git a/R/setup_rank_data.R b/R/setup_rank_data.R index 2148574b..c5a1d46d 100644 --- a/R/setup_rank_data.R +++ b/R/setup_rank_data.R @@ -8,7 +8,9 @@ #' optional initial value of the rankings. If `rankings` has column names, #' these are assumed to be the names of the items. `NA` values in rankings are #' treated as missing data and automatically augmented; to change this -#' behavior, see the `na_action` argument to [set_model_options()]. +#' behavior, see the `na_action` argument to [set_model_options()]. A vector +#' length `n_items` is silently converted to a matrix of length `1 x n_items`, +#' and names (if any), are used as column names. #' #' @param preferences A data frame with one row per pairwise comparison, and #' columns `assessor`, `top_item`, and `bottom_item`. Each column contains the @@ -80,6 +82,12 @@ #' when all possible orderings are computed, i.e., when `random=TRUE`. #' Defaults to `8L`. #' +#' @param timepoint Integer vector specifying the timepoint. Defaults to `NULL`, +#' which means that a vector of ones, one for each observation, is generated. +#' Used by [update_mallows()] to identify data with a given iteration of the +#' sequential Monte Carlo algorithm. If not `NULL`, must contain one integer +#' for each row in `rankings`. +#' #' @note Setting `random=TRUE` means that all possible orderings of each #' assessor's preferences are generated, and one of them is picked at random. #' This can be useful when experiencing convergence issues, e.g., if the MCMC @@ -126,7 +134,8 @@ setup_rank_data <- function( cl = NULL, shuffle_unranked = FALSE, random = FALSE, - random_limit = 8L) { + random_limit = 8L, + timepoint = NULL) { na_action <- match.arg(na_action, c("augment", "fail", "omit")) if (is.null(rankings) && is.null(preferences)) { @@ -138,10 +147,19 @@ setup_rank_data <- function( if (na_action == "fail" && any(is.na(rankings))) { stop("rankings matrix contains NA values") } + if (!is.matrix(rankings)) { + rankings <- matrix(rankings, + nrow = 1, + dimnames = list(NULL, names(rankings)) + ) + } if (na_action == "omit" && any(is.na(rankings))) { keeps <- apply(rankings, 1, function(x) !any(is.na(x))) - print(paste("Omitting", sum(!keeps), "row(s) from rankings due to NA values")) + print(paste( + "Omitting", sum(!keeps), + "row(s) from rankings due to NA values" + )) rankings <- rankings[keeps, , drop = FALSE] } } else { @@ -153,7 +171,10 @@ setup_rank_data <- function( if (!is.null(observation_frequency)) { validate_positive_vector(observation_frequency) if (nrow(rankings) != length(observation_frequency)) { - stop("observation_frequency must be of same length as the number of rows in rankings") + stop( + "observation_frequency must be of same ", + "length as the number of rows in rankings" + ) } } else { observation_frequency <- rep(1, nrow(rankings)) @@ -164,6 +185,19 @@ setup_rank_data <- function( stop("invalid permutations provided in rankings matrix") } n_items <- ncol(rankings) + + if (!is.null(colnames(rankings))) { + items <- colnames(rankings) + } else { + items <- paste("Item", seq(from = 1, to = n_items, by = 1)) + } + + if (is.null(timepoint)) timepoint <- rep(1, nrow(rankings)) + if (length(timepoint) != nrow(rankings)) { + stop("must have one timepoint per row") + } + + constraints <- generate_constraints(preferences, n_items, cl) consistent <- matrix(integer(0)) diff --git a/R/smc_misc.R b/R/smc_misc.R new file mode 100644 index 00000000..f53c2b70 --- /dev/null +++ b/R/smc_misc.R @@ -0,0 +1,107 @@ +tidy_smc <- function(ret, items) { + result <- list() + result$alpha <- tidy_alpha(matrix(ret$alpha_samples, nrow = 1), 1, 1) + + rho_mat <- array(dim = c(dim(ret$rho_samples)[[1]], 1, dim(ret$rho_samples)[[2]])) + rho_mat[, 1, ] <- ret$rho_samples + result$rho <- tidy_rho(rho_mat, 1, 1, items) + + result +} + +extract_alpha_init <- function(model, n_particles) { + thinned_inds <- floor( + seq( + from = model$burnin + 1, to = ncol(model$alpha_samples), + length.out = n_particles + ) + ) + model$alpha_samples[1, thinned_inds, drop = TRUE] +} + +extract_rho_init <- function(model, n_particles) { + thinned_inds <- floor( + seq( + from = model$burnin + 1, to = dim(model$rho_samples)[[3]], + length.out = n_particles + ) + ) + model$rho_samples[, 1, thinned_inds, drop = TRUE] +} + +prepare_new_data <- function(model, new_data) { + if (!is.null(new_data$user_ids) && !is.null(model$data$user_ids)) { + old_users <- setdiff(model$data$user_ids, new_data$user_ids) + updated_users <- intersect(model$data$user_ids, new_data$user_ids) + new_users <- setdiff(new_data$user_ids, model$data$user_ids) + + rankings <- rbind( + model$data$rankings[old_users, , drop = FALSE], + new_data$rankings[c(updated_users, new_users), , drop = FALSE] + ) + + user_ids <- c(old_users, updated_users, new_users) + + data <- setup_rank_data(rankings = rankings, user_ids = user_ids) + new_data <- setup_rank_data( + rankings = rankings[new_users, , drop = FALSE], + user_ids = new_users + ) + + if (!is.null(model$augmented_rankings)) { + consistent <- matrix( + TRUE, + nrow = nrow(rankings), ncol = model$smc_options$n_particles + ) + for (uu in updated_users) { + index <- which(rownames(rankings) == uu) + to_compare <- as.numeric(stats::na.omit(rankings[index, ])) + + consistent[index, ] <- apply(model$augmented_rankings[, index, ], 2, function(ar) { + all(ar[ar %in% to_compare] == to_compare) + }) + } + data$consistent <- consistent * 1L + } + } else { + rankings <- rbind(model$data$rankings, new_data$rankings) + data <- setup_rank_data( + rankings = rankings, + user_ids = seq_len(nrow(rankings)), + timepoint = c(model$data$timepoint, new_data$timepoint) + ) + } + list(data = data, new_data = new_data) +} + +run_common_part <- function( + data, new_data, model_options, smc_options, compute_options, priors, + initial_values, pfun_list, model) { + ret <- run_smc( + data = data, + new_data = new_data, + model_options = model_options, + smc_options = smc_options, + compute_options = compute_options, + priors = priors, + initial_values = initial_values, + pfun_values = pfun_list$pfun_values, + pfun_estimate = pfun_list$pfun_estimate + ) + + ret <- c(ret, tidy_smc(ret, data$items)) + ret$model_options <- model_options + ret$smc_options <- smc_options + ret$compute_options <- compute_options + ret$priors <- priors + ret$n_items <- model$n_items + ret$burnin <- 0 + ret$n_clusters <- 1 + ret$data <- new_data + ret$pfun_values <- pfun_list$pfun_values + ret$pfun_estimate <- pfun_list$pfun_estimate + ret$metric <- model_options$metric + ret$items <- data$items + class(ret) <- c("SMCMallows", "BayesMallows") + ret +} diff --git a/R/tidy_mcmc.R b/R/tidy_mcmc.R index 0af0af42..a7320d3f 100644 --- a/R/tidy_mcmc.R +++ b/R/tidy_mcmc.R @@ -1,14 +1,6 @@ tidy_mcmc <- function(fits, data, model_options, compute_options) { fit <- list() fit$save_aug <- compute_options$save_aug - - # Add names of item - if (!is.null(colnames(data$rankings))) { - items <- colnames(data$rankings) - } else { - items <- paste("Item", seq(from = 1, to = data$n_items, by = 1)) - } - rho_dims <- dim(fits[[1]]$rho) fit$rho_samples <- fits[[1]]$rho for (f in fits[-1]) { @@ -17,7 +9,7 @@ tidy_mcmc <- function(fits, data, model_options, compute_options) { rhos <- lapply(seq_along(fits), function(i) fits[[i]]$rho) fit$rho <- do.call(rbind, lapply(seq_along(rhos), function(i) { - tidy_rho(rhos[[i]], i, compute_options$rho_thinning, items) + tidy_rho(rhos[[i]], i, compute_options$rho_thinning, data$items) })) alpha_dims <- dim(fits[[1]]$alpha) @@ -51,7 +43,7 @@ tidy_mcmc <- function(fits, data, model_options, compute_options) { fit$augmented_data <- do.call(rbind, lapply(seq_along(fits), function(i) { tidy_augmented_data( - fits[[i]]$augmented_data, i, items, + fits[[i]]$augmented_data, i, data$items, compute_options$aug_thinning ) })) @@ -61,7 +53,7 @@ tidy_mcmc <- function(fits, data, model_options, compute_options) { })) fit$n_clusters <- model_options$n_clusters - fit$items <- items + fit$items <- data$items fit$n_items <- data$n_items fit$n_assessors <- fits[[1]]$n_assessors @@ -72,8 +64,6 @@ tidy_mcmc <- function(fits, data, model_options, compute_options) { return(fit) } - - tidy_rho <- function(rho_mat, chain, rho_thinning, items) { # Tidy rho rho_dims <- dim(rho_mat) diff --git a/R/update_mallows.R b/R/update_mallows.R index e5dec0b0..79bd842e 100644 --- a/R/update_mallows.R +++ b/R/update_mallows.R @@ -6,8 +6,9 @@ #' \insertCite{steinSequentialInferenceMallows2023;textual}{BayesMallows}. #' #' @param model A model object of class "BayesMallows" returned from -#' [compute_mallows()] or an object of class "SMCMallows" returned from this -#' function. +#' [compute_mallows()], an object of class "SMCMallows" returned from this +#' function, or an object of class "BayesMallowsPriorSamples" returned from +#' [sample_prior()]. #' @param new_data An object of class "BayesMallowsData" returned from #' [setup_rank_data()]. The object should contain the new data being provided. #' @param model_options An object of class "BayesMallowsModelOptions" returned @@ -18,6 +19,11 @@ #' returned from [set_compute_options()]. #' @param priors An object of class "BayesMallowsPriors" returned from #' [set_priors()]. Defaults to the priors used in `model`. +#' @param pfun_estimate Object returned from [estimate_partition_function()]. +#' Defaults to \code{NULL}, and will only be used for footrule, Spearman, or +#' Ulam distances when the cardinalities are not available, cf. +#' [get_cardinalities()]. Only used by the specialization for objects of type +#' "BayesMallowsPriorSamples". #' @param ... Optional arguments. Currently not used. #' #' @return An updated model, of class "SMCMallows". @@ -31,6 +37,35 @@ update_mallows <- function(model, new_data, ...) { UseMethod("update_mallows") } +#' @export +#' @rdname update_mallows +update_mallows.BayesMallowsPriorSamples <- function( + model, new_data, model_options = set_model_options(), + smc_options = set_smc_options(), + compute_options = set_compute_options(), + priors = model$priors, + pfun_estimate = NULL, + ...) { + alpha_init <- sample(model$alpha, smc_options$n_particles, replace = TRUE) + rho_init <- model$rho[, sample(ncol(model$rho), smc_options$n_particles, replace = TRUE)] + pfun_values <- extract_pfun_values(model_options, new_data, pfun_estimate) + + run_common_part( + data = new_data, new_data = new_data, model_options = model_options, + smc_options = smc_options, compute_options = compute_options, + priors = priors, + initial_values = list( + alpha_init = alpha_init, rho_init = rho_init, + aug_init = NULL + ), + pfun_list = list( + pfun_values = pfun_values, + pfun_estimate = pfun_estimate + ), + model = model + ) +} + #' @export #' @rdname update_mallows update_mallows.BayesMallows <- function( @@ -45,36 +80,20 @@ update_mallows.BayesMallows <- function( alpha_init <- extract_alpha_init(model, smc_options$n_particles) rho_init <- extract_rho_init(model, smc_options$n_particles) - ret <- run_smc( - data = new_data, - new_data = new_data, - model_options = model_options, - smc_options = smc_options, - compute_options = compute_options, + run_common_part( + data = new_data, new_data = new_data, model_options = model_options, + smc_options = smc_options, compute_options = compute_options, priors = priors, initial_values = list( alpha_init = alpha_init, rho_init = rho_init, aug_init = NULL ), - pfun_values = model$pfun_values, - pfun_estimate = model$pfun_estimate + pfun_list = list( + pfun_values = model$pfun_values, + pfun_estimate = model$pfun_estimate + ), + model = model ) - - ret <- c(ret, tidy_smc(ret, model$items)) - ret$model_options <- model_options - ret$smc_options <- smc_options - ret$compute_options <- compute_options - ret$priors <- priors - ret$n_items <- model$n_items - ret$burnin <- 0 - ret$n_clusters <- 1 - ret$data <- new_data - ret$pfun_values <- model$pfun_values - ret$pfun_estimate <- model$pfun_estimate - ret$metric <- model$metric - ret$items <- model$items - class(ret) <- c("SMCMallows", "BayesMallows") - ret } #' @export @@ -97,6 +116,9 @@ update_mallows.SMCMallows <- function(model, new_data, ...) { pfun_values = model$pfun_values, pfun_estimate = model$pfun_estimate ) + model$alpha_samples <- ret$alpha_samples + model$rho_samples <- ret$rho_samples + model$augmented_rankings <- ret$augmented_rankings tidy_parameters <- tidy_smc(ret, model$items) model$alpha <- tidy_parameters$alpha model$rho <- tidy_parameters$rho @@ -106,78 +128,3 @@ update_mallows.SMCMallows <- function(model, new_data, ...) { class(model) <- c("SMCMallows", "BayesMallows") model } - -tidy_smc <- function(ret, items) { - result <- list() - result$alpha <- tidy_alpha(matrix(ret$alpha_samples, nrow = 1), 1, 1) - - rho_mat <- array(dim = c(dim(ret$rho_samples)[[1]], 1, dim(ret$rho_samples)[[2]])) - rho_mat[, 1, ] <- ret$rho_samples - result$rho <- tidy_rho(rho_mat, 1, 1, items) - - result -} - -extract_alpha_init <- function(model, n_particles) { - thinned_inds <- floor( - seq( - from = model$burnin + 1, to = ncol(model$alpha_samples), - length.out = n_particles - ) - ) - model$alpha_samples[1, thinned_inds, drop = TRUE] -} - -extract_rho_init <- function(model, n_particles) { - thinned_inds <- floor( - seq( - from = model$burnin + 1, to = dim(model$rho_samples)[[3]], - length.out = n_particles - ) - ) - model$rho_samples[, 1, thinned_inds, drop = TRUE] -} - -prepare_new_data <- function(model, new_data) { - if (!is.null(new_data$user_ids) && !is.null(model$data$user_ids)) { - old_users <- setdiff(model$data$user_ids, new_data$user_ids) - updated_users <- intersect(model$data$user_ids, new_data$user_ids) - new_users <- setdiff(new_data$user_ids, model$data$user_ids) - - rankings <- rbind( - model$data$rankings[old_users, , drop = FALSE], - new_data$rankings[c(updated_users, new_users), , drop = FALSE] - ) - - user_ids <- c(old_users, updated_users, new_users) - - data <- setup_rank_data(rankings = rankings, user_ids = user_ids) - new_data <- setup_rank_data( - rankings = rankings[new_users, , drop = FALSE], - user_ids = new_users - ) - - if (!is.null(model$augmented_rankings)) { - consistent <- matrix( - TRUE, - nrow = nrow(rankings), ncol = model$smc_options$n_particles - ) - for (uu in updated_users) { - index <- which(rownames(rankings) == uu) - to_compare <- as.numeric(stats::na.omit(rankings[index, ])) - - consistent[index, ] <- apply(model$augmented_rankings[, index, ], 2, function(ar) { - all(ar[ar %in% to_compare] == to_compare) - }) - } - data$consistent <- consistent * 1L - } - } else { - rankings <- rbind(model$data$rankings, new_data$rankings) - data <- setup_rank_data( - rankings = rankings, - user_ids = seq_len(nrow(rankings)) - ) - } - list(data = data, new_data = new_data) -} diff --git a/README.Rmd b/README.Rmd index fb23fec5..87bc79bb 100644 --- a/README.Rmd +++ b/README.Rmd @@ -113,6 +113,16 @@ Among the current applications, @liu2019b applied the Bayesian Mallows model for Plans for future extensions of the package include implementation of a variational Bayes algorithm for approximation the posterior distribution. The sequential Monte Carlo algorithms will also be extended to cover a larger part of the model framework, and we will add more options for specifications of prior distributions. +## Compilation with Thread Building Blocks (TBB) + +Parts of the underlying C++ code can be easily parallelized. We have support for this through [oneAPI Threading Building Blocks](https://github.com/oneapi-src/oneTBB). If you want to make use of this, build the package from source with the `-ltbb` argument. On an M1 Mac this can be achieved by first installing TBB through `brew install tbb` and then adding the following line to `~/.R/Makevars`: + +```{sh, eval=FALSE} +CXX17=g++-13 -I/opt/homebrew/include -L/opt/homebrew/lib -ltbb +``` + +The arguments on other platforms should be similar. If TBB is not available, the package will fall back on sequential processing. + ## Citation If using the BayesMallows package in academic work, please cite @sorensen2020, in addition to the relevant methodological papers. diff --git a/README.md b/README.md index 06bd6d19..87451358 100644 --- a/README.md +++ b/README.md @@ -151,6 +151,23 @@ distribution. The sequential Monte Carlo algorithms will also be extended to cover a larger part of the model framework, and we will add more options for specifications of prior distributions. +## Compilation with Thread Building Blocks (TBB) + +Parts of the underlying C++ code can be easily parallelized. We have +support for this through [oneAPI Threading Building +Blocks](https://github.com/oneapi-src/oneTBB). If you want to make use +of this, build the package from source with the `-ltbb` argument. On an +M1 Mac this can be achieved by first installing TBB through +`brew install tbb` and then adding the following line to +`~/.R/Makevars`: + +``` sh +CXX17=g++-13 -I/opt/homebrew/include -L/opt/homebrew/lib -ltbb +``` + +The arguments on other platforms should be similar. If TBB is not +available, the package will fall back on sequential processing. + ## Citation If using the BayesMallows package in academic work, please cite Sørensen diff --git a/codecov.yml b/codecov.yml index eb33c156..3358cd89 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,3 +1,6 @@ +ignore: + - "work-docs" + comment: false coverage: diff --git a/cran-comments.md b/cran-comments.md index f7447802..eb86ca68 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -8,10 +8,15 @@ There are also two failing unit tests on noLD. We believe these to be cause be d ## Test Environments + +* Local Ubuntu 22.04, R 4.3.2, running R CMD check with --use-valgrind option. +* Local Ubuntu 23.04 with R 4.3.2 built from source with option "--with-valgrind-instrumentation=2", running R CMD check with --use-valgrind option. +* r-devel-san via rocker/r-devel-san, running R CMD check with --use-valgrind option. * r-devel-san via rocker/r-devel-san. * local Windows install, R 4.3.2. * windows, devel, release and old-release. * R-CMD-check via GitHub Actions on windows-latest, macOS-latest, ubuntu-20.04 (release), and ubuntu-20.04 (devel). +* M1 builder. ## R CMD CHECK results diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib index 65c22981..e9a1bee2 100644 --- a/inst/REFERENCES.bib +++ b/inst/REFERENCES.bib @@ -1,3 +1,54 @@ +@inproceedings{Hol2006, + title = {On Resampling Algorithms for Particle Filters}, + url = {http://dx.doi.org/10.1109/NSSPW.2006.4378824}, + DOI = {10.1109/nsspw.2006.4378824}, + booktitle = {2006 IEEE Nonlinear Statistical Signal Processing Workshop}, + publisher = {IEEE}, + author = {Hol, Jeroen D. and Schon, Thomas B. and Gustafsson, Fredrik}, + year = {2006}, + month = sep +} + +@inproceedings{Douc2005, + title = {Comparison of resampling schemes for particle filtering}, + ISSN = {1845-5921}, + url = {http://dx.doi.org/10.1109/ISPA.2005.195385}, + DOI = {10.1109/ispa.2005.195385}, + booktitle = {ISPA 2005. Proceedings of the 4th International Symposium on Image and Signal Processing and Analysis, 2005.}, + publisher = {IEEE}, + author = {Douc, R. and Cappe, O.}, + year = {2005} +} + +@article{Naesseth2019, + title = {Elements of Sequential Monte Carlo}, + volume = {12}, + ISSN = {1935-8245}, + url = {http://dx.doi.org/10.1561/2200000074}, + DOI = {10.1561/2200000074}, + number = {3}, + journal = {Foundations and Trends{\textregistered} in Machine Learning}, + publisher = {Now Publishers}, + author = {Naesseth, Christian A. and Lindsten, Fredrik and Sch\"{o}n, Thomas B.}, + year = {2019}, + pages = {187–306} +} + + + +@article{Kantas2015, + title = {On Particle Methods for Parameter Estimation in State-Space Models}, + volume = {30}, + ISSN = {0883-4237}, + url = {http://dx.doi.org/10.1214/14-STS511}, + DOI = {10.1214/14-sts511}, + number = {3}, + journal = {Statistical Science}, + publisher = {Institute of Mathematical Statistics}, + author = {Kantas, Nikolas and Doucet, Arnaud and Singh, Sumeetpal S. and Maciejowski, Jan and Chopin, Nicolas}, + year = {2015}, + month = aug +} @phdthesis{steinSequentialInferenceMallows2023, title = {Sequential {{Inference}} with the {{Mallows Model}}}, diff --git a/inst/examples/sample_prior_example.R b/inst/examples/sample_prior_example.R new file mode 100644 index 00000000..8cac95cc --- /dev/null +++ b/inst/examples/sample_prior_example.R @@ -0,0 +1,22 @@ +# We can use a collection of particles from the prior distribution as +# initial values for the sequential Monte Carlo algorithm. +# Here we start by drawing 1000 particles from the priors, using default +# parameters. +prior_samples <- sample_prior(1000, ncol(sushi_rankings)) +# Next, we provide the prior samples to update_mallws(), together +# with the first five rows of the sushi dataset +model1 <- update_mallows( + model = prior_samples, + new_data = setup_rank_data(sushi_rankings[1:5, ])) +plot(model1) + +# We keep adding more data +model2 <- update_mallows( + model = model1, + new_data = setup_rank_data(sushi_rankings[6:10, ])) +plot(model2) + +model3 <- update_mallows( + model = model2, + new_data = setup_rank_data(sushi_rankings[11:15, ])) +plot(model3) diff --git a/inst/examples/update_mallows_example.R b/inst/examples/update_mallows_example.R index 99a650cd..717bef39 100644 --- a/inst/examples/update_mallows_example.R +++ b/inst/examples/update_mallows_example.R @@ -18,7 +18,10 @@ data_second_batch <- potato_visual[5:8, ] # We can now update the model using sequential Monte Carlo mod_second <- update_mallows( - model = mod_init, new_data = setup_rank_data(rankings = data_second_batch)) + model = mod_init, + new_data = setup_rank_data(rankings = data_second_batch), + smc_options = set_smc_options(resampler = "systematic") + ) # This model now has a collection of particles approximating the posterior # distribution after the first and second batch @@ -66,7 +69,8 @@ mod_init$burnin <- 2000 # Next assume the users update their rankings, so we have top-12 instead. mod1 <- update_mallows( model = mod_init, - new_data = setup_rank_data(rankings = potato_top_12, user_ids = user_ids) + new_data = setup_rank_data(rankings = potato_top_12, user_ids = user_ids), + smc_options = set_smc_options(resampler = "stratified") ) plot(mod1) diff --git a/man/BayesMallows-package.Rd b/man/BayesMallows-package.Rd index 39006863..c8165b5a 100644 --- a/man/BayesMallows-package.Rd +++ b/man/BayesMallows-package.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/BayesMallows.R \docType{package} \name{BayesMallows-package} +\alias{BayesMallows} \alias{BayesMallows-package} \title{BayesMallows: Bayesian Preference Learning with the Mallows Rank Model} \description{ diff --git a/man/compute_mallows.Rd b/man/compute_mallows.Rd index 9c74134f..53933212 100644 --- a/man/compute_mallows.Rd +++ b/man/compute_mallows.Rd @@ -303,6 +303,7 @@ subset(get_transitive_closure(beach_data), assessor \%in\% c(1, 2) & \seealso{ Other modeling: \code{\link{compute_mallows_mixtures}()}, +\code{\link{sample_prior}()}, \code{\link{update_mallows}()} } \concept{modeling} diff --git a/man/compute_mallows_mixtures.Rd b/man/compute_mallows_mixtures.Rd index 337de2d1..63778687 100644 --- a/man/compute_mallows_mixtures.Rd +++ b/man/compute_mallows_mixtures.Rd @@ -164,6 +164,7 @@ plot(mixture_model, parameter = "cluster_assignment") \seealso{ Other modeling: \code{\link{compute_mallows}()}, +\code{\link{sample_prior}()}, \code{\link{update_mallows}()} } \concept{modeling} diff --git a/man/figures/README-unnamed-chunk-8-1.png b/man/figures/README-unnamed-chunk-8-1.png index 3c0e8957..80346b53 100644 Binary files a/man/figures/README-unnamed-chunk-8-1.png and b/man/figures/README-unnamed-chunk-8-1.png differ diff --git a/man/sample_prior.Rd b/man/sample_prior.Rd new file mode 100644 index 00000000..d26efc58 --- /dev/null +++ b/man/sample_prior.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sample_prior.R +\name{sample_prior} +\alias{sample_prior} +\title{Sample from prior distribution} +\usage{ +sample_prior(n, n_items, priors = set_priors()) +} +\arguments{ +\item{n}{An integer specifying the number of samples to take.} + +\item{n_items}{An integer specifying the number of items to be ranked.} + +\item{priors}{An object of class "BayesMallowsPriors" returned from +\code{\link[=set_priors]{set_priors()}}.} +} +\value{ +An object of class "BayesMallowsPriorSample", containing \code{n} +independent samples of \eqn{\alpha} and \eqn{\rho}. +} +\description{ +Function to obtain samples from the prior distributions of the Bayesian +Mallows model. Intended to be given to \code{\link[=update_mallows]{update_mallows()}}. +} +\examples{ +# We can use a collection of particles from the prior distribution as +# initial values for the sequential Monte Carlo algorithm. +# Here we start by drawing 1000 particles from the priors, using default +# parameters. +prior_samples <- sample_prior(1000, ncol(sushi_rankings)) +# Next, we provide the prior samples to update_mallws(), together +# with the first five rows of the sushi dataset +model1 <- update_mallows( + model = prior_samples, + new_data = setup_rank_data(sushi_rankings[1:5, ])) +plot(model1) + +# We keep adding more data +model2 <- update_mallows( + model = model1, + new_data = setup_rank_data(sushi_rankings[6:10, ])) +plot(model2) + +model3 <- update_mallows( + model = model2, + new_data = setup_rank_data(sushi_rankings[11:15, ])) +plot(model3) +} +\seealso{ +Other modeling: +\code{\link{compute_mallows}()}, +\code{\link{compute_mallows_mixtures}()}, +\code{\link{update_mallows}()} +} +\concept{modeling} diff --git a/man/set_priors.Rd b/man/set_priors.Rd index 85d47700..a5324684 100644 --- a/man/set_priors.Rd +++ b/man/set_priors.Rd @@ -4,11 +4,16 @@ \alias{set_priors} \title{Set prior parameters for Bayesian Mallows model} \usage{ -set_priors(lambda = 0.001, psi = 10, kappa = c(1, 3)) +set_priors(gamma = 1, lambda = 0.001, psi = 10, kappa = c(1, 3)) } \arguments{ +\item{gamma}{Strictly positive numeric value specifying the shape parameter +of the gamma prior distribution of \eqn{\alpha}. Defaults to \code{1}, thus +recovering the exponential prior distribution used by +\insertCite{vitelli2018}{BayesMallows}.} + \item{lambda}{Strictly positive numeric value specifying the rate parameter -of the truncated exponential prior distribution of \eqn{\alpha}. Defaults +of the gamma prior distribution of \eqn{\alpha}. Defaults to \code{0.001}. When \code{n_cluster > 1}, each mixture component \eqn{\alpha_{c}} has the same prior distribution.} @@ -34,6 +39,9 @@ An object of class \code{"BayesMallowsPriors"}, to be provided in the Set values related to the prior distributions for the Bayesian Mallows model. } +\references{ +\insertAllCited{} +} \seealso{ Other preprocessing: \code{\link{get_transitive_closure}()}, diff --git a/man/set_smc_options.Rd b/man/set_smc_options.Rd index ec77d358..5671deae 100644 --- a/man/set_smc_options.Rd +++ b/man/set_smc_options.Rd @@ -4,19 +4,73 @@ \alias{set_smc_options} \title{Set SMC compute options} \usage{ -set_smc_options(n_particles = 1000, mcmc_steps = 5) +set_smc_options( + n_particles = 1000, + mcmc_steps = 5, + resampler = c("stratified", "systematic", "residual", "multinomial"), + latent_sampling_lag = NA_integer_ +) } \arguments{ \item{n_particles}{Integer specifying the number of particles.} \item{mcmc_steps}{Number of MCMC steps to be applied in the resample-move step.} + +\item{resampler}{Character string defining the resampling method to use. One +of "stratified", "systematic", "residual", and "multinomial". Defaults to +"stratified". While multinomial resampling was used in +\insertCite{steinSequentialInferenceMallows2023;textual}{BayesMallows}, +stratified, systematic, or residual resampling typically give lower Monte +Carlo error \insertCite{Douc2005,Hol2006,Naesseth2019}{BayesMallows}.} + +\item{latent_sampling_lag}{Parameter specifying the number of timesteps to go +back when resampling the latent ranks in the move step. See Section 6.2.3 +of \insertCite{Kantas2015}{BayesMallows} for details. The \eqn{L} in their +notation corresponds to \code{latent_sampling_lag}. See more under Details. +Defaults to \code{NA}, which means that all latent ranks from previous timesteps +are resampled. If set to \code{0}, no move step is applied to the latent ranks.} } \value{ An object of class "SMCOptions". } \description{ -Sets the SMC compute options to be used in \code{\link[=update_mallows.BayesMallows]{update_mallows.BayesMallows()}}. +Sets the SMC compute options to be used in +\code{\link[=update_mallows.BayesMallows]{update_mallows.BayesMallows()}}. +} +\section{Lag parameter in move step}{ + + +The parameter \code{latent_sampling_lag} corresponds to \eqn{L} in +\insertCite{Kantas2015}{BayesMallows}. Its use in this package is can be +explained in terms of Algorithm 12 in +\insertCite{steinSequentialInferenceMallows2023}{BayesMallows}. The +relevant line of the algorithm is: + +\strong{for} \eqn{j = 1 : M_{t}} \strong{do} \cr +\strong{M-H step:} update \eqn{\tilde{\mathbf{R}}_{j}^{(i)}} with proposal +\eqn{\tilde{\mathbf{R}}_{j}' \sim q(\tilde{\mathbf{R}}_{j}^{(i)} | + \mathbf{R}_{j}, \boldsymbol{\rho}_{t}^{(i)}, \alpha_{t}^{(i)})}.\cr +\strong{end} + +Let \eqn{L} denote the value of \code{latent_sampling_lag}. With this parameter, +we modify for algorithm so it becomes + +\strong{for} \eqn{j = M_{t-L+1} : M_{t}} \strong{do} \cr +\strong{M-H step:} update \eqn{\tilde{\mathbf{R}}_{j}^{(i)}} with proposal +\eqn{\tilde{\mathbf{R}}_{j}' \sim q(\tilde{\mathbf{R}}_{j}^{(i)} | + \mathbf{R}_{j}, \boldsymbol{\rho}_{t}^{(i)}, \alpha_{t}^{(i)})}.\cr +\strong{end} + +This means that with \eqn{L=0} no move step is performed on any latent +ranks, whereas \eqn{L=1} means that the move step is only applied to the +parameters entering at the given timestep. The default, +\code{latent_sampling_lag = NA} means that \eqn{L=t} at each timestep, and hence +all latent ranks are part of the move step at each timestep. +} + +\references{ +\insertAllCited{} } \seealso{ Other preprocessing: diff --git a/man/setup_rank_data.Rd b/man/setup_rank_data.Rd index a1a14dd2..e2d40b23 100644 --- a/man/setup_rank_data.Rd +++ b/man/setup_rank_data.Rd @@ -14,7 +14,8 @@ setup_rank_data( cl = NULL, shuffle_unranked = FALSE, random = FALSE, - random_limit = 8L + random_limit = 8L, + timepoint = NULL ) } \arguments{ @@ -24,7 +25,9 @@ converted to rankings. If \code{preferences} is provided, \code{rankings} is an optional initial value of the rankings. If \code{rankings} has column names, these are assumed to be the names of the items. \code{NA} values in rankings are treated as missing data and automatically augmented; to change this -behavior, see the \code{na_action} argument to \code{\link[=set_model_options]{set_model_options()}}.} +behavior, see the \code{na_action} argument to \code{\link[=set_model_options]{set_model_options()}}. A vector +length \code{n_items} is silently converted to a matrix of length \verb{1 x n_items}, +and names (if any), are used as column names.} \item{preferences}{A data frame with one row per pairwise comparison, and columns \code{assessor}, \code{top_item}, and \code{bottom_item}. Each column contains the @@ -95,6 +98,12 @@ generated for each assessor, and one of them is picked at random.} \item{random_limit}{Integer specifying the maximum number of items allowed when all possible orderings are computed, i.e., when \code{random=TRUE}. Defaults to \code{8L}.} + +\item{timepoint}{Integer vector specifying the timepoint. Defaults to \code{NULL}, +which means that a vector of ones, one for each observation, is generated. +Used by \code{\link[=update_mallows]{update_mallows()}} to identify data with a given iteration of the +sequential Monte Carlo algorithm. If not \code{NULL}, must contain one integer +for each row in \code{rankings}.} } \value{ An object of class \code{"BayesMallowsData"}, to be provided in the \code{data} diff --git a/man/update_mallows.Rd b/man/update_mallows.Rd index e0c358e9..c62285c8 100644 --- a/man/update_mallows.Rd +++ b/man/update_mallows.Rd @@ -2,12 +2,24 @@ % Please edit documentation in R/update_mallows.R \name{update_mallows} \alias{update_mallows} +\alias{update_mallows.BayesMallowsPriorSamples} \alias{update_mallows.BayesMallows} \alias{update_mallows.SMCMallows} \title{Update a Bayesian Mallows model with new users} \usage{ update_mallows(model, new_data, ...) +\method{update_mallows}{BayesMallowsPriorSamples}( + model, + new_data, + model_options = set_model_options(), + smc_options = set_smc_options(), + compute_options = set_compute_options(), + priors = model$priors, + pfun_estimate = NULL, + ... +) + \method{update_mallows}{BayesMallows}( model, new_data, @@ -22,8 +34,9 @@ update_mallows(model, new_data, ...) } \arguments{ \item{model}{A model object of class "BayesMallows" returned from -\code{\link[=compute_mallows]{compute_mallows()}} or an object of class "SMCMallows" returned from this -function.} +\code{\link[=compute_mallows]{compute_mallows()}}, an object of class "SMCMallows" returned from this +function, or an object of class "BayesMallowsPriorSamples" returned from +\code{\link[=sample_prior]{sample_prior()}}.} \item{new_data}{An object of class "BayesMallowsData" returned from \code{\link[=setup_rank_data]{setup_rank_data()}}. The object should contain the new data being provided.} @@ -41,6 +54,12 @@ returned from \code{\link[=set_compute_options]{set_compute_options()}}.} \item{priors}{An object of class "BayesMallowsPriors" returned from \code{\link[=set_priors]{set_priors()}}. Defaults to the priors used in \code{model}.} + +\item{pfun_estimate}{Object returned from \code{\link[=estimate_partition_function]{estimate_partition_function()}}. +Defaults to \code{NULL}, and will only be used for footrule, Spearman, or +Ulam distances when the cardinalities are not available, cf. +\code{\link[=get_cardinalities]{get_cardinalities()}}. Only used by the specialization for objects of type +"BayesMallowsPriorSamples".} } \value{ An updated model, of class "SMCMallows". @@ -147,7 +166,8 @@ plot(mod_final) } \seealso{ Other modeling: +\code{\link{compute_mallows}()}, \code{\link{compute_mallows_mixtures}()}, -\code{\link{compute_mallows}()} +\code{\link{sample_prior}()} } \concept{modeling} diff --git a/src/classes.h b/src/classes.h index 6addb4dd..1d06f743 100644 --- a/src/classes.h +++ b/src/classes.h @@ -22,10 +22,11 @@ struct Data { struct Priors { Priors( const Rcpp::List& priors - ) : lambda { priors["lambda"] }, kappa(priors["kappa"]), - psi { priors["psi"] } {} + ) : gamma { priors["gamma"] }, lambda { priors["lambda"] }, + kappa(priors["kappa"]), psi { priors["psi"] } {} ~Priors() = default; + const double gamma; const double lambda; const arma::ivec kappa; const unsigned int psi; diff --git a/src/distances.cpp b/src/distances.cpp index 05febe3c..211ba675 100644 --- a/src/distances.cpp +++ b/src/distances.cpp @@ -24,3 +24,109 @@ arma::vec get_rank_distance(arma::mat rankings, arma::vec rho, auto distfun = choose_distance_function(metric); return distfun->matdist(rankings, rho); } + +double CayleyDistance::d(const arma::vec& r1, const arma::vec& r2) { + double distance{}; + arma::vec tmp2 = r1; + + for(size_t i{}; i < r1.n_elem; ++i){ + if(tmp2(i) != r2(i)) { + distance += 1; + double tmp1 = tmp2(i); + tmp2(i) = r2(i); + arma::uvec inds = find(tmp2 == r2(i)); + tmp2.elem(inds).fill(tmp1); + } + } + return distance; +} + +arma::vec Distance::matdist(const arma::mat& r1, const arma::vec& r2) { + arma::vec result(r1.n_cols); + for(size_t i{}; i < r1.n_cols; i++) { + const arma::vec& v1 = r1.col(i); + result(i) = d(v1, r2); + } + return result; +} + +arma::vec Distance::scalardist(const arma::vec& r1, const double r2) { + arma::vec v2 = arma::ones(r1.size()) * r2; + arma::vec result = arma::zeros(r1.size()); + for(size_t i{}; i < r1.n_elem; i++) { + result(i) = d(arma::vec{r1(i)}, arma::vec{v2(i)}); + } + return result; +} + +void Distance::update_leap_and_shift_indices(arma::uvec& indices, int n_items) { + return; +}; + +double CayleyDistance::d( + const arma::vec& r1, const arma::vec& r2, const arma::uvec& inds) { + return d(r1, r2); +} + +void CayleyDistance::update_leap_and_shift_indices( + arma::uvec& indices, int n_items) { + indices = arma::regspace(0, n_items - 1); +} + +double FootruleDistance::d(const arma::vec& r1, const arma::vec& r2) { + return arma::norm(r1 - r2, 1); +} + +double FootruleDistance::d( + const arma::vec& r1, const arma::vec& r2, const arma::uvec& inds) { + return d(r1(inds), r2(inds)); +} + +double HammingDistance::d(const arma::vec& r1, const arma::vec& r2) { + return arma::sum(r1 != r2); +} + +double HammingDistance::d( + const arma::vec& r1, const arma::vec& r2, const arma::uvec& inds) { + return d(r1(inds), r2(inds)); +} + +double KendallDistance::d(const arma::vec& r1, const arma::vec& r2) { + double distance{}; + for(size_t i{}; i < r1.n_elem; ++i){ + for(size_t j{}; j < i; ++j){ + if (((r1(j) > r1(i)) && (r2(j) < r2(i)) ) || + ((r1(j) < r1(i)) && (r2(j) > r2(i)))) { + distance += 1; + } + } + } + return distance; +} + +double KendallDistance::d(const arma::vec& r1, const arma::vec& r2, const arma::uvec& inds) { + return d(r1, r2); +} + +double SpearmanDistance::d(const arma::vec& r1, const arma::vec& r2) { + return std::pow(arma::norm(r1 - r2, 2), 2); +} + +double SpearmanDistance::d(const arma::vec& r1, const arma::vec& r2, const arma::uvec& inds) { + return d(r1(inds), r2(inds)); +} + +double UlamDistance::d(const arma::vec& r1, const arma::vec& r2) { + arma::ivec a = arma::conv_to::from(r1) - 1; + arma::ivec b = arma::conv_to::from(r2) - 1; + auto distance = perm0_distance ( a, b ); + return static_cast(distance); +} + +double UlamDistance::d(const arma::vec& r1, const arma::vec& r2, const arma::uvec& inds) { + return d(r1, r2); +} + +void UlamDistance::update_leap_and_shift_indices(arma::uvec& indices, int n_items) { + indices = arma::regspace(0, n_items - 1); +} diff --git a/src/distances.h b/src/distances.h index 1c25c4a0..b4913fa7 100644 --- a/src/distances.h +++ b/src/distances.h @@ -12,119 +12,41 @@ struct Distance { virtual double d(const arma::vec& r1, const arma::vec& r2) = 0; virtual double d(const arma::vec& r1, const arma::vec& r2, const arma::uvec& inds) = 0; - arma::vec matdist(const arma::mat& r1, const arma::vec& r2) { - arma::vec result(r1.n_cols); - for(size_t i{}; i < r1.n_cols; i++) { - const arma::vec& v1 = r1.col(i); - result(i) = d(v1, r2); - } - return result; - } - arma::vec scalardist(const arma::vec& r1, const double r2) { - arma::vec v2 = arma::ones(r1.size()) * r2; - arma::vec result = arma::zeros(r1.size()); - for(size_t i{}; i < r1.n_elem; i++) { - result(i) = d(arma::vec{r1(i)}, arma::vec{v2(i)}); - } - return result; - } - virtual void update_leap_and_shift_indices(arma::uvec& indices, int n_items) = 0; + arma::vec matdist(const arma::mat& r1, const arma::vec& r2); + arma::vec scalardist(const arma::vec& r1, const double r2); + virtual void update_leap_and_shift_indices(arma::uvec& indices, int n_items); }; std::unique_ptr choose_distance_function(std::string metric); struct CayleyDistance : Distance { - double d(const arma::vec& r1, const arma::vec& r2) override { - double distance{}; - arma::vec tmp2 = r1; - - for(size_t i{}; i < r1.n_elem; ++i){ - if(tmp2(i) != r2(i)) { - distance += 1; - double tmp1 = tmp2(i); - tmp2(i) = r2(i); - arma::uvec inds = find(tmp2 == r2(i)); - tmp2.elem(inds).fill(tmp1); - } - } - return distance; - } - double d(const arma::vec& r1, const arma::vec& r2, const arma::uvec& inds) override { - return d(r1, r2); - } - void update_leap_and_shift_indices(arma::uvec& indices, int n_items) override { - indices = arma::regspace(0, n_items - 1); - } + double d(const arma::vec& r1, const arma::vec& r2) override; + double d(const arma::vec& r1, const arma::vec& r2, const arma::uvec& inds) override; + void update_leap_and_shift_indices(arma::uvec& indices, int n_items) override; }; struct FootruleDistance : Distance { - double d(const arma::vec& r1, const arma::vec& r2) override { - return arma::norm(r1 - r2, 1); - } - double d(const arma::vec& r1, const arma::vec& r2, const arma::uvec& inds) override { - return arma::norm(r1(inds) - r2(inds), 1); - } - void update_leap_and_shift_indices(arma::uvec& indices, int n_items) override { - return; - } + double d(const arma::vec& r1, const arma::vec& r2) override; + double d(const arma::vec& r1, const arma::vec& r2, const arma::uvec& inds) override; }; struct HammingDistance : Distance { - double d(const arma::vec& r1, const arma::vec& r2) override { - return arma::sum(r1 != r2); - } - double d(const arma::vec& r1, const arma::vec& r2, const arma::uvec& inds) override { - return arma::sum(r1(inds) != r2(inds)); - } - void update_leap_and_shift_indices(arma::uvec& indices, int n_items) override { - return; - } + double d(const arma::vec& r1, const arma::vec& r2) override; + double d(const arma::vec& r1, const arma::vec& r2, const arma::uvec& inds) override; }; struct KendallDistance : Distance { - double d(const arma::vec& r1, const arma::vec& r2) override { - double distance{}; - for(size_t i{}; i < r1.n_elem; ++i){ - for(size_t j{}; j < i; ++j){ - if (((r1(j) > r1(i)) && (r2(j) < r2(i)) ) || - ((r1(j) < r1(i)) && (r2(j) > r2(i)))) { - distance += 1; - } - } - } - return distance; - } - double d(const arma::vec& r1, const arma::vec& r2, const arma::uvec& inds) override { - return d(r1, r2); - } - void update_leap_and_shift_indices(arma::uvec& indices, int n_items) override { - return; - } + double d(const arma::vec& r1, const arma::vec& r2) override; + double d(const arma::vec& r1, const arma::vec& r2, const arma::uvec& inds) override; }; struct SpearmanDistance : Distance { - double d(const arma::vec& r1, const arma::vec& r2) override { - return std::pow(arma::norm(r1 - r2, 2), 2); - } - double d(const arma::vec& r1, const arma::vec& r2, const arma::uvec& inds) override { - return std::pow(arma::norm(r1(inds) - r2(inds), 2), 2); - } - void update_leap_and_shift_indices(arma::uvec& indices, int n_items) override { - return; - } + double d(const arma::vec& r1, const arma::vec& r2) override; + double d(const arma::vec& r1, const arma::vec& r2, const arma::uvec& inds) override; }; struct UlamDistance : Distance { - double d(const arma::vec& r1, const arma::vec& r2) override { - arma::ivec a = arma::conv_to::from(r1) - 1; - arma::ivec b = arma::conv_to::from(r2) - 1; - auto distance = perm0_distance ( a, b ); - return static_cast(distance); - } - double d(const arma::vec& r1, const arma::vec& r2, const arma::uvec& inds) override { - return d(r1, r2); - } - void update_leap_and_shift_indices(arma::uvec& indices, int n_items) override { - indices = arma::regspace(0, n_items - 1); - } + double d(const arma::vec& r1, const arma::vec& r2) override; + double d(const arma::vec& r1, const arma::vec& r2, const arma::uvec& inds) override; + void update_leap_and_shift_indices(arma::uvec& indices, int n_items) override; }; diff --git a/src/leapandshift.cpp b/src/leapandshift.cpp index 08398639..72c640a5 100644 --- a/src/leapandshift.cpp +++ b/src/leapandshift.cpp @@ -5,7 +5,7 @@ using namespace arma; // [[Rcpp::depends(RcppArmadillo)]] void shift_step(vec& rho_proposal, const vec& rho, - const int& u, uvec& indices){ + int u, uvec& indices){ // Shift step: double delta_r = rho_proposal(u) - rho(u); indices = zeros(std::abs(delta_r) + 1); @@ -49,7 +49,8 @@ void leap_and_shift(vec& rho_proposal, uvec& indices, // Picked element index-1 from the support set rho_proposal(u) = support(index); - double support_new = std::min(rho_proposal(u) - 1, leap_size * 1.0) + std::min(n_items - rho_proposal(u), leap_size * 1.0); + double support_new = std::min(rho_proposal(u) - 1, leap_size * 1.0) + + std::min(n_items - rho_proposal(u), leap_size * 1.0); if(std::abs(rho_proposal(u) - rho(u)) == 1){ prob_forward = 1.0 / (n_items * support.n_elem) + 1.0 / (n_items * support_new); prob_backward = prob_forward; diff --git a/src/leapandshift.h b/src/leapandshift.h index c38f47dd..a453556b 100644 --- a/src/leapandshift.h +++ b/src/leapandshift.h @@ -7,4 +7,4 @@ void leap_and_shift(arma::vec& rho_proposal, arma::uvec& indices, const arma::vec& rho, int leap_size, const std::unique_ptr& distfun); void shift_step(arma::vec& rho_proposal, const arma::vec& rho, - const int& u, arma::uvec& indices); + int u, arma::uvec& indices); diff --git a/src/missing_data.h b/src/missing_data.h index d7ba8ec0..1a36c757 100644 --- a/src/missing_data.h +++ b/src/missing_data.h @@ -5,7 +5,7 @@ arma::vec make_new_augmentation( const arma::vec& rankings, const arma::uvec& missing_indicator, - const double& alpha, + double alpha, const arma::vec& rho, const std::unique_ptr& distfun, const std::unique_ptr& pseudo_aug_distance, @@ -21,7 +21,7 @@ struct RankProposal{ RankProposal() {}; RankProposal( const arma::vec& rankings, - const double& probability, + double probability, const arma::uvec& mutated_items) : rankings { rankings }, probability { probability }, mutated_items { mutated_items } {} @@ -39,7 +39,7 @@ RankProposal make_uniform_proposal( RankProposal make_pseudo_proposal( arma::vec ranks, const arma::uvec& indicator, - const double& alpha, + double alpha, const arma::vec& rho, const std::unique_ptr& distfun ) noexcept; diff --git a/src/missing_data_functions.cpp b/src/missing_data_functions.cpp index e0785c68..ed076a65 100644 --- a/src/missing_data_functions.cpp +++ b/src/missing_data_functions.cpp @@ -45,7 +45,7 @@ mat initialize_missing_ranks(mat rankings, const umat& missing_indicator) { } vec make_new_augmentation(const vec& rankings, const uvec& missing_indicator, - const double& alpha, const vec& rho, + double alpha, const vec& rho, const std::unique_ptr& distfun, const std::unique_ptr& pseudo_aug_distance, double& log_aug_prob) { @@ -85,7 +85,7 @@ RankProposal make_uniform_proposal(const vec& ranks, const uvec& indicator) noex } RankProposal make_pseudo_proposal( - vec ranks, const uvec& indicator, const double& alpha, const vec& rho, + vec ranks, const uvec& indicator, double alpha, const vec& rho, const std::unique_ptr& distfun ) noexcept { uvec missing_inds = find(indicator == 1); diff --git a/src/pairwise_comparison_functions.cpp b/src/pairwise_comparison_functions.cpp index 3538dac9..5de59df3 100644 --- a/src/pairwise_comparison_functions.cpp +++ b/src/pairwise_comparison_functions.cpp @@ -7,18 +7,19 @@ using namespace arma; // [[Rcpp::depends(RcppArmadillo)]] -void find_pairwise_limits(int& left_limit, int& right_limit, const int& item, - const uvec& items_above_item, - const uvec& items_below_item, - const vec& current_ranking) { +int find_lower_limit(int item, const uvec& items_above_item, const vec& current_ranking) { if(items_above_item.size() > 0) { - vec rankings_above = current_ranking.elem(items_above_item - 1); - left_limit = max(rankings_above); + return max(current_ranking.elem(items_above_item - 1)) + 1; + } else { + return 1; } +} +int find_upper_limit(int item, const uvec& items_below_item, const vec& current_ranking) { if(items_below_item.size() > 0) { - vec rankings_below = current_ranking.elem(items_below_item - 1); - right_limit = min(rankings_below); + return min(current_ranking.elem(items_below_item - 1)) - 1; + } else { + return current_ranking.size(); } } @@ -30,19 +31,13 @@ vec propose_pairwise_augmentation( ivec a = Rcpp::sample(n_items, 1) - 1; int item = a(0); - // Left and right limits of the interval we draw ranks from - // Correspond to l_j and r_j, respectively, in Vitelli et al. (2018), JMLR, Sec. 4.2. - int left_limit = 0, right_limit = n_items + 1; - find_pairwise_limits(left_limit, right_limit, item, items_above[item], - items_below[item], ranking); + int lower_limit = find_lower_limit(item, items_above[item], ranking); + int upper_limit = find_upper_limit(item, items_below[item], ranking); - // Now complete the leap step by sampling a new proposal uniformly between - // left_limit + 1 and right_limit - 1 - Rcpp::IntegerVector b = Rcpp::seq(left_limit + 1, right_limit - 1); + Rcpp::IntegerVector b = Rcpp::seq(lower_limit, upper_limit); ivec d = Rcpp::sample(b, 1); int proposed_rank = d(0); - // Assign the proposal to the (item-1)th item vec proposal = ranking; proposal(item) = proposed_rank; @@ -58,7 +53,7 @@ vec propose_swap( const vec& ranking, const doubly_nested& items_above, const doubly_nested& items_below, - int& g_diff, const int& swap_leap) { + int& g_diff, int swap_leap) { int n_items = ranking.n_elem; ivec l = Rcpp::sample(swap_leap, 1); ivec a = Rcpp::sample(n_items - l(0), 1); diff --git a/src/pairwise_comparisons.h b/src/pairwise_comparisons.h index 8d7a4fcb..bd6fa55e 100644 --- a/src/pairwise_comparisons.h +++ b/src/pairwise_comparisons.h @@ -12,4 +12,4 @@ arma::vec propose_swap( const doubly_nested& items_above, const doubly_nested& items_below, int& g_diff, - const int& swap_leap); + int swap_leap); diff --git a/src/parallel_utils.h b/src/parallel_utils.h new file mode 100644 index 00000000..7d87b246 --- /dev/null +++ b/src/parallel_utils.h @@ -0,0 +1,15 @@ +#pragma once +#include + +#ifdef __TBB_H +#include +#endif + +template +void par_for_each(Iterator first, Iterator last, Function func) { +#ifdef __TBB_H + tbb::parallel_for_each(first, last, func); +#else + std::for_each(first, last, func); +#endif +} diff --git a/src/particles.cpp b/src/particles.cpp index cf3c87b2..3023cc8a 100644 --- a/src/particles.cpp +++ b/src/particles.cpp @@ -56,6 +56,9 @@ std::vector initialize_particles( vec alpha_samples(initial_values["alpha_init"]); mat rho_samples(initial_values["rho_init"]); Rcpp::Nullable aug_init(initial_values["aug_init"]); + if(rho_samples.n_rows != dat.n_items) { + Rcpp::stop("Wrong format for initial values for rho."); + } std::vector pvec; pvec.reserve(n_particles); diff --git a/src/partition_functions.cpp b/src/partition_functions.cpp index 157c5442..26a3e816 100644 --- a/src/partition_functions.cpp +++ b/src/partition_functions.cpp @@ -38,3 +38,102 @@ double get_partition_function( n_items, metric, pfun_values, R_NilValue); return pfun->logz(alpha); } + +Cayley::Cayley(int n_items) : n_items { n_items } {} + +double Cayley::logz(double alpha) { + double res{}; + for(int i{1}; i < n_items; ++i){ + res += std::log(1.0 + i * std::exp(- alpha / n_items)); + } + return res; +} + +double Cayley::expected_distance(double alpha) { + arma::vec idx = arma::regspace(1, n_items - 1); + return arma::sum(idx / (idx + std::exp(alpha / n_items))); +} + +Hamming::Hamming(int n_items) : n_items { n_items } {} + +double Hamming::logz(double alpha) { + double res{}; + for(int i{}; i < (n_items + 1); ++i){ + res += tgamma(n_items + 1.0) * std::exp(-alpha) * + std::pow((std::exp(alpha / n_items) - 1.0), i) / tgamma(i + 1.0); + } + return std::log(res); +} + +double Hamming::expected_distance(double alpha) { + arma::vec idx1 = arma::regspace(0, n_items - 1); + arma::vec idx2 = arma::regspace(0, n_items); + + return n_items - std::exp(alpha / n_items) * + arma::sum( + arma::pow(std::exp(alpha / n_items) - arma::ones(idx1.size()), idx1) / + arma::tgamma(idx1 + 1) + ) / + arma::sum( + arma::pow(std::exp(alpha / n_items) - arma::ones(idx2.size()), idx2) / + arma::tgamma(idx2 + 1)); +} + +Kendall::Kendall(int n_items) : n_items { n_items } {} + +double Kendall::logz(double alpha) { + double res{}; + for(int i{1}; i < (n_items + 1); ++i){ + res += std::log((1.0 - std::exp(-i * alpha / n_items)) / + (1.0 - std::exp(-alpha / n_items))); + } + return res; +} + +double Kendall::expected_distance(double alpha) { + arma::vec idx = arma::regspace(1, n_items); + if(alpha > 0) { + return n_items * std::exp(-alpha / n_items) / + (1 - std::exp(-alpha / n_items)) - + arma::sum((idx % arma::exp(-idx * alpha / n_items)) / + (1 - arma::exp(-idx * alpha / n_items))); + } else if(alpha == 0) { + return n_items * (n_items - 1) / 4; + } else { + Rcpp::stop("alpha must be non-negative."); + } +} + +Cardinal::Cardinal( + int n_items, + const arma::mat& pfun_values) : + n_items { n_items }, + distances { pfun_values.col(0) }, + cardinalities { pfun_values.col(1) }{} + +double Cardinal::logz(double alpha) { + return std::log(arma::sum(cardinalities % exp(-alpha / n_items * distances))); +} + +double Cardinal::expected_distance(double alpha) { + return arma::sum(distances % cardinalities % exp(-alpha * distances / n_items)) + * std::exp(-logz(alpha)); +} + +Estimated::Estimated( + int n_items, + const arma::mat& pfun_estimate) : + n_items { n_items }, + power { pfun_estimate.col(0) }, + coefficients { pfun_estimate.col(1) } {} + +double Estimated::logz(double alpha) { + return arma::sum( + arma::pow(alpha + arma::zeros(coefficients.size()), power) % coefficients + ); +} + +double Estimated::expected_distance(double alpha) { + Rcpp::stop( + "Expected distance not available with estimated partition function."); +} diff --git a/src/partition_functions.h b/src/partition_functions.h index 55430a1e..b69befd9 100644 --- a/src/partition_functions.h +++ b/src/partition_functions.h @@ -16,117 +16,39 @@ std::unique_ptr choose_partition_function( const Rcpp::Nullable& pfun_estimate); struct Cayley : PartitionFunction { - Cayley(int n_items) : n_items { n_items } {} - - double logz(double alpha) override { - double res{}; - for(int i{1}; i < n_items; ++i){ - res += std::log(1.0 + i * std::exp(- alpha / n_items)); - } - return res; - } - double expected_distance(double alpha) override { - arma::vec idx = arma::regspace(1, n_items - 1); - return arma::sum(idx / (idx + std::exp(alpha / n_items))); - } + Cayley(int n_items); + double logz(double alpha) override; + double expected_distance(double alpha) override; const int n_items; }; struct Hamming : PartitionFunction { - Hamming(int n_items) : n_items { n_items } {} - - double logz(double alpha) override { - double res{}; - for(int i{}; i < (n_items + 1); ++i){ - res += tgamma(n_items + 1.0) * std::exp(-alpha) * - std::pow((std::exp(alpha / n_items) - 1.0), i) / tgamma(i + 1.0); - } - return std::log(res); - } - double expected_distance(double alpha) override { - arma::vec idx1 = arma::regspace(0, n_items - 1); - arma::vec idx2 = arma::regspace(0, n_items); - - return n_items - std::exp(alpha / n_items) * - arma::sum( - arma::pow(std::exp(alpha / n_items) - arma::ones(idx1.size()), idx1) / - arma::tgamma(idx1 + 1) - ) / - arma::sum( - arma::pow(std::exp(alpha / n_items) - arma::ones(idx2.size()), idx2) / - arma::tgamma(idx2 + 1)); - } - + Hamming(int n_items); + double logz(double alpha) override; + double expected_distance(double alpha) override; const int n_items; }; struct Kendall : PartitionFunction { - Kendall(int n_items) : n_items { n_items } {} - - double logz(double alpha) override { - double res{}; - for(int i{1}; i < (n_items + 1); ++i){ - res += std::log((1.0 - std::exp(-i * alpha / n_items)) / - (1.0 - std::exp(-alpha / n_items))); - } - return res; - } - double expected_distance(double alpha) override { - arma::vec idx = arma::regspace(1, n_items); - if(alpha > 0) { - return n_items * std::exp(-alpha / n_items) / - (1 - std::exp(-alpha / n_items)) - - arma::sum((idx % arma::exp(-idx * alpha / n_items)) / - (1 - arma::exp(-idx * alpha / n_items))); - } else if(alpha == 0) { - return n_items * (n_items - 1) / 4; - } else { - Rcpp::stop("alpha must be non-negative."); - } - } - + Kendall(int n_items); + double logz(double alpha) override; + double expected_distance(double alpha) override; const int n_items; }; struct Cardinal : PartitionFunction { - Cardinal( - int n_items, - const arma::mat& pfun_values) : - n_items { n_items }, - distances { pfun_values.col(0) }, - cardinalities { pfun_values.col(1) }{} - - double logz(double alpha) override { - return std::log(arma::sum(cardinalities % exp(-alpha / n_items * distances))); - } - double expected_distance(double alpha) override { - return arma::sum(distances % cardinalities % exp(-alpha * distances / n_items)) - * std::exp(-logz(alpha)); - } - + Cardinal(int n_items, const arma::mat& pfun_values); + double logz(double alpha) override; + double expected_distance(double alpha) override; const int n_items; const arma::vec distances; const arma::vec cardinalities; }; struct Estimated : PartitionFunction { - Estimated( - int n_items, - const arma::mat& pfun_estimate) : - n_items { n_items }, - power { pfun_estimate.col(0) }, - coefficients { pfun_estimate.col(1) } {} - - double logz(double alpha) override { - return arma::sum( - arma::pow(alpha + arma::zeros(coefficients.size()), power) % coefficients - ); - } - double expected_distance(double alpha) override { - Rcpp::stop( - "Expected distance not available with estimated partition function."); - } - + Estimated(int n_items, const arma::mat& pfun_estimate); + double logz(double alpha) override; + double expected_distance(double alpha) override; const int n_items; const arma::vec power; const arma::vec coefficients; diff --git a/src/proposal_functions.cpp b/src/proposal_functions.cpp index 5d0c2424..e8216ef1 100644 --- a/src/proposal_functions.cpp +++ b/src/proposal_functions.cpp @@ -4,14 +4,14 @@ using namespace arma; AlphaRatio make_new_alpha( - const double& alpha_old, + double alpha_old, const vec& rho_old, - const double& alpha_prop_sd, + double alpha_prop_sd, const std::unique_ptr& distfun, const std::unique_ptr& pfun, const arma::mat& rankings, const arma::vec& observation_frequency, - const double& n_items, + double n_items, const Priors& priors) { double alpha_proposal = R::rlnorm(std::log(alpha_old), alpha_prop_sd); @@ -23,7 +23,7 @@ AlphaRatio make_new_alpha( priors.lambda * alpha_diff + sum(observation_frequency) * (pfun->logz(alpha_old) - pfun->logz(alpha_proposal)) + - std::log(alpha_proposal) - std::log(alpha_old); + priors.gamma * (std::log(alpha_proposal) - std::log(alpha_old)); return AlphaRatio{alpha_proposal, ratio > std::log(R::unif_rand())}; } diff --git a/src/proposal_functions.h b/src/proposal_functions.h index 72b6705d..0627fc7c 100644 --- a/src/proposal_functions.h +++ b/src/proposal_functions.h @@ -10,14 +10,14 @@ struct AlphaRatio{ }; AlphaRatio make_new_alpha( - const double& alpha_old, + double alpha_old, const arma::vec& rho_old, - const double& alpha_prop_sd, + double alpha_prop_sd, const std::unique_ptr& distfun, const std::unique_ptr& pfun, const arma::mat& rankings, const arma::vec& observation_frequency, - const double& n_items, + double n_items, const Priors& priors); arma::vec make_new_rho( diff --git a/src/resampler.cpp b/src/resampler.cpp new file mode 100644 index 00000000..846defc1 --- /dev/null +++ b/src/resampler.cpp @@ -0,0 +1,86 @@ +#include +#include "resampler.h" + +arma::ivec count_to_index(const arma::vec& counts) { + arma::ivec indices(arma::sum(counts)); + + int idx = 0; + for (arma::uword i = 0; i < counts.n_elem; ++i) { + for (int j = 0; j < counts(i); ++j) { + indices(idx++) = i; + } + } + return indices; +} + +arma::ivec digitize(const arma::vec& bins, const arma::vec& values) { + arma::ivec indices(values.n_elem); + + for (arma::uword i = 0; i < values.n_elem; ++i) { + double val = values(i); + arma::uword index = 0; + while (index < bins.n_elem && val >= bins(index)) { + ++index; + } + indices(i) = index; + } + + return indices; +} + +arma::ivec stratsys(arma::vec probs, bool stratified) { + size_t N = probs.size(); + arma::vec u(N); + Rcpp::NumericVector rn; + if(stratified) { + rn = Rcpp::runif(N, 0, 1); + } else { + rn = Rcpp::NumericVector(N, R::runif(0, 1)); + } + + for(size_t i{}; i < N; i++) u(i) = (i + rn(i)) / N; + return digitize(arma::cumsum(probs), u); +} + +arma::ivec Multinomial::resample(arma::vec probs) { + return Rcpp::sample( + probs.size(), probs.size(), true, + Rcpp::as(Rcpp::wrap(probs)), false); +} + +arma::ivec Residual::resample(arma::vec probs) { + int N = probs.size(); + arma::vec counts = arma::floor(probs * N); + int R = N - arma::sum(counts); + arma::ivec result = count_to_index(counts); + + arma::vec residual_weights = probs - counts / N; + residual_weights = residual_weights / arma::sum(residual_weights); + arma::ivec part2 = Rcpp::sample( + N, R, true, + Rcpp::as(Rcpp::wrap(residual_weights)), false); + + return arma::join_cols(result, part2); +} + +arma::ivec Stratified::resample(arma::vec probs) { + return stratsys(probs, true); +} + +arma::ivec Systematic::resample(arma::vec probs) { + return stratsys(probs, false); +} + +std::unique_ptr choose_resampler(std::string resampler) { + if(resampler == "multinomial") { + return std::make_unique(); + } else if(resampler == "residual") { + return std::make_unique(); + } else if(resampler == "stratified") { + return std::make_unique(); + } else if(resampler == "systematic") { + return std::make_unique(); + } else { + Rcpp::stop("Unknown resampler."); + } +} diff --git a/src/resampler.h b/src/resampler.h new file mode 100644 index 00000000..6160d96c --- /dev/null +++ b/src/resampler.h @@ -0,0 +1,26 @@ +#pragma once +#include + +struct Resampler { + Resampler() {}; + virtual ~Resampler() = default; + virtual arma::ivec resample(arma::vec probs) = 0; +}; + +struct Multinomial : Resampler { + arma::ivec resample(arma::vec probs) override; +}; + +struct Residual : Resampler { + arma::ivec resample(arma::vec probs) override; +}; + +struct Stratified : Resampler { + arma::ivec resample(arma::vec probs) override; +}; + +struct Systematic : Resampler { + arma::ivec resample(arma::vec probs) override; +}; + +std::unique_ptr choose_resampler(std::string resampler); diff --git a/src/run_smc.cpp b/src/run_smc.cpp index 3835d7ba..dec3a35e 100644 --- a/src/run_smc.cpp +++ b/src/run_smc.cpp @@ -1,4 +1,6 @@ #include + +#include "parallel_utils.h" #include "smc_classes.h" #include "particles.h" @@ -21,7 +23,7 @@ Rcpp::List run_smc( SMCData dat{data, new_data}; SMCParameters pars{smc_options, compute_options}; Priors pris{priors}; - SMCAugmentation aug{dat, compute_options}; + SMCAugmentation aug{dat, compute_options, smc_options}; std::string metric = model_options["metric"]; auto pfun = choose_partition_function( dat.n_items, metric, pfun_values, pfun_estimate); @@ -30,21 +32,17 @@ Rcpp::List run_smc( aug.reweight(pvec, dat, pfun, distfun); pars.resample(pvec); - std::for_each( - pvec.begin(), pvec.end(), [ - pars = std::move(pars), dat = std::move(dat), - distfun = std::move(distfun), - pfun = std::move(pfun), - pris = std::move(pris), - aug = std::move(aug) - ](Particle& p) mutable { - for (size_t kk{}; kk < pars.mcmc_steps; ++kk) { - dat.update_data(p); - pars.update_rho(p, dat, distfun); - pars.update_alpha(p, dat, pfun, distfun, pris); - aug.update_missing_ranks(p, dat, distfun); - } - } + par_for_each( + pvec.begin(), pvec.end(), + [&pars, &dat, &pris, &aug, distfun = std::ref(distfun), + pfun = std::ref(pfun)] + (Particle& p) { + for(size_t i{}; i < pars.mcmc_steps; i++) { + pars.update_rho(p, dat, distfun); + pars.update_alpha(p, dat, pfun, distfun, pris); + aug.update_missing_ranks(p, dat, distfun); + } + } ); Rcpp::List particle_history = Rcpp::List::create( diff --git a/src/smc_augmentation_class.cpp b/src/smc_augmentation_class.cpp index ac6a99ef..61318964 100644 --- a/src/smc_augmentation_class.cpp +++ b/src/smc_augmentation_class.cpp @@ -1,29 +1,35 @@ +#include "parallel_utils.h" #include "smc_classes.h" #include "missing_data.h" using namespace arma; SMCAugmentation::SMCAugmentation( - SMCData& dat, - const Rcpp::List& compute_options) : + const SMCData& dat, + const Rcpp::List& compute_options, + const Rcpp::List& smc_options + ) : missing_indicator { set_up_missing(dat) }, aug_method(compute_options["aug_method"]), - pseudo_aug_metric(compute_options["pseudo_aug_metric"]) {} + pseudo_aug_metric(compute_options["pseudo_aug_metric"]), + lag_helper { Rcpp::as(smc_options["latent_sampling_lag"]) }, + latent_sampling_lag { + Rcpp::IntegerVector::is_na(lag_helper[0]) ? max(dat.timepoint) : lag_helper[0] } {} void SMCAugmentation::reweight( std::vector& pvec, const SMCData& dat, const std::unique_ptr& pfun, const std::unique_ptr& distfun -) { +) const { if(dat.any_missing) { - std::for_each( + par_for_each( pvec.begin(), pvec.end(), [distfun = &distfun](Particle& p){ p.previous_distance = distfun->get()->matdist(p.augmented_data, p.rho); }); augment_partial(pvec, dat); } - std::for_each( + par_for_each( pvec.begin(), pvec.end(), [n_assessors = dat.n_assessors, num_new_obs = dat.num_new_obs, any_missing = dat.any_missing, distfun = &distfun, pfun = &pfun, @@ -68,8 +74,8 @@ void SMCAugmentation::reweight( void SMCAugmentation::augment_partial( std::vector& pvec, const SMCData& dat -){ - std::for_each( +) const { + par_for_each( pvec.begin(), pvec.end(), [n_assessors = dat.n_assessors, num_new_obs = dat.num_new_obs, aug_method = aug_method, pseudo_aug_metric = pseudo_aug_metric, @@ -106,12 +112,13 @@ void SMCAugmentation::augment_partial( void SMCAugmentation::update_missing_ranks( Particle& p, const SMCData& dat, - const std::unique_ptr& distfun) { + const std::unique_ptr& distfun) const { if(!dat.any_missing) return; auto pseudo_aug_distance = aug_method == "uniform" ? nullptr : choose_distance_function(pseudo_aug_metric); - for (unsigned int jj{}; jj < dat.n_assessors; ++jj) { + uvec indices_to_loop = find(max(dat.timepoint) - dat.timepoint < latent_sampling_lag); + for (auto jj : indices_to_loop) { p.augmented_data.col(jj) = make_new_augmentation( p.augmented_data.col(jj), missing_indicator.col(jj), p.alpha, diff --git a/src/smc_classes.h b/src/smc_classes.h index 4fa96bfd..14d0040d 100644 --- a/src/smc_classes.h +++ b/src/smc_classes.h @@ -1,6 +1,7 @@ #pragma once #include "classes.h" +#include "resampler.h" struct Particle { Particle(double alpha, const arma::vec& rho, const arma::mat& augmented_data, @@ -21,6 +22,7 @@ struct SMCData : Data { void update_data(const Particle& p); arma::mat new_rankings; const unsigned int num_new_obs; + const arma::uvec timepoint; }; struct SMCParameters { @@ -33,44 +35,50 @@ struct SMCParameters { const SMCData& dat, const std::unique_ptr& pfun, const std::unique_ptr& distfun, - const Priors& priors); + const Priors& priors) const; void update_rho( Particle& p, const SMCData& dat, const std::unique_ptr& distfun - ); + ) const; - Rcpp::IntegerVector draw_resampling_index(const std::vector& pvec); - void resample(std::vector& pvec); + arma::ivec draw_resampling_index( + const std::vector& pvec) const; + void resample(std::vector& pvec) const; const unsigned int mcmc_steps; private: const double alpha_prop_sd; const unsigned int leap_size; + const std::unique_ptr resampler; }; struct SMCAugmentation { - SMCAugmentation(SMCData& dat, const Rcpp::List& compute_options); + SMCAugmentation( + const SMCData& dat, + const Rcpp::List& compute_options, + const Rcpp::List& smc_options + ); ~SMCAugmentation() = default; void reweight( std::vector& pvec, const SMCData& dat, const std::unique_ptr& pfun, - const std::unique_ptr& distfun); + const std::unique_ptr& distfun) const; - void augment_partial( - std::vector& pvec, - const SMCData& dat); + void augment_partial(std::vector& pvec, const SMCData& dat) const; void update_missing_ranks(Particle& p, const SMCData& dat, - const std::unique_ptr& distfun); + const std::unique_ptr& distfun) const; - void resample(const arma::uvec& index, const SMCData& dat); const arma::umat missing_indicator; - const std::string aug_method; const std::string pseudo_aug_metric; + +private: + const Rcpp::IntegerVector lag_helper; + const unsigned int latent_sampling_lag; }; diff --git a/src/smc_data_class.cpp b/src/smc_data_class.cpp index 7b0546fd..77c97ef9 100644 --- a/src/smc_data_class.cpp +++ b/src/smc_data_class.cpp @@ -6,7 +6,8 @@ SMCData::SMCData( const Rcpp::List& new_data ) : Data(data), new_rankings { Rcpp::as(new_data["rankings"]).t() }, -num_new_obs { new_rankings.n_cols } {} +num_new_obs { new_rankings.n_cols }, +timepoint { Rcpp::as(data["timepoint"]) } {} void SMCData::update_data(const Particle& p) { if(!any_missing) return; diff --git a/src/smc_parameters_class.cpp b/src/smc_parameters_class.cpp index e06978dc..9db6263e 100644 --- a/src/smc_parameters_class.cpp +++ b/src/smc_parameters_class.cpp @@ -8,20 +8,20 @@ SMCParameters::SMCParameters( ) : mcmc_steps { smc_options["mcmc_steps"] }, alpha_prop_sd { compute_options["alpha_prop_sd"] }, - leap_size { compute_options["leap_size"] } {} + leap_size { compute_options["leap_size"] }, + resampler { choose_resampler(std::string(smc_options["resampler"])) }{} void SMCParameters::update_alpha( Particle& p, const SMCData& dat, const std::unique_ptr& pfun, const std::unique_ptr& distfun, - const Priors& priors) { + const Priors& priors) const { AlphaRatio test = make_new_alpha( - p.alpha, p.rho, - alpha_prop_sd, distfun, pfun, - dat.rankings, dat.observation_frequency, - dat.n_items, priors + p.alpha, p.rho, alpha_prop_sd, distfun, pfun, + dat.any_missing ? p.augmented_data : dat.rankings, + dat.observation_frequency, dat.n_items, priors ); if(test.accept) p.alpha = test.proposal; } @@ -29,27 +29,28 @@ void SMCParameters::update_alpha( void SMCParameters::update_rho( Particle& p, const SMCData& dat, - const std::unique_ptr& distfun) { + const std::unique_ptr& distfun) const { p.rho = make_new_rho( - p.rho, dat.rankings, p.alpha, leap_size, distfun, + p.rho, dat.any_missing ? p.augmented_data : dat.rankings, + p.alpha, leap_size, distfun, dat.observation_frequency); } -Rcpp::IntegerVector SMCParameters::draw_resampling_index( +arma::ivec SMCParameters::draw_resampling_index( const std::vector& pvec -) { - Rcpp::NumericVector log_inc_wgt(pvec.size()); +) const { + arma::vec log_inc_wgt(pvec.size()); std::transform(pvec.cbegin(), pvec.cend(), log_inc_wgt.begin(), [](const Particle& p){ return p.log_inc_wgt; }); - Rcpp::NumericVector probs = exp(log_inc_wgt - max(log_inc_wgt) - + arma::vec probs = exp(log_inc_wgt - max(log_inc_wgt) - log(sum(exp(log_inc_wgt - max(log_inc_wgt))))); - return Rcpp::sample(log_inc_wgt.size(), log_inc_wgt.size(), true, probs, false); + return resampler->resample(probs); } -void SMCParameters::resample(std::vector& pvec) { - Rcpp::IntegerVector index = draw_resampling_index(pvec); +void SMCParameters::resample(std::vector& pvec) const { + arma::ivec index = draw_resampling_index(pvec); std::vector pvec_old = pvec; for(size_t i{}; i < pvec.size(); i++) pvec[i] = pvec_old[index[i]]; } diff --git a/tests/testthat/test-compute_consensus.R b/tests/testthat/test-compute_consensus.R index 928c7942..8c856e36 100644 --- a/tests/testthat/test-compute_consensus.R +++ b/tests/testthat/test-compute_consensus.R @@ -83,4 +83,22 @@ test_that("compute_consensus.SMCMallows works", { expect_equal(length(unique(a1$probability)), 1) a2 <- compute_consensus(mod_final, type = "CP") expect_equal(dim(a2), c(20, 4)) + + mod <- sample_prior( + 1000, ncol(sushi_rankings), + priors = set_priors(gamma = 2, lambda = .1) + ) + for (i in seq_len(30)) { + mod <- update_mallows( + model = mod, + new_data = setup_rank_data(sushi_rankings[i, , drop = FALSE]) + ) + } + expect_equal( + compute_consensus(mod)$item, + c( + "fatty tuna", "shrimp", "salmon roe", "squid", "sea urchin", + "tuna", "tuna roll", "sea eel", "egg", "cucumber roll" + ) + ) }) diff --git a/tests/testthat/test-compute_posterior_intervals.R b/tests/testthat/test-compute_posterior_intervals.R index 8408bde8..6df70dc8 100644 --- a/tests/testthat/test-compute_posterior_intervals.R +++ b/tests/testthat/test-compute_posterior_intervals.R @@ -71,9 +71,9 @@ test_that("compute_posterior_intervals works for SMC", { pi <- compute_posterior_intervals(mod2) expect_equal(pi$parameter, "alpha") - expect_equal(pi$median, "2.744") - expect_equal(pi$hpdi, "[2.015,3.476]") - expect_equal(pi$central_interval, "[2.047,3.530]") + expect_equal(pi$median, "2.745") + expect_equal(pi$hpdi, "[2.009,3.517]") + expect_equal(pi$central_interval, "[2.016,3.519]") mod3 <- update_mallows( mod2, @@ -81,7 +81,7 @@ test_that("compute_posterior_intervals works for SMC", { ) pi <- compute_posterior_intervals(mod3, decimals = 2) - expect_equal(pi$hpdi, "[2.18,3.75]") + expect_equal(pi$hpdi, "[2.23,3.74]") pi <- compute_posterior_intervals(mod3, parameter = "rho") expect_equal(pi$hpdi[[20]], "[1,5]") diff --git a/tests/testthat/test-heat_plot.R b/tests/testthat/test-heat_plot.R index 55053fcc..ae47b94e 100644 --- a/tests/testthat/test-heat_plot.R +++ b/tests/testthat/test-heat_plot.R @@ -2,9 +2,18 @@ test_that("heat_plot works", { set.seed(1) model_fit <- compute_mallows( setup_rank_data(potato_visual), - compute_options = set_compute_options(nmc = 2) + compute_options = set_compute_options(nmc = 20, burnin = 3) ) expect_s3_class(heat_plot(model_fit, burnin = 0, type = "MAP"), "ggplot") + obj <- heat_plot(model_fit) + expect_equal(sum(obj$data$probability), 20) + + model_fit <- compute_mallows( + setup_rank_data(potato_visual), + compute_options = set_compute_options(nmc = 20, burnin = 3, rho_thinning = 3) + ) + obj <- heat_plot(model_fit) + expect_equal(sum(obj$data$probability), 20) set.seed(1) model_fit <- compute_mallows( diff --git a/tests/testthat/test-mcmc_correctness.R b/tests/testthat/test-mcmc_correctness.R index 1bb50596..79c2ae1b 100644 --- a/tests/testthat/test-mcmc_correctness.R +++ b/tests/testthat/test-mcmc_correctness.R @@ -112,7 +112,7 @@ test_that("compute_mallows is correct for top-k ranks", { expect_equal( sd(mod_bmm$alpha$value[mod_bmm$alpha$iteration > 1000]), expectations$sd[[i]], - tolerance = .2 + tolerance = 1e-4 ) } }) diff --git a/tests/testthat/test-plot.R b/tests/testthat/test-plot.R index b0cf628b..ede312c3 100644 --- a/tests/testthat/test-plot.R +++ b/tests/testthat/test-plot.R @@ -18,7 +18,7 @@ test_that("plot.SMCMallows works", { expect_s3_class(p, "ggplot") expect_equal(dim(p$data), c(10, 4)) p <- plot(mod_second, parameter = "rho", items = 1:4) - expect_equal(dim(p$data), c(16, 5)) + expect_equal(dim(p$data), c(19, 5)) expect_message( p <- plot(mod_second, parameter = "rho"), "Items not provided by user. Picking 5 at random." @@ -35,7 +35,7 @@ test_that("plot.SMCMallows works", { p <- plot(mod_final, parameter = "rho", items = c("P19", "P8")) expect_s3_class(p, "ggplot") - expect_equal(dim(p$data), c(7, 5)) + expect_equal(dim(p$data), c(11, 5)) expect_equal(as.character(unique(p$data$item)), c("P8", "P19")) expect_error( diff --git a/tests/testthat/test-resampling.R b/tests/testthat/test-resampling.R new file mode 100644 index 00000000..6513fbaa --- /dev/null +++ b/tests/testthat/test-resampling.R @@ -0,0 +1,22 @@ +test_that("resampling options work", { + set.seed(1) + mod0 <- sample_prior(1000, 20) + + for (r in c("stratified", "systematic", "residual", "multinomial")) { + mod1 <- update_mallows( + model = mod0, + new_data = setup_rank_data(potato_weighing[1:3, ]), + smc_options = set_smc_options(n_particles = 20, resampler = r) + ) + expect_s3_class(mod1, "SMCMallows") + } + + expect_error( + update_mallows( + model = mod0, + new_data = setup_rank_data(potato_weighing[1:3, ]), + smc_options = set_smc_options(n_particles = 20, resampler = "gaussian") + ), + "'arg' should be one of" + ) +}) diff --git a/tests/testthat/test-sample_prior.R b/tests/testthat/test-sample_prior.R new file mode 100644 index 00000000..e05eac96 --- /dev/null +++ b/tests/testthat/test-sample_prior.R @@ -0,0 +1,27 @@ +test_that("sample_prior works", { + set.seed(1) + mm <- sample_prior(3, 10) + expect_s3_class(mm, "BayesMallowsPriorSamples") + expect_equal(mm$n_items, 10) + expect_equal(mm$items, 1:10) + expect_equal(dim(mm$rho), c(10, 3)) + expect_equal(length(mm$alpha), 3) + expect_equal(mm$alpha[[2]], 1882.40159634668) + expect_equal(mm$rho[2, 2], 5) +}) + +test_that("scale and shape matter", { + set.seed(1) + mm1 <- sample_prior(100, 20) + mm2 <- sample_prior(100, 20, priors = set_priors(gamma = 2, lambda = .1)) + + expect_lt(mean(mm2$alpha), mean(mm1$alpha)) +}) + +test_that("scale and shape matter", { + set.seed(1) + mm1 <- sample_prior(100, 20) + mm2 <- sample_prior(100, 20, priors = set_priors(gamma = 2, lambda = .1)) + + expect_lt(mean(mm2$alpha), mean(mm1$alpha)) +}) diff --git a/tests/testthat/test-set_smc_options.R b/tests/testthat/test-set_smc_options.R new file mode 100644 index 00000000..7fa2ce91 --- /dev/null +++ b/tests/testthat/test-set_smc_options.R @@ -0,0 +1,34 @@ +test_that("set_smc_options works", { + expect_error( + set_smc_options(n_particles = 3.4), + "n_particles must be a positive integer" + ) + expect_error( + set_smc_options(n_particles = -1), + "n_particles must be a positive integer" + ) + expect_error( + set_smc_options(mcmc_steps = 3.4), + "mcmc_steps must be a positive integer" + ) + expect_error( + set_smc_options(mcmc_steps = -1), + "mcmc_steps must be a positive integer" + ) + expect_error( + set_smc_options(latent_sampling_lag = -1), + "latent_sampling_lag must be a positive integer" + ) + + val <- set_smc_options(n_particles = 35, mcmc_steps = 2) + expect_equal(val$n_particles, 35) + expect_equal(val$mcmc_steps, 2) + expect_equal(val$latent_sampling_lag, NA_integer_) + expect_equal(val$resampler, "stratified") + + val <- set_smc_options(latent_sampling_lag = 3, resampler = "residual") + expect_equal(val$n_particles, 1000) + expect_equal(val$mcmc_steps, 5) + expect_equal(val$latent_sampling_lag, 3) + expect_equal(val$resampler, "residual") +}) diff --git a/tests/testthat/test-setup_rank_data.R b/tests/testthat/test-setup_rank_data.R index cd8ec36e..a7c777f9 100644 --- a/tests/testthat/test-setup_rank_data.R +++ b/tests/testthat/test-setup_rank_data.R @@ -45,6 +45,14 @@ test_that("setup_rank_data works with rankings", { dat <- setup_rank_data(rr, validate_rankings = FALSE) expect_equal(dat$rankings, rr) + + input1 <- potato_weighing[2, , drop = FALSE] + rownames(input1) <- NULL + input2 <- potato_weighing[2, ] + expect_equal( + setup_rank_data(input1), + setup_rank_data(input2) + ) }) test_that("setup_rank_data works for preferences", { diff --git a/tests/testthat/test-smc_update_correctness.R b/tests/testthat/test-smc_update_correctness.R index 8c76aee8..1331dd6a 100644 --- a/tests/testthat/test-smc_update_correctness.R +++ b/tests/testthat/test-smc_update_correctness.R @@ -17,8 +17,7 @@ test_that("update_mallows is correct for new rankings", { mod_smc <- update_mallows( model = mod_init, - new_data = setup_rank_data(rankings = triple_potato[5:20, ]), - smc_options = set_smc_options(n_particles = 10000, mcmc_steps = 15) + new_data = setup_rank_data(rankings = triple_potato[5:20, ]) ) mod_smc_next <- update_mallows( @@ -74,12 +73,12 @@ test_that("update_mallows is correct for new rankings", { # Posterior mean of alpha should be the same in both SMC methods, and close to BMM expect_equal(mean(mod_smc_next$alpha$value), mean(mod_bmm$alpha$value[mod_bmm$alpha$iteration > 200]), - tolerance = 0.02 + tolerance = 0.01 ) expect_equal(sd(mod_smc_next$alpha$value), sd(mod_bmm$alpha$value[mod_bmm$alpha$iteration > 200]), - tolerance = 0.05 + tolerance = 0.04 ) bmm_consensus <- compute_consensus(mod_bmm) @@ -91,7 +90,7 @@ test_that("update_mallows is correct for new rankings", { as.numeric(as.factor(smc_consensus$item)), metric = "ulam" ), - 2 + 0 ) }) @@ -123,7 +122,7 @@ test_that("update_mallows is correct for new partial rankings", { mod_smc <- update_mallows( model = mod_init, new_data = setup_rank_data(rankings = dat[5:20, ]), - smc_options = set_smc_options(n_particles = 10000, mcmc_steps = 10), + smc_options = set_smc_options(n_particles = 2000), compute_options = set_compute_options(aug_method = aug) ) @@ -135,13 +134,13 @@ test_that("update_mallows is correct for new partial rankings", { expect_equal( mean(mod_smc_next$alpha$value), mean(bmm_mod$alpha$value[bmm_mod$alpha$iteration > 1000]), - tolerance = .1 + tolerance = .01 ) expect_equal( sd(mod_smc_next$alpha$value), sd(bmm_mod$alpha$value[bmm_mod$alpha$iteration > 1000]), - tolerance = .1 + tolerance = ifelse(aug == "uniform", .2, .1) ) bmm_consensus <- compute_consensus(bmm_mod) @@ -165,7 +164,7 @@ test_that("update_mallows is correct for updated partial rankings", { mod0 <- compute_mallows( data = setup_rank_data(rankings = dat0), - compute_options = set_compute_options(nmc = 5000, burnin = 2500) + compute_options = set_compute_options(nmc = 20000, burnin = 2500) ) dat1 <- potato_visual @@ -188,7 +187,7 @@ test_that("update_mallows is correct for updated partial rankings", { expect_equal( mean(mod1$alpha$value), mean(mod_bmm1$alpha$value[mod_bmm1$alpha$iteration > 4000]), - tolerance = .1 + tolerance = .05 ) dat2 <- potato_visual @@ -209,13 +208,6 @@ test_that("update_mallows is correct for updated partial rankings", { mean(mod_bmm$alpha$value[mod_bmm$alpha$iteration > 5000]), tolerance = .1 ) - - skip_on_os("mac", arch = "aarch64") - expect_equal( - sd(mod2$alpha$value), - sd(mod_bmm$alpha$value[mod_bmm$alpha$iteration > 5000]), - tolerance = .2 - ) }) test_that("update_mallows does not suffer from numerical overflow", { @@ -245,3 +237,49 @@ test_that("update_mallows does not suffer from numerical overflow", { expect_s3_class(mod2, "SMCMallows") }) + +test_that("update_mallows works for data one at a time", { + n <- 50 + set.seed(2) + mod_bmm <- compute_mallows(data = setup_rank_data(sushi_rankings[1:n, ])) + + mod <- sample_prior(1000, ncol(sushi_rankings)) + for (i in seq_len(n)) { + mod <- update_mallows( + model = mod, + new_data = setup_rank_data(sushi_rankings[i, ]) + ) + } + expect_equal( + mean(mod$alpha_samples), + mean(mod_bmm$alpha_samples[-(1:500)]), + tolerance = .01 + ) + expect_equal( + sd(mod$alpha_samples), + sd(mod_bmm$alpha_samples[-(1:500)]), + tolerance = .04 + ) + + dat <- sushi_rankings[sample(nrow(sushi_rankings), n), ] + dat[dat > 6] <- NA + mod_bmm <- compute_mallows(data = setup_rank_data(dat)) + + mod <- sample_prior(1000, ncol(dat)) + for (i in seq_len(n)) { + mod <- update_mallows( + model = mod, + new_data = setup_rank_data(dat[i, , drop = FALSE]) + ) + } + expect_equal( + mean(mod$alpha_samples), + mean(mod_bmm$alpha_samples[-(1:500)]), + tolerance = .02 + ) + expect_equal( + sd(mod$alpha_samples), + sd(mod_bmm$alpha_samples[-(1:500)]), + tolerance = .1 + ) +}) diff --git a/tests/testthat/test-update_mallows.R b/tests/testthat/test-update_mallows.R index e0e5e56a..e6022cf0 100644 --- a/tests/testthat/test-update_mallows.R +++ b/tests/testthat/test-update_mallows.R @@ -16,15 +16,15 @@ test_that("update_mallows works", { ) pi <- compute_posterior_intervals(mod_second) - expect_equal(pi$hpdi, "[0.331,0.771]") - expect_equal(mod_second$alpha$value[[9]], 0.528698473629568) + expect_equal(pi$hpdi, "[0.423,0.753]") + expect_equal(mod_second$alpha$value[[9]], 0.753246865159393) data_third_batch <- potato_visual[9:12, ] mod_final <- update_mallows( model = mod_second, new_data = setup_rank_data(rankings = data_third_batch) ) - expect_equal(mod_final$rho$value[169], 17) + expect_equal(mod_final$rho$value[169], 18) # Need this because random numbers don't reproduce exactly on M1 skip_on_os("mac", arch = "aarch64") @@ -52,7 +52,7 @@ test_that("update_mallows works", { compute_options = set_compute_options(aug_method = "pseudo") ) - expect_equal(mod1$alpha$value[[13]], 0.544567207683911) + expect_equal(mod1$alpha$value[[13]], 0.388038251055491) mod2 <- update_mallows( model = mod1, @@ -69,9 +69,21 @@ test_that("update_mallows works", { ) expect_s3_class(mod_final, "SMCMallows") - skip_on_cran() - expect_equal(mod2$rho$value[[300]], 2) - expect_equal(mod_final$rho$value[[300]], 3) +}) + +test_that("update_mallows can start from prior", { + set.seed(1) + prior_samples <- sample_prior(100, 20, set_priors(gamma = 2, lambda = .1)) + mod1 <- update_mallows( + prior_samples, + new_data = setup_rank_data(potato_visual[1, , drop = FALSE]), + smc_options = set_smc_options(n_particles = 100) + ) + mod2 <- update_mallows( + mod1, + new_data = setup_rank_data(potato_visual[2, , drop = FALSE]) + ) + expect_equal(mod2$alpha_samples[[56]], 3.51628380350389) }) test_that("update_mallows handles estimated partition function", { diff --git a/vignettes/SMC-Mallows.Rmd b/vignettes/SMC-Mallows.Rmd index 897985dc..9f5aeb92 100644 --- a/vignettes/SMC-Mallows.Rmd +++ b/vignettes/SMC-Mallows.Rmd @@ -175,9 +175,9 @@ rbind( ) #> parameter mean median hpdi central_interval #> 1 alpha 1.768 1.766 [1.603,1.917] [1.604,1.922] -#> 2 alpha 1.777 1.776 [1.603,1.916] [1.617,1.930] -#> 3 alpha 1.753 1.753 [1.678,1.830] [1.675,1.828] -#> 4 alpha 1.706 1.707 [1.660,1.746] [1.663,1.749] +#> 2 alpha 1.777 1.773 [1.630,1.935] [1.620,1.931] +#> 3 alpha 1.753 1.756 [1.676,1.827] [1.677,1.827] +#> 4 alpha 1.712 1.714 [1.667,1.748] [1.669,1.752] ``` As an assurance that the implementation is correct, we can compare the final model to what we get by running `compute_mallows` on the complete dataset: @@ -223,7 +223,7 @@ rbind( ) #> parameter mean median hpdi central_interval #> 1 alpha 1.691 1.690 [1.648,1.734] [1.643,1.732] -#> 2 alpha 1.706 1.707 [1.660,1.746] [1.663,1.749] +#> 2 alpha 1.712 1.714 [1.667,1.748] [1.669,1.752] ``` The cumulative probability consensus is also in good agrement: @@ -233,11 +233,11 @@ The cumulative probability consensus is also in good agrement: compute_consensus(model4) #> cluster ranking item cumprob #> 1 Cluster 1 1 fatty tuna 1.000 -#> 2 Cluster 1 2 salmon roe 0.998 +#> 2 Cluster 1 2 salmon roe 1.000 #> 3 Cluster 1 3 tuna 1.000 #> 4 Cluster 1 4 shrimp 1.000 #> 5 Cluster 1 5 sea eel 1.000 -#> 6 Cluster 1 6 tuna roll 0.893 +#> 6 Cluster 1 6 tuna roll 0.835 #> 7 Cluster 1 7 squid 1.000 #> 8 Cluster 1 8 sea urchin 1.000 #> 9 Cluster 1 9 egg 1.000 @@ -493,7 +493,7 @@ Before proceeding, it is instructive to study why this situation needs special c ```r (v1 <- model2$augmented_rankings[, 1, 1]) -#> [1] 2 7 9 3 4 1 5 6 8 10 +#> [1] 2 9 6 3 4 1 5 10 8 7 ``` Next, we show the data provided by user 1 in `data_batch3`: @@ -511,7 +511,7 @@ By comparing the non-missing ranks, we can check if they are consistent or not: (v2b <- v2a[!is.na(v2a)]) #> [1] 2 8 3 4 1 5 7 6 v1[v1 %in% v2b] -#> [1] 2 7 3 4 1 5 6 8 +#> [1] 2 6 3 4 1 5 8 7 all(v1[v1 %in% v2b] == v2b) #> [1] FALSE ``` diff --git a/vignettes/smc_complete_model2-1.png b/vignettes/smc_complete_model2-1.png index 81473778..677c4a26 100644 Binary files a/vignettes/smc_complete_model2-1.png and b/vignettes/smc_complete_model2-1.png differ diff --git a/vignettes/smc_complete_model2_alpha-1.png b/vignettes/smc_complete_model2_alpha-1.png index 3a20b612..e98b511f 100644 Binary files a/vignettes/smc_complete_model2_alpha-1.png and b/vignettes/smc_complete_model2_alpha-1.png differ diff --git a/vignettes/smc_complete_model3-1.png b/vignettes/smc_complete_model3-1.png index dd715646..fe5eb05c 100644 Binary files a/vignettes/smc_complete_model3-1.png and b/vignettes/smc_complete_model3-1.png differ diff --git a/vignettes/smc_complete_model4_alpha-1.png b/vignettes/smc_complete_model4_alpha-1.png index 184c4950..02cef19b 100644 Binary files a/vignettes/smc_complete_model4_alpha-1.png and b/vignettes/smc_complete_model4_alpha-1.png differ diff --git a/vignettes/smc_complete_model4_rho-1.png b/vignettes/smc_complete_model4_rho-1.png index f2322b64..36af1568 100644 Binary files a/vignettes/smc_complete_model4_rho-1.png and b/vignettes/smc_complete_model4_rho-1.png differ diff --git a/vignettes/smc_second_updated_posterior_partial-1.png b/vignettes/smc_second_updated_posterior_partial-1.png index 4c4a9403..086243e1 100644 Binary files a/vignettes/smc_second_updated_posterior_partial-1.png and b/vignettes/smc_second_updated_posterior_partial-1.png differ diff --git a/vignettes/smc_updated_posterior_partial-1.png b/vignettes/smc_updated_posterior_partial-1.png index 43898be1..a08a5a21 100644 Binary files a/vignettes/smc_updated_posterior_partial-1.png and b/vignettes/smc_updated_posterior_partial-1.png differ diff --git a/vignettes/sushi_updated_batch2_posterior-1.png b/vignettes/sushi_updated_batch2_posterior-1.png index df0ea631..55a2d042 100644 Binary files a/vignettes/sushi_updated_batch2_posterior-1.png and b/vignettes/sushi_updated_batch2_posterior-1.png differ diff --git a/vignettes/sushi_updated_batch3_posterior-1.png b/vignettes/sushi_updated_batch3_posterior-1.png index 7dd9e344..a23a0ffe 100644 Binary files a/vignettes/sushi_updated_batch3_posterior-1.png and b/vignettes/sushi_updated_batch3_posterior-1.png differ diff --git a/vignettes/sushi_updated_batch4_posterior-1.png b/vignettes/sushi_updated_batch4_posterior-1.png index 81b75067..a7c084fc 100644 Binary files a/vignettes/sushi_updated_batch4_posterior-1.png and b/vignettes/sushi_updated_batch4_posterior-1.png differ diff --git a/vignettes/sushi_updated_bmm_posterior-1.png b/vignettes/sushi_updated_bmm_posterior-1.png index 9e58d283..c0ddaf22 100644 Binary files a/vignettes/sushi_updated_bmm_posterior-1.png and b/vignettes/sushi_updated_bmm_posterior-1.png differ diff --git a/vignettes/sushi_updated_burnin-1.png b/vignettes/sushi_updated_burnin-1.png index 167eb308..fe993aba 100644 Binary files a/vignettes/sushi_updated_burnin-1.png and b/vignettes/sushi_updated_burnin-1.png differ diff --git a/work-docs/lag-issue-369.R b/work-docs/lag-issue-369.R new file mode 100644 index 00000000..5ff9c92b --- /dev/null +++ b/work-docs/lag-issue-369.R @@ -0,0 +1,35 @@ +rm(list=ls()) +library(patchwork) +devtools::load_all() +dat <- potato_visual +dat[dat < 15] <- NA + +mod0 <- compute_mallows( + data = setup_rank_data(dat[1:3, ]), + compute_options = set_compute_options(nmc = 10000, burnin = 100) + ) + +timepoint <- 1 +mod1 <- mod0 +for(i in 4:12) { + mod1 <- update_mallows( + model = mod1, + new_data = setup_rank_data(rankings = dat[i, ]), + smc_options = set_smc_options(n_particles = 2000) + ) + timepoint <- timepoint + 1 +} + + +timepoint <- 1 +mod2 <- mod0 +for(i in 4:12) { + mod2 <- update_mallows( + model = mod2, + new_data = setup_rank_data(rankings = dat[i, ], timepoint = timepoint), + smc_options = set_smc_options(n_particles = 2000, latent_sampling_lag = 1) + ) + timepoint <- timepoint + 1 +} + +plot(mod1) + plot(mod2) diff --git a/work-docs/leapshift.cpp b/work-docs/leapshift.cpp new file mode 100644 index 00000000..ba0abd7d --- /dev/null +++ b/work-docs/leapshift.cpp @@ -0,0 +1,146 @@ +#include + +using namespace arma; + +// [[Rcpp::depends(RcppArmadillo)]] + +void shift_step(vec& rho_proposal, const vec& rho, + int u){ + // Shift step: + double delta_r = rho_proposal(u) - rho(u); + uvec indices = zeros(std::abs(delta_r) + 1); + indices[0] = u; + + if(delta_r > 0){ + for(int k = 1; k <= delta_r; ++k){ + int index = as_scalar(find(rho == rho(u) + k)); + rho_proposal(index) -= 1; + indices[k] = index; + } + } else if(delta_r < 0) { + for(int k = (-1); k >= delta_r; --k){ + int index = as_scalar(find(rho == rho(u) + k)); + rho_proposal(index) += 1; + indices[-(k)] = index; + } + } +} + +// [[Rcpp::export]] +Rcpp::List leap_and_shift(arma::vec rho, int leap_size){ + vec rho_proposal = rho; + int n_items = rho.n_elem; + + // Leap 1 + // 1, sample u randomly between 1 and n_items + ivec a = Rcpp::sample(n_items, 1) - 1; + int u = a(0); + + // 2, compute the set S for sampling the new rank + vec support = join_cols(regspace(std::max(1.0, rho(u) - leap_size), 1, rho(u) - 1), + regspace(rho(u) + 1, 1, std::min(n_items * 1.0, rho(u) + leap_size))); + + // 3. assign a random element of the support set, this completes the leap step + ivec b = Rcpp::sample(support.n_elem, 1) - 1; + int index = b(0); + // Picked element index-1 from the support set + rho_proposal(u) = support(index); + + vec support_new = join_cols(regspace(std::max(1.0, rho_proposal(u) - leap_size), 1, rho_proposal(u) - 1), + regspace(rho_proposal(u) + 1, 1, std::min(n_items * 1.0, rho_proposal(u) + leap_size))); + + // double support_new = std::min(rho_proposal(u) - 1, leap_size * 1.0) + + // std::min(n_items - rho_proposal(u), leap_size * 1.0); + + double prob_forward{}; + double prob_backward{}; + if(std::abs(rho_proposal(u) - rho(u)) == 1){ + prob_forward = 1.0 / (n_items * support.n_elem) + 1.0 / (n_items * support_new.size()); + prob_backward = prob_forward; + } else { + prob_forward = 1.0 / (n_items * support.n_elem); + prob_backward = 1.0 / (n_items * support_new.size()); + } + shift_step(rho_proposal, rho, u); + return Rcpp::List::create( + Rcpp::Named("rho_proposal") = rho_proposal, + Rcpp::Named("prob_forward") = prob_forward, + Rcpp::Named("prob_backward") = prob_backward, + Rcpp::Named("u") = u, + Rcpp::Named("r") = index + ); +} + + +/*** R +library(tidyverse) +set.seed(1) +n_items <- 6 +nmc <- 3000 +leap_size <- 3 +rho <- seq_len(n_items) +res <- map_dfr(seq_len(nmc), function(i) { + ls <- leap_and_shift(rho, leap_size) + tibble( + distance = sum(abs(ls$rho_proposal - rho)), + rho_proposal = paste(ls$rho_proposal, collapse = ","), + prob_forward = ls$prob_forward, + prob_backward = ls$prob_backward, + u = ls$u, + r = ls$r + ) +}) + +res %>% + mutate(ur = paste(u, r, sep = ",")) %>% + group_by(rho_proposal, prob_forward, distance) %>% + summarise(n_distinct(ur)) %>% View() + +res %>% + group_by(rho_proposal, distance, prob_backward, prob_forward, u, r) %>% + summarise(pct = n() / nmc) %>% + View() + +res %>% + filter(rho_proposal == "1,5,4,3,2,6") %>% + summarise( + prob_forward = mean(prob_forward), + prob_backward = mean(prob_backward), + empirical = n() / nmc + ) + +# verify that backward probability is correct +rho_backward <- c(1, 5, 4, 3, 2, 6) +res_backward <- map_dfr(seq_len(nmc), function(i) { + ls <- leap_and_shift(rho_backward, leap_size) + tibble( + rho_proposal = paste(ls$rho_proposal, collapse = ","), + prob_forward = ls$prob_forward, + prob_backward = ls$prob_backward + ) +}) + +res_backward %>% + filter(rho_proposal == paste(rho, collapse = ",")) %>% + summarise( + prob_forward = mean(prob_forward), + prob_backward = mean(prob_backward), + empirical = n() / nmc + ) + +dd <- res %>% + group_by(rho_proposal) %>% + summarise( + proportion = n() / nmc, + mean_prob_forward = mean(prob_forward), + .groups = "drop" + ) + +ggplot(dd, aes(x = proportion, y = mean_prob_forward)) + + geom_point() + + geom_abline() + +ggplot(res, aes(x = prob_forward, y = prob_backward)) + + geom_point() + + geom_abline() +*/ diff --git a/work-docs/reproduce-meta-analysis.R b/work-docs/reproduce-meta-analysis.R new file mode 100644 index 00000000..30134956 --- /dev/null +++ b/work-docs/reproduce-meta-analysis.R @@ -0,0 +1,75 @@ +library(BayesMallows) +library(RankAggreg) +library(parallel) +data("geneLists") +set.seed(2233) + +genes <- sort(unique(as.character(geneLists))) +n_items <- length(genes) +alpha_vector <- seq(from = 0, to = 5, by = .1) + + +# In JMLR the partition function was estimate for alpha between 0 and 40, +# but since the posterior mean was 0.56, the mode 0.43 and the 95% HPDI was +# (0.04, 1.29), I estimated it instead between 0 and 5. + +iterations <- c(1e2, 1e3, 1e4) +estimates <- vector("list", 3) +fits <- vector("list", 3) + +cl <- makeCluster(12) + +for(i in seq_along(iterations)) { + fit <- estimate_partition_function( + method = "importance_sampling", + alpha_vector = alpha_vector, + n_items = n_items, + metric = "footrule", + n_iterations = iterations[[i]], + cl = cl + ) + fits[[i]] <- fit + estimates[[i]] <- + vapply(alpha_vector, function(a) sum(a^fit[, 1] * fit[, 2]), numeric(1)) +} + +stopCluster(cl) + +plot(x = alpha_vector, + y = estimates[[1]], + type = "l", xlab = expression(alpha), ylab = expression(Z(alpha))) + +for(i in seq(from = 2, to = length(iterations))) { + lines(x = alpha_vector, y = estimates[[i]], col = i) +} + +# Create rank data +dat <- matrix(nrow = nrow(geneLists), ncol = n_items) +colnames(dat) <- genes + +for(i in seq_len(nrow(dat))) { + ordering <- match(geneLists[i, ], genes) + dat[i, ordering] <- seq_along(ordering) +} + +cl <- makeCluster(10) +mod <- compute_mallows( + data = setup_rank_data(rankings = dat), + model_options = set_model_options(metric = "footrule"), + compute_options = set_compute_options( + nmc = 1020000, burnin = 20000, alpha_prop_sd = .5, leap_size = 15, + alpha_jump = 100, aug_thinning = 100, rho_thinning = 100), + priors = set_priors(lambda = .1), + pfun_estimate = fits[[3]], + cl = cl +) +stopCluster(cl) + +assess_convergence(mod) +assess_convergence(mod, parameter = "rho", items = c(34, 4, 50)) + +plot(mod) +compute_posterior_intervals(mod) + +compute_consensus(mod, type = "CP") +heat_plot(mod) diff --git a/work-docs/reproducing_arsia.pdf b/work-docs/reproducing_arsia.pdf new file mode 100644 index 00000000..56936532 Binary files /dev/null and b/work-docs/reproducing_arsia.pdf differ diff --git a/work-docs/reproducing_arsia.qmd b/work-docs/reproducing_arsia.qmd new file mode 100644 index 00000000..efa91500 --- /dev/null +++ b/work-docs/reproducing_arsia.qmd @@ -0,0 +1,89 @@ +--- +title: "Reproducing top-k experiment from ARSIA paper" +format: pdf +--- + +```{r} +library(BayesMallows) +library(tidyverse) +library(patchwork) +library(furrr) +library(extraDistr) +sessionInfo() +``` + +## Simulations + +```{r} +plan(multisession) +models <- future_map(1:10, function(i) { + k <- rtpois(12, lambda = 7, a = 1, b = 17) + dat <- potato_visual + colnames(dat) <- LETTERS[seq_len(ncol(dat))] + for(i in seq_along(k)) { + dat[i, ][dat[i, ] > k[[i]]] <- NA + } + compute_mallows( + data = setup_rank_data(rankings = dat), + compute_options = set_compute_options( + nmc = 5e5, burnin = 1e4, alpha_jump = 10, rho_thinning = 10, + aug_thinning = 10), + priors = set_priors(lambda = .1) + ) +}, .options = furrr_options(seed = 123L)) +plan("default") +``` + +## Trace plots + +The dashed black lines shows the burn-in. This looks like it converges well in the $\alpha$. + +```{r} +map(models, function(m) { + assess_convergence(m) + + geom_vline(xintercept = 1e4, linetype = "dashed") + + theme_classic() + + theme(legend.position = "none", axis.title.x = element_blank(), + axis.title.y = element_blank()) +}) %>% + wrap_plots() +``` + +Potato 8 is ranked as the 20th by everyone except assessor 8, who ranks it 17th. However, this does not mean that potato 8 has to be ranked last, since in each simulated dataset there are many other potatoes that may also never be ranked, depending on the $k_{j}$. Anyhow, these trace plots look reasonable to me. + +```{r} +map(models, function(m) { + assess_convergence(m, parameter = "rho", items = 8) + + geom_vline(xintercept = 1e4, linetype = "dashed") + + theme_classic() + + ylim(0, 20) + + theme(legend.position = "none", axis.title.x = element_blank(), + axis.title.y = element_blank()) +}) %>% + wrap_plots() +``` + + +## Heat plots + +I'll show the heat plots one-by-one. They all look reasonable to me. We cannot hope to exactly reproduce ARSIA, since Figure 3a in the paper shows the result of just one random simulation. Note that the color scale differs between the papers. + +```{r} +for(i in seq_along(models)) { + print(heat_plot(models[[i]])) +} +``` + + +## Mean Squared Error + +Again I cannot exactly reproduce the random number seeds etc used in the ARSIA paper, but the order of magnitude here is very close to Figure 3c. + +```{r} +mse <- map_dbl(models, function(m) { + mean((potato_true_ranking - create_ranking(match(compute_consensus(m)$item, LETTERS)))^2) +}) + +plot(mse, xlab = "run", ylab = "MSE", ylim = c(0, max(mse))) +``` + diff --git a/work-docs/resampling-methods.cpp b/work-docs/resampling-methods.cpp new file mode 100644 index 00000000..6e169d7c --- /dev/null +++ b/work-docs/resampling-methods.cpp @@ -0,0 +1,105 @@ +#include + +// [[Rcpp::depends(RcppArmadillo)]] + +arma::ivec count_to_index(const arma::vec& counts) { + arma::ivec indices(arma::sum(counts)); + + int idx = 0; + for (arma::uword i = 0; i < counts.n_elem; ++i) { + for (int j = 0; j < counts(i); ++j) { + indices(idx++) = i; + } + } + return indices; +} + +arma::ivec digitize(const arma::vec& bins, const arma::vec& values) { + arma::ivec indices(values.n_elem); + + for (arma::uword i = 0; i < values.n_elem; ++i) { + double val = values(i); + arma::uword index = 0; + while (index < bins.n_elem && val >= bins(index)) { + ++index; + } + indices(i) = index; + } + + return indices; +} + +arma::ivec stratsys(arma::vec probs, bool stratified) { + size_t N = probs.size(); + arma::vec u(N); + Rcpp::NumericVector rn; + if(stratified) { + rn = Rcpp::runif(N, 0, 1); + } else { + rn = Rcpp::NumericVector(N, R::runif(0, 1)); + } + + for(size_t i{}; i < N; i++) u(i) = (i + rn(i)) / N; + return digitize(arma::cumsum(probs), u); +} + +// [[Rcpp::export]] +arma::ivec multinomial_resampling(arma::vec probs) { + return Rcpp::sample( + probs.size(), probs.size(), true, + Rcpp::as(Rcpp::wrap(probs)), false); +} + +// [[Rcpp::export]] +arma::ivec residual_resampling(arma::vec probs) { + int N = probs.size(); + arma::vec counts = arma::floor(probs * N); + int R = N - arma::sum(counts); + arma::ivec result = count_to_index(counts); + + arma::vec residual_weights = probs - counts / N; + residual_weights = residual_weights / arma::sum(residual_weights); + arma::ivec part2 = Rcpp::sample( + N, R, true, + Rcpp::as(Rcpp::wrap(residual_weights)), false); + + return arma::join_cols(result, part2); +} + +// [[Rcpp::export]] +arma::ivec stratified_resampling(arma::vec probs) { + return stratsys(probs, true); +} + +// [[Rcpp::export]] +arma::ivec systematic_resampling(arma::vec probs) { + return stratsys(probs, false); +} + + +/*** R +library(tidyverse) + +n <- 4000 +probs <- rexp(n, rate = .0001) +probs <- probs / sum(probs) + +samplefun <- function(f) { + as.numeric(table(factor(f(probs), levels = 0:(n-1)))) +} + +dat <- tibble( + expected_value = n * probs, + multinomial_resampling = samplefun(multinomial_resampling), + residual_resampling = samplefun(residual_resampling), + stratified_resampling = samplefun(stratified_resampling), + systematic_resampling = samplefun(systematic_resampling) +) + +dat %>% + pivot_longer(cols = -expected_value) %>% + ggplot(aes(x = value, y = expected_value)) + + geom_point() + + geom_abline() + + facet_wrap(vars(name)) +*/ diff --git a/work-docs/segfault.cpp b/work-docs/segfault.cpp new file mode 100644 index 00000000..8c9c0fe7 --- /dev/null +++ b/work-docs/segfault.cpp @@ -0,0 +1,15 @@ +#include + +using namespace Rcpp; + +// [[Rcpp::export]] +double takeLog3(double val) { + if (val <= 0.0) { // log() not defined here + stop("Inadmissible value"); + } + return log(val); +} + +/*** R +takeLog3(-1) +*/ diff --git a/work-docs/timing.R b/work-docs/timing.R new file mode 100644 index 00000000..52448c67 --- /dev/null +++ b/work-docs/timing.R @@ -0,0 +1,30 @@ +library(BayesMallows) +set.seed(123) + +mod_bmm <- compute_mallows( + data = setup_rank_data(sushi_rankings), + compute_options = set_compute_options(nmc = 2000, burnin = 200) +) + +mod_init <- compute_mallows( + data = setup_rank_data(sushi_rankings[1:100, ]), + compute_options = set_compute_options(nmc = 10000, burnin = 1000) +) + +system.time({ + mod_smc <- update_mallows( + model = mod_init, + new_data = setup_rank_data(rankings = sushi_rankings[101:2000, ]), + smc_options = set_smc_options(n_particles = 2000, mcmc_steps = 5) + ) +}) +# 0.25 in parallel +# 0.31 sequentially +system.time({ + mod_smc_next <- update_mallows( + model = mod_smc, + new_data = setup_rank_data(sushi_rankings[2001:5000, ]) + ) +}) +# 0.77 in parallel +# 0.82 sequentially