Fixing lots of issue (#415)

* incrementing dev version * updating news * Allow different initial values across particles also when using pairwise preference data (#406) * simplifications * used Cpp for all_topological_sorts. Much much faster, since it is recursive code * converted preferences to matrix for SMC * starting to set up preferences * done * styling * removed shuffle_unranked argument * restructured arguments to setup_rank_data for pairwise preferences * fixing some errors * removing unnecessary statement * fixed bug in augmented rankings for existing users * made a better progress reporter * updated news and description * fixed #407 (#408) * Fixing bug in Ulam distance (#411) * fixed bug and added test * simplifying * styling * Exporting exact partition function (#412) * fixing documentation typo * fixing #409 * Consistency checks with pairwise preferences (#414) * fixed issue with updated users with pairwise preferences * fixed #404
ocbe-uio · Mar 26, 2024 · ad4b935 · ad4b935
1 parent 6f6b85d
commit ad4b935
Show file tree

Hide file tree

Showing 70 changed files with 1,081 additions and 414 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: BayesMallows
 Type: Package
 Title: Bayesian Preference Learning with the Mallows Rank Model
-Version: 2.1.1
+Version: 2.1.1.9007
 Authors@R: c(person("Oystein", "Sorensen",
                     email = "oystein.sorensen.1985@gmail.com",
                     role = c("aut", "cre"),
@@ -51,7 +51,6 @@ Depends: R (>= 3.5.0)
 Imports: Rcpp (>= 1.0.0),
   ggplot2 (>= 3.1.0),
   Rdpack (>= 1.0),
-  igraph (>= 1.2.5),
   sets (>= 1.0-18),
   relations (>= 0.6-8),
   rlang (>= 0.3.1)

diff --git a/NAMESPACE b/NAMESPACE
@@ -28,6 +28,7 @@ export(assess_convergence)
 export(assign_cluster)
 export(burnin)
 export(compute_consensus)
+export(compute_exact_partition_function)
 export(compute_expected_distance)
 export(compute_mallows)
 export(compute_mallows_mixtures)
@@ -52,6 +53,7 @@ export(set_compute_options)
 export(set_initial_values)
 export(set_model_options)
 export(set_priors)
+export(set_progress_report)
 export(set_smc_options)
 export(setup_rank_data)
 export(update_mallows)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,37 @@
+# BayesMallows (development versions)
+
+* The SMC function now check for consistency with previous latent ranks for 
+  existing users also when data arrive in the form of pairwise preferences.
+* A function compute_exact_partition_function() is now added, which returns the
+  logarithm of the exact partition function for Cayley, Hamming, and Kendall
+  distance.
+* Fixed a bug in the Ulam distance. Thanks for Marta Crispino for discovering 
+  it.
+* Fixed a bug in SMC algorithm for pairwise preference data, where the proposal
+  distribution incorrectly was assumed to be uniform.
+* It is now possible to report progress of MCMC more flexibly using 
+  compute_mallows() or compute_mallows_mixtures(). The old argument "verbose"
+  which by default reported every 1000'th iteration has been replaced by an
+  argument "progress_report" which can be set by calling set_progress_report().
+  The latter allows setting the interval between reports. This is particularly
+  useful for big data, where running 1000 iterations may take very long time.
+* Fixed a bug which caused inconsistent partial rank data to be retained from
+  previous timepoints when existing users update their preferences.
+* Arguments random and random_limit to setup_rank_data() have been removed. A 
+  new argument max_topological_sorts has been added instead, which captures all 
+  previous use cases, but also allows the user to specify the number of 
+  topological sorts to generate. This makes it useful also with a relatively 
+  large number of items, while it previously would be computationally unfeasible
+  for anything more than 8-9 items.
+* Argument shuffle_unranked to setup_rank_data() has been removed. If there are 
+  unranked items they will now always be shuffled. For reproducibility, set the
+  random number seed.
+* SMC Mallows with pairwise preference data now allows different initially
+  values for the augmented rankings across the particles. This is obtained by
+  generating (a subset of) all topological sorts consistent with the transitive
+  closure for the user, and sampling from these. Can be set with the 
+  max_topological_sorts argument to set_smc_options().
+
 # BayesMallows 2.1.1
 
 * Fixed gcc-UBSAN issue happening when compute_mallows_sequentially() is run

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -5,6 +5,10 @@ abind <- function(x, y) {
     .Call(`_BayesMallows_abind`, x, y)
 }
 
+all_topological_sorts <- function(prefs, n_items, maxit = 1000L) {
+    .Call(`_BayesMallows_all_topological_sorts`, prefs, n_items, maxit)
+}
+
 #' Asymptotic Approximation of Partition Function
 #'
 #' Compute the asymptotic approximation of the logarithm of the partition function,
@@ -68,8 +72,8 @@ rmallows <- function(rho0, alpha0, n_samples, burnin, thinning, leap_size = 1L,
     .Call(`_BayesMallows_rmallows`, rho0, alpha0, n_samples, burnin, thinning, leap_size, metric)
 }
 
-run_mcmc <- function(data, model_options, compute_options, priors, initial_values, pfun_values, pfun_estimate, verbose = FALSE) {
-    .Call(`_BayesMallows_run_mcmc`, data, model_options, compute_options, priors, initial_values, pfun_values, pfun_estimate, verbose)
+run_mcmc <- function(data, model_options, compute_options, priors, initial_values, pfun_values, pfun_estimate, progress_report) {
+    .Call(`_BayesMallows_run_mcmc`, data, model_options, compute_options, priors, initial_values, pfun_values, pfun_estimate, progress_report)
 }
 
 run_smc <- function(data, new_data, model_options, smc_options, compute_options, priors, initial_values, pfun_values, pfun_estimate) {

diff --git a/R/all_topological_sorts.R b/R/all_topological_sorts.R
diff --git a/R/compute_exact_partition_function.R b/R/compute_exact_partition_function.R
@@ -0,0 +1,26 @@
+#' @title Compute exact partition function
+#'
+#' @description For Cayley, Hamming, and Kendall distances, computationally
+#' tractable functions are available for the exact partition function.
+#'
+#' @param alpha Dispersion parameter.
+#' @param n_items Number of items.
+#' @param metric Distance function, one of "cayley", "hamming", or "kendall".
+#'
+#' @return The logarithm of the partition function.
+#' @export
+#'
+#' @references \insertAllCited{}
+#'
+#' @example inst/examples/compute_exact_partition_function_example.R
+#' @family partition function
+compute_exact_partition_function <- function(
+    alpha, n_items,
+    metric = c("cayley", "hamming", "kendall")) {
+  metric <- match.arg(metric, c("cayley", "hamming", "kendall"))
+  validate_integer(n_items)
+  validate_positive(n_items)
+  validate_positive(alpha)
+
+  get_partition_function(alpha, n_items, metric, NULL)
+}
diff --git a/R/compute_mallows.R b/R/compute_mallows.R
@@ -14,8 +14,8 @@
 #'   is the latent consensus ranking, \eqn{Z_{n}(\alpha)} is the partition
 #'   function (normalizing constant), and \eqn{d(r,\rho)} is a distance function
 #'   measuring the distance between \eqn{r} and \eqn{\rho}. We refer to
-#'   \insertCite{vitelli2018;textual}{BayesMallows} for further details of the Bayesian
-#'   Mallows model.
+#'   \insertCite{vitelli2018;textual}{BayesMallows} for further details of the
+#'   Bayesian Mallows model.
 #'
 #'   `compute_mallows` always returns posterior distributions of the latent
 #'   consensus ranking \eqn{\rho} and the scale parameter \eqn{\alpha}. Several
@@ -44,9 +44,8 @@
 #'   Ulam distances when the cardinalities are not available, cf.
 #'   [get_cardinalities()].
 #'
-#' @param verbose Logical specifying whether to print out the progress of the
-#'   Metropolis-Hastings algorithm. If `TRUE`, a notification is printed
-#'   every 1000th iteration. Defaults to `FALSE`.
+#' @param progress_report An object of class "BayesMallowsProgressReported"
+#'   returned from [set_progress_report()].
 #'
 #' @param cl Optional cluster returned from [parallel::makeCluster()]. If
 #'   provided, chains will be run in parallel, one on each node of `cl`.
@@ -70,7 +69,7 @@ compute_mallows <- function(
     priors = set_priors(),
     initial_values = set_initial_values(),
     pfun_estimate = NULL,
-    verbose = FALSE,
+    progress_report = set_progress_report(),
     cl = NULL) {
   validate_class(data, "BayesMallowsData")
   validate_class(model_options, "BayesMallowsModelOptions")
@@ -92,7 +91,7 @@ compute_mallows <- function(
   } else {
     lapplyfun <- prepare_cluster(cl, c(
       "data", "model_options", "compute_options", "priors", "initial_values",
-      "pfun_values", "pfun_estimate", "verbose"
+      "pfun_values", "pfun_estimate", "progress_report"
     ))
     chain_seq <- seq_along(cl)
   }
@@ -109,7 +108,7 @@ compute_mallows <- function(
       initial_values = initial_values,
       pfun_values = pfun_values,
       pfun_estimate = pfun_estimate,
-      verbose = verbose
+      progress_report = progress_report
     )
   })
 

diff --git a/R/compute_mallows_mixtures.R b/R/compute_mallows_mixtures.R
@@ -31,7 +31,7 @@ compute_mallows_mixtures <- function(
     priors = set_priors(),
     initial_values = set_initial_values(),
     pfun_estimate = NULL,
-    verbose = FALSE,
+    progress_report = set_progress_report(),
     cl = NULL) {
   stopifnot(is.null(cl) || inherits(cl, "cluster"))
 
@@ -40,15 +40,16 @@ compute_mallows_mixtures <- function(
   } else {
     lapplyfun <- prepare_cluster(cl, c(
       "data", "model_options", "compute_options", "priors", "initial_values",
-      "pfun_estimate", "verbose"
+      "pfun_estimate", "progress_report"
     ))
   }
 
   models <- lapplyfun(n_clusters, function(x) {
     model_options$n_clusters <- x
     compute_mallows(
-      data = data, model_options = model_options, compute_options = compute_options,
-      priors = priors, initial_values = initial_values, verbose = verbose
+      data = data, model_options = model_options,
+      compute_options = compute_options, priors = priors,
+      initial_values = initial_values, progress_report = progress_report
     )
   })
 

diff --git a/R/compute_mallows_sequentially.R b/R/compute_mallows_sequentially.R
@@ -61,6 +61,14 @@ compute_mallows_sequentially <- function(
   )) {
     stop("User IDs must be set.")
   }
+  data <- lapply(data, function(x) {
+    if (!is.null(x$preferences)) {
+      x$preferences <- as.matrix(x$preferences)
+    } else {
+      x$preferences <- matrix(0, 0, 0)
+    }
+    x
+  })
   pfun_values <- extract_pfun_values(model_options$metric, data[[1]]$n_items, pfun_estimate)
   alpha_init <- sample(initial_values$alpha, smc_options$n_particles, replace = TRUE)
   rho_init <- initial_values$rho[, sample(ncol(initial_values$rho), smc_options$n_particles, replace = TRUE)]

diff --git a/R/estimate_partition_function.R b/R/estimate_partition_function.R
@@ -10,7 +10,7 @@
 #' any number of items, for footrule distances with up to 50 items, Spearman
 #' distance with up to 20 items, and Ulam distance with up to 60 items. This
 #' function is thus intended for the complement of these cases. See
-#' [get_cardinalities()] for details.
+#' [get_cardinalities()] and [compute_exact_partition_function()] for details.
 #'
 #' @param method Character string specifying the method to use in order to
 #'   estimate the logarithm of the partition function. Available options are

diff --git a/R/generate_initial_ranking.R b/R/generate_initial_ranking.R
@@ -1,91 +1,56 @@
 splitpref <- function(preferences) {
-  split(
+  lapply(split(
     preferences[, c("bottom_item", "top_item"), drop = FALSE],
     preferences$assessor
-  )
+  ), as.matrix)
 }
 
 generate_initial_ranking <- function(
-    preferences, n_items, cl = NULL, shuffle_unranked = FALSE,
-    random = FALSE, random_limit = 8L) {
+    preferences, n_items, cl = NULL, max_topological_sorts) {
   UseMethod("generate_initial_ranking")
 }
 
 #' @export
 generate_initial_ranking.BayesMallowsTransitiveClosure <- function(
-    preferences, n_items, cl = NULL, shuffle_unranked = FALSE, random = FALSE,
-    random_limit = 8L) {
+    preferences, n_items, cl = NULL, max_topological_sorts) {
   stopifnot(is.null(cl) || inherits(cl, "cluster"))
-  if (n_items > random_limit && random) {
-    stop(paste(
-      "Number of items exceeds the limit for generation of random permutations,\n",
-      "modify the random_limit argument to override this.\n"
-    ))
-  }
 
   prefs <- splitpref(preferences)
 
   if (is.null(cl)) {
     do.call(rbind, lapply(
-      prefs, function(x, y, sr, r) create_ranks(as.matrix(x), y, sr, r),
-      n_items, shuffle_unranked, random
+      prefs, function(x, y, r) create_ranks(x, y, r),
+      n_items, max_topological_sorts
     ))
   } else {
     do.call(rbind, parallel::parLapply(
       cl = cl, X = prefs,
-      fun = function(x, y, sr, r) create_ranks(as.matrix(x), y, sr, r),
-      n_items, shuffle_unranked, random
+      fun = function(x, y, r) create_ranks(x, y, r),
+      n_items, max_topological_sorts
     ))
   }
 }
 
 #' @export
 generate_initial_ranking.BayesMallowsIntransitive <- function(
-    preferences, n_items, cl = NULL, shuffle_unranked = FALSE,
-    random = FALSE, random_limit = 8L) {
+    preferences, n_items, cl = NULL, max_topological_sorts) {
   n_assessors <- length(unique(preferences$assessor))
   rankings <- replicate(n_assessors, sample(x = n_items, size = n_items),
     simplify = "numeric"
   )
   rankings <- matrix(rankings, ncol = n_items, nrow = n_assessors, byrow = TRUE)
 }
 
-create_ranks <- function(mat, n_items, shuffle_unranked, random) {
-  if (!random) {
-    g <- igraph::graph_from_edgelist(as.matrix(mat))
-    g <- as.integer(igraph::topo_sort(g))
-
-    all_items <- seq(from = 1, to = n_items, by = 1)
-
-    if (!shuffle_unranked) {
-      # Add unranked elements outside of the range at the end
-      g_final <- c(g, setdiff(all_items, g))
-    } else {
-      ranked_items <- unique(c(mat))
-      unranked_items <- setdiff(all_items, ranked_items)
-      # Indices of ranked elements in final vector
-      idx_ranked <- sort(sample(length(all_items), length(ranked_items)))
-      g_final <- rep(NA, n_items)
-      g_final[idx_ranked] <- g[g %in% ranked_items]
-      g_final[is.na(g_final)] <- unranked_items[sample(length(unranked_items))]
-    }
-
-    # Convert from ordering to ranking
-    return(create_ranking(rev(g_final)))
-  } else {
-    graph <- list()
-    for (i in seq_len(n_items)) {
-      graph[[i]] <- unique(mat[mat[, "top_item"] == i, "bottom_item"])
-    }
-    indegree_init <- rep(0, n_items)
-    indegree <- table(unlist(graph))
-    indegree_init[as.integer(names(indegree))] <- indegree
-    attr(graph, "indegree") <- indegree_init
-
-    e1 <- new.env()
-    assign("x", list(), envir = e1)
-    assign("num", 0L, envir = e1)
-    all_topological_sorts(graph, n_items, e1)
-    return(get("x", envir = e1)[[sample(get("num", envir = e1), 1)]])
-  }
+create_ranks <- function(mat, n_items, max_topological_sorts) {
+  ret <- all_topological_sorts(mat, n_items, max_topological_sorts)
+  u <- sample(min(max_topological_sorts, nrow(ret)), 1)
+  ret <- ret[u, ]
+  all_items <- seq(from = 1, to = n_items, by = 1)
+  ranked_items <- unique(c(mat))
+  unranked_items <- setdiff(all_items, ranked_items)
+  idx_ranked <- sort(sample(length(all_items), length(ranked_items)))
+  g_final <- rep(NA, n_items)
+  g_final[idx_ranked] <- ret[ret %in% ranked_items]
+  g_final[is.na(g_final)] <- unranked_items[sample(length(unranked_items))]
+  create_ranking(g_final)
 }
diff --git a/R/generate_transitive_closure.R b/R/generate_transitive_closure.R
@@ -19,7 +19,7 @@ generate_transitive_closure <- function(preferences, cl = NULL) {
   prefs <- lapplyfun(seq_along(prefs), function(i) {
     cbind(
       assessor = as.numeric(names(prefs)[[i]]),
-      .generate_transitive_closure(as.matrix(prefs[[i]]))
+      .generate_transitive_closure(prefs[[i]])
     )
   })
 

diff --git a/R/get_cardinalities.R b/R/get_cardinalities.R
@@ -3,7 +3,7 @@
 #' @description The partition function for the Mallows model can be defined in a
 #'   computationally efficient manner as
 #'   \deqn{Z_{n}(\alpha) = \sum_{d_{n} \in
-#'   \mathcal{D}_{n}} N_{m,n} e^{-(\alpha/n) d_{m}}}.
+#'   \mathcal{D}_{n}} N_{m,n} e^{-(\alpha/n) d_{m}}.}
 #'   In this equation, \eqn{\mathcal{D}_{n}} a set containing all possible
 #'   distances at the given number of items, and \eqn{d_{m}} is on element of
 #'   this set. Finally, \eqn{N_{m,n}} is the number of possible configurations