From 12f306f357265e348876b3da1eb04bb49a5a878c Mon Sep 17 00:00:00 2001 From: Henrik Lindberg Date: Thu, 7 Jul 2022 13:39:03 +0200 Subject: [PATCH] Improve performance of generating distinct intersections When generating distinct intersections on data with hundreds of thousands of elements, the computation grinds to a halt. The time seems to be roughly O(n^2), meaning that with double the data, execution takes 2^2 = 4 times as long. With the help of profvis, we find the main source to be a Filter in pushCombination(), which causes a doubly nested loop over the elements. Minimal benchmark on a fairly beefy computer (5950X, 128 GB RAM) on Fedora Linux, R 4.1.3 and upsetjs 1.11.0, git hash 4b375a8e0 ``` generate_data <- function(n) { tibble::tibble( col_0 = sample(c(0, 1), n, replace = TRUE), col_1 = sample(c(0, 1), n, replace = TRUE), col_2 = sample(c(0, 1), n, replace = TRUE), col_3 = sample(c(0, 1), n, replace = TRUE), col_4 = sample(c(0, 1), n, replace = TRUE), col_5 = sample(c(0, 1), n, replace = TRUE), col_6 = sample(c(0, 1), n, replace = TRUE), col_7 = sample(c(0, 1), n, replace = TRUE), col_8 = sample(c(0, 1), n, replace = TRUE), col_9 = sample(c(0, 1), n, replace = TRUE) ) } ``` Before this PR: ``` > start <- Sys.time() > upsetjs() |> + upsetjs:::fromDataFrame(generate_data(10000)) |> + upsetjs:::generateDistinctIntersections(limit = 5) > Sys.time() - start Time difference of 24.85004 secs ``` With this PR: ``` > start <- Sys.time() > upsetjs() |> + upsetjs:::fromDataFrame(generate_data(10000)) |> + upsetjs:::generateDistinctIntersections(limit = 5) > Sys.time() - start Time difference of 0.7690187 secs ``` Also, scaling is now closer to O(n) or slightly better. 
With 10x the data: ``` > start <- Sys.time() > upsetjs() |> + upsetjs:::fromDataFrame(generate_data(100000)) |> + upsetjs:::generateDistinctIntersections(limit = 5) > Sys.time() - start Time difference of 5.745839 secs ``` --- R/data-helpers.R | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/R/data-helpers.R b/R/data-helpers.R index 5216ba1..c5a260c 100644 --- a/R/data-helpers.R +++ b/R/data-helpers.R @@ -118,14 +118,11 @@ generateCombinationsImpl <- function(sets, otherSets <- Filter(function(ss) { !(ss$name %in% s$setNames) }, sets) - dElems <- Filter(function(e) { - for (o in otherSets) { - if (e %in% o$elems) { - return(FALSE) - } - } - TRUE - }, s$elems) + + dElems <- s$elems + for (o in otherSets) { + dElems <- setdiff(dElems, o$elems) + } if (s$cardinality == length(dElems)) { combinations <<- c(combinations, list(s))