From 572b64f6eb398b29d288736d7066b5bf59f783c2 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Wed, 22 May 2019 16:40:49 -0300 Subject: [PATCH 01/18] check for index in setkey --- R/setkey.R | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/R/setkey.R b/R/setkey.R index c9a366b02..bc670995d 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -86,12 +86,18 @@ setkeyv <- function(x, cols, verbose=getOption("datatable.verbose"), physical=TR if (!typeof(.xi) %chin% c("integer","logical","character","double")) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported as a key column type, currently.") } if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov - if (verbose) { - tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R - # suppress needed for tests 644 and 645 in verbose mode - cat("forder took", tt["user.self"]+tt["sys.self"], "sec\n") + + # forder only if index is not present + if(is.null(indices(x))){ + if (verbose) { + tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R + # suppress needed for tests 644 and 645 in verbose mode + cat("forder took", tt["user.self"]+tt["sys.self"], "sec\n") + } else { + o <- forderv(x, cols, sort=TRUE, retGrp=FALSE) + } } else { - o <- forderv(x, cols, sort=TRUE, retGrp=FALSE) + o <- attr(attributes(x)$index, names(attributes(attributes(x)$index))) } if (!physical) { if (is.null(attr(x,"index",exact=TRUE))) setattr(x, "index", integer()) From c8ddc62f432a60e387dcc9a229a358955c71ac9b Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Wed, 22 May 2019 17:15:20 -0300 Subject: [PATCH 02/18] explicitly naming the parameters --- R/setkey.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/setkey.R b/R/setkey.R index bc670995d..1f9a85a50 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -86,7 +86,6 @@ setkeyv <- function(x, cols, verbose=getOption("datatable.verbose"), physical=TR if (!typeof(.xi) %chin% c("integer","logical","character","double")) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported as a key column type, currently.") } if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov - # forder only if index is not present if(is.null(indices(x))){ if (verbose) { @@ -97,7 +96,7 @@ setkeyv <- function(x, cols, verbose=getOption("datatable.verbose"), physical=TR o <- forderv(x, cols, sort=TRUE, retGrp=FALSE) } } else { - o <- attr(attributes(x)$index, names(attributes(attributes(x)$index))) + o <- attr(attributes(x)$index, which=names(attributes(attributes(x)$index)), exact = TRUE) } if (!physical) { if (is.null(attr(x,"index",exact=TRUE))) setattr(x, "index", integer()) From bfefa7143737bbd7e5d767238f056f4c7243569c Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Wed, 22 May 2019 17:55:49 -0300 Subject: [PATCH 03/18] using cols to check existing index --- R/setkey.R | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/R/setkey.R b/R/setkey.R index 1f9a85a50..25929cb40 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -86,8 +86,16 @@ setkeyv <- function(x, cols, verbose=getOption("datatable.verbose"), physical=TR if (!typeof(.xi) %chin% c("integer","logical","character","double")) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported as a key column type, currently.") } if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov - # forder only if index is not present + + # get existing index name if any + found_index <- NULL if(is.null(indices(x))){ + found_index <- names(attributes(attributes(x)$index)) + found_index <- gsub("^__","", found_index) + } + + # forder only if index is not present + if(!identical(found_index, cols)){ if (verbose) { tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R # suppress needed for tests 644 and 645 in verbose mode @@ -96,7 +104,7 @@ setkeyv <- function(x, cols, verbose=getOption("datatable.verbose"), physical=TR o <- forderv(x, cols, sort=TRUE, retGrp=FALSE) } } else { - o <- attr(attributes(x)$index, which=names(attributes(attributes(x)$index)), exact = TRUE) + o <- attr(attributes(x)$index, which=found_index, exact = TRUE) } if (!physical) { if (is.null(attr(x,"index",exact=TRUE))) setattr(x, "index", integer()) From 0c64f515797efb719a3a057db204b49331ea11ee Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Thu, 23 May 2019 10:08:22 -0300 Subject: [PATCH 04/18] add test --- inst/tests/tests.Rraw | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d56ee7f1b..e38fc27f4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6029,6 +6029,13 @@ thisDT <- copy(DT)[, c("aaa", "b") := 2] test(1419.58, indices(thisDT), c("a", "ab")) test(1419.59, allIndicesValid(thisDT), TRUE) +## setkey on same col as index before +DT <- data.table(a = c(1,1,1,1,1,2,2,2,2,2), + aaa = c(1,1,2,2,2,1,1,2,2,2)) +setindex(DT, a) +test(1419.59, allIndicesValid(DT), TRUE) +setkey(DT, a) +test(1419.60, indices(DT), c("a")) # setnames updates secondary key DT = data.table(a=1:5,b=10:6) From d46358f0b48721d19a96f6c3fad2b413328b8e85 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Thu, 23 May 2019 11:05:45 -0300 Subject: [PATCH 05/18] adding test, fixing bug --- R/setkey.R | 3 ++- inst/tests/tests.Rraw | 5 ++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/setkey.R b/R/setkey.R index 25929cb40..4e15dcacf 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -89,7 +89,7 @@ setkeyv <- function(x, cols, verbose=getOption("datatable.verbose"), physical=TR # get existing index name if any found_index <- NULL - if(is.null(indices(x))){ + if(!is.null(indices(x))){ found_index <- names(attributes(attributes(x)$index)) found_index <- gsub("^__","", found_index) } @@ -104,6 +104,7 @@ setkeyv <- function(x, cols, verbose=getOption("datatable.verbose"), physical=TR o <- forderv(x, cols, sort=TRUE, retGrp=FALSE) } } else { + cat("using existing index for", found_index, "\n") o <- attr(attributes(x)$index, which=found_index, exact = TRUE) } if (!physical) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e38fc27f4..b0e334ec0 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6033,9 +6033,8 @@ test(1419.59, allIndicesValid(thisDT), TRUE) DT <- data.table(a = c(1,1,1,1,1,2,2,2,2,2), aaa = c(1,1,2,2,2,1,1,2,2,2)) setindex(DT, a) -test(1419.59, allIndicesValid(DT), TRUE) -setkey(DT, a) -test(1419.60, indices(DT), c("a")) +test(1419.60, allIndicesValid(DT), TRUE) +test(1419.61, setkey(DT, a), output="using existing index for a") # setnames updates secondary key DT = data.table(a=1:5,b=10:6) From 057938feb2aabe18b3aac0983273b6b54f83d09b Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Fri, 24 May 2019 17:10:12 -0300 Subject: [PATCH 06/18] check for multi-indexes --- R/setkey.R | 19 +++++++++++-------- inst/tests/tests.Rraw | 7 +++++++ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/R/setkey.R b/R/setkey.R index 4e15dcacf..0905bc818 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -88,14 +88,11 @@ setkeyv <- function(x, cols, verbose=getOption("datatable.verbose"), physical=TR if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov # get existing index name if any - found_index <- NULL - if(!is.null(indices(x))){ - found_index <- names(attributes(attributes(x)$index)) - found_index <- gsub("^__","", found_index) - } + if(!is.null(indices(x))) found_index = names(attributes(attributes(x)$index)) + new_possible_index = paste0("__", cols, collapse="") # forder only if index is not present - if(!identical(found_index, cols)){ + if(!any(new_possible_index == found_index)){ if (verbose) { tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R # suppress needed for tests 644 and 645 in verbose mode @@ -104,8 +101,14 @@ setkeyv <- function(x, cols, verbose=getOption("datatable.verbose"), physical=TR o <- forderv(x, cols, sort=TRUE, retGrp=FALSE) } } else { - cat("using existing index for", found_index, "\n") - o <- attr(attributes(x)$index, which=found_index, exact = TRUE) + # find the matching index + ix = found_index[which(found_index == new_possible_index)] + if (verbose){ + cat("using existing index for", gsub("^__","", ix), "\n") + o <- attr(attributes(x)$index, which=ix, exact = TRUE) + } else { + o <- attr(attributes(x)$index, which=ix, exact = TRUE) + } } if (!physical) { if (is.null(attr(x,"index",exact=TRUE))) setattr(x, "index", integer()) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b0e334ec0..5cd4808f0 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6035,6 +6035,13 @@ DT <- data.table(a = c(1,1,1,1,1,2,2,2,2,2), setindex(DT, a) test(1419.60, allIndicesValid(DT), TRUE) test(1419.61, setkey(DT, a), output="using existing index for a") +DT <- data.table(a = c(1,1,1,1,1,2,2,2,2,2), + aaa = c(1,1,2,2,2,1,1,2,2,2), + bbb = c(1,1,2,0,1,1,1,0,1,1)) +setindex(DT, a) +setindex(DT, aaa) +test(1419.62, allIndicesValid(DT), TRUE) +test(1419.63, setkey(DT, aaa), output="using existing index for aaa") # setnames updates secondary key DT = data.table(a=1:5,b=10:6) From 7503aeaf5a0af65d8e0d654aa26f957338ec182b Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Fri, 24 May 2019 21:18:59 -0300 Subject: [PATCH 07/18] fix bug --- R/setkey.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/setkey.R b/R/setkey.R index 0905bc818..0072856cf 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -88,7 +88,8 @@ setkeyv <- function(x, cols, verbose=getOption("datatable.verbose"), physical=TR if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov # get existing index name if any - if(!is.null(indices(x))) found_index = names(attributes(attributes(x)$index)) + found_index = NULL + if(!is.null(indices(x))) found_index <- names(attributes(attributes(x)$index)) new_possible_index = paste0("__", cols, collapse="") # forder only if index is not present From 3b0d3f5753f280eabc8ce03eaf2f9758300bb656 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Fri, 24 May 2019 21:40:40 -0300 Subject: [PATCH 08/18] use verbose param in test --- inst/tests/tests.Rraw | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5cd4808f0..532f41789 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6034,14 +6034,14 @@ DT <- data.table(a = c(1,1,1,1,1,2,2,2,2,2), aaa = c(1,1,2,2,2,1,1,2,2,2)) setindex(DT, a) test(1419.60, allIndicesValid(DT), TRUE) -test(1419.61, setkey(DT, a), output="using existing index for a") +test(1419.61, setkey(DT, a, verbose=TRUE), output="using existing index for a") DT <- data.table(a = c(1,1,1,1,1,2,2,2,2,2), aaa = c(1,1,2,2,2,1,1,2,2,2), bbb = c(1,1,2,0,1,1,1,0,1,1)) setindex(DT, a) setindex(DT, aaa) test(1419.62, allIndicesValid(DT), TRUE) -test(1419.63, setkey(DT, aaa), output="using existing index for aaa") +test(1419.63, setkey(DT, aaa, verbose=TRUE), output="using existing index for aaa") # setnames updates secondary key DT = data.table(a=1:5,b=10:6) From 153ae6c5cfd7b7a5ab4f59b66f659321bf5b9100 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Mon, 27 May 2019 14:17:59 -0300 Subject: [PATCH 09/18] add more test, description, news, fixes as suggested --- DESCRIPTION | 3 ++- NEWS.md | 2 ++ R/setkey.R | 12 +++++------- inst/tests/tests.Rraw | 13 +++++++++---- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4d5ecafd5..24ce231aa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -39,7 +39,8 @@ Authors@R: c( person("Michael","Quinn", role="ctb"), person("@javrucebo","", role="ctb"), person("@marc-outins","", role="ctb"), - person("Roy","Storey", role="ctb")) + person("Roy","Storey", role="ctb")), + person("Manish","Saraswat", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64, curl, R.utils, knitr, xts, nanotime, zoo, yaml diff --git a/NEWS.md b/NEWS.md index a18b2cef3..31630923d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -118,6 +118,8 @@ 15. `rbind` and `rbindlist` of zero-row items now retain (again) the unused levels of any (zero-length) factor columns, [#3508](https://github.com/Rdatatable/data.table/issues/3508). This was a regression in v1.12.2 just for zero-row items. Unused factor levels were already retained for items having `nrow>=1`. Thanks to Gregory Demin for reporting. +16. `setkey` now checks for existing index before doing `forder` [#3582](https://github.com/Rdatatable/data.table/pull/3582/). Executes `forder` only if the index does not exist. Thanks to @MichaelChirico for reporting and @saraswatmks for the PR. + #### NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. diff --git a/R/setkey.R b/R/setkey.R index e507cfb39..0b18f33e7 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -88,12 +88,10 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov # get existing index name if any - found_index = NULL - if(!is.null(indices(x))) found_index <- names(attributes(attributes(x)$index)) - new_possible_index = paste0("__", cols, collapse="") + index = paste0(cols, collapse="__") # forder only if index is not present - if(!any(new_possible_index == found_index)){ + if (!any(index == indices(x))){ if (verbose) { tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R # suppress needed for tests 644 and 645 in verbose mode @@ -102,10 +100,10 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU o <- forderv(x, cols, sort=TRUE, retGrp=FALSE) } } else { - # find the matching index - ix = found_index[which(found_index == new_possible_index)] + # find the name of matching index + ix = indices(x)[which(indices(x) == index)] if (verbose){ - cat("using existing index for", gsub("^__","", ix), "\n") + cat("using existing index for", ix, "\n") o <- attr(attributes(x)$index, which=ix, exact = TRUE) } else { o <- attr(attributes(x)$index, which=ix, exact = TRUE) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1432037c6..f90ff0d21 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6032,18 +6032,23 @@ test(1419.58, indices(thisDT), c("a", "ab")) test(1419.59, allIndicesValid(thisDT), TRUE) ## setkey on same col as index before -DT <- data.table(a = c(1,1,1,1,1,2,2,2,2,2), +DT <- data.table(a =c(4,1,3,9,2,1,8,7,6,5), aaa = c(1,1,2,2,2,1,1,2,2,2)) setindex(DT, a) test(1419.60, allIndicesValid(DT), TRUE) -test(1419.61, setkey(DT, a, verbose=TRUE), output="using existing index for a") +setkey(DT, a) +test(1419.61, DT$a, c(1,1,2,3,4,5,6,7,8,9)) +setkey(DT, NULL) +test(1419.62, setkey(DT, a, verbose=TRUE), output="using existing index for a") + +# check setkey incase of existing multiple indexes DT <- data.table(a = c(1,1,1,1,1,2,2,2,2,2), aaa = c(1,1,2,2,2,1,1,2,2,2), bbb = c(1,1,2,0,1,1,1,0,1,1)) setindex(DT, a) setindex(DT, aaa) -test(1419.62, allIndicesValid(DT), TRUE) -test(1419.63, setkey(DT, aaa, verbose=TRUE), output="using existing index for aaa") +test(1419.63, allIndicesValid(DT), TRUE) +test(1419.64, setkey(DT, aaa, verbose=TRUE), output="using existing index for aaa") # setnames updates secondary key DT = data.table(a=1:5,b=10:6) From 18097827f8c9e3aa8f94c3f7d6afead2e9308662 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Mon, 27 May 2019 14:58:51 -0300 Subject: [PATCH 10/18] update news --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 1da3af114..d16965b32 100644 --- a/NEWS.md +++ b/NEWS.md @@ -122,6 +122,8 @@ 16. `rbind` and `rbindlist` of an item containing an ordered factor with levels containing an `NA` (as opposed to an NA integer) could segfault, [#3601](https://github.com/Rdatatable/data.table/issues/3601). This was a a regression in v1.12.2. Thanks to Damian Betebenner for reporting. +17. `setkey` now checks for existing index before doing `forder` [#3582](https://github.com/Rdatatable/data.table/pull/3582/). Executes `forder` only if the index does not exist. Thanks to @MichaelChirico for reporting and @saraswatmks for the PR. + #### NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. From a6aa862339afb5e8df93ebf1084579460dfa6b2f Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Mon, 27 May 2019 16:16:04 -0300 Subject: [PATCH 11/18] fix issue --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 24ce231aa..f8addc857 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -39,7 +39,7 @@ Authors@R: c( person("Michael","Quinn", role="ctb"), person("@javrucebo","", role="ctb"), person("@marc-outins","", role="ctb"), - person("Roy","Storey", role="ctb")), + person("Roy","Storey", role="ctb"), person("Manish","Saraswat", role="ctb")) Depends: R (>= 3.1.0) Imports: methods From 928812130627034ce9a3ae52ce33ef0deac26d5a Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Mon, 27 May 2019 16:52:24 -0300 Subject: [PATCH 12/18] set index in test1419.62 --- inst/tests/tests.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 68bb71cf7..115cbe482 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6039,6 +6039,7 @@ test(1419.60, allIndicesValid(DT), TRUE) setkey(DT, a) test(1419.61, DT$a, c(1,1,2,3,4,5,6,7,8,9)) setkey(DT, NULL) +setindex(DT, a) test(1419.62, setkey(DT, a, verbose=TRUE), output="using existing index for a") # check setkey incase of existing multiple indexes From b50025324dff08ba20c3bd6e81b1e50db05f7f3f Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Mon, 27 May 2019 17:28:22 -0300 Subject: [PATCH 13/18] fix test error --- inst/tests/tests.Rraw | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 115cbe482..ff556a62d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6036,6 +6036,7 @@ DT <- data.table(a =c(4,1,3,9,2,1,8,7,6,5), aaa = c(1,1,2,2,2,1,1,2,2,2)) setindex(DT, a) test(1419.60, allIndicesValid(DT), TRUE) +setindex(DT, NULL) setkey(DT, a) test(1419.61, DT$a, c(1,1,2,3,4,5,6,7,8,9)) setkey(DT, NULL) @@ -6043,9 +6044,14 @@ setindex(DT, a) test(1419.62, setkey(DT, a, verbose=TRUE), output="using existing index for a") # check setkey incase of existing multiple indexes -DT <- data.table(a = c(1,1,1,1,1,2,2,2,2,2), +DT <- data.table(a = c(3,3,4,4,5,6,1,1,7,2,2), aaa = c(1,1,2,2,2,1,1,2,2,2), bbb = c(1,1,2,0,1,1,1,0,1,1)) +setkey(DT, a) +test(1419.63, DT$a, c(1,1,2,2,3,3,4,4,5,6,7)) +setkey(DT, NULL) +test(1419.64, setkey(DT, a, verbose=TRUE), output="forder took") +setkey(DT, NULL) setindex(DT, a) setindex(DT, aaa) test(1419.63, allIndicesValid(DT), TRUE) From 230e5d1b486b2af2eaddf0e3cc34c8bfeff68449 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Mon, 27 May 2019 18:00:55 -0300 Subject: [PATCH 14/18] fix test sequence --- inst/tests/tests.Rraw | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ff556a62d..51130becf 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6054,8 +6054,8 @@ test(1419.64, setkey(DT, a, verbose=TRUE), output="forder took") setkey(DT, NULL) setindex(DT, a) setindex(DT, aaa) -test(1419.63, allIndicesValid(DT), TRUE) -test(1419.64, setkey(DT, aaa, verbose=TRUE), output="using existing index for aaa") +test(1419.65, allIndicesValid(DT), TRUE) +test(1419.66, setkey(DT, aaa, verbose=TRUE), output="using existing index for aaa") # setnames updates secondary key DT = data.table(a=1:5,b=10:6) From 97d64a5f3c3195b8171e47ac78293ca4a79c7654 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Mon, 27 May 2019 18:13:44 -0300 Subject: [PATCH 15/18] fix testcase 1376.09, 1376.11, 1376.12 --- inst/tests/tests.Rraw | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 51130becf..6b109cc2b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5668,13 +5668,13 @@ test(1376.06, indices(DT), c("b","a")) # 2 secondary keys of single columns test(1376.07, DT[a==7L,verbose=TRUE], DT[7L], output="Optimized subsetting with index 'a'") setkey(DT,b) test(1376.08, indices(DT), NULL) -test(1376.09, list(DT[a==2L], indices(DT)), list(DT[9L],"a")) # create indices for next test +test(1376.09, list(DT[a==2L], indices(DT)), list(DT[2L],"a")) # create indices for next test setindex(DT,NULL) test(1376.10, list(key(DT), indices(DT)), list("b", NULL)) options(datatable.auto.index = FALSE) -test(1376.11, list(DT[a==2L], indices(DT)), list(DT[9L],NULL)) +test(1376.11, list(DT[a==2L], indices(DT)), list(DT[2L],NULL)) options(datatable.auto.index = TRUE) -test(1376.12, list(DT[a==2L], indices(DT)), list(DT[9L],"a")) +test(1376.12, list(DT[a==2L], indices(DT)), list(DT[2L],"a")) # When i is FALSE and a column is being added by reference, for consistency with cases when i is not FALSE # we should still add the column. But we need to know what type it should be, so the user supplied RHS of := From 70b5760f2f8d959f0a2c55a4c6969a566e3f191d Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Tue, 28 May 2019 15:40:36 -0300 Subject: [PATCH 16/18] remove which index --- R/setkey.R | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/R/setkey.R b/R/setkey.R index 0b18f33e7..dda9a4972 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -88,10 +88,10 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov # get existing index name if any - index = paste0(cols, collapse="__") + newkey = paste0(cols, collapse="__") # forder only if index is not present - if (!any(index == indices(x))){ + if (!any(indices(x) == newkey)){ if (verbose) { tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R # suppress needed for tests 644 and 645 in verbose mode @@ -101,12 +101,11 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU } } else { # find the name of matching index - ix = indices(x)[which(indices(x) == index)] if (verbose){ - cat("using existing index for", ix, "\n") - o <- attr(attributes(x)$index, which=ix, exact = TRUE) + cat("using existing index for", newkey, "\n") + o <- attr(attributes(x)$index, which=newkey, exact = TRUE) } else { - o <- attr(attributes(x)$index, which=ix, exact = TRUE) + o <- attr(attributes(x)$index, which=newkey, exact = TRUE) } } if (!physical) { From fcb0c5b699e4a1409bacd5ec90d344b92157b70e Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 28 May 2019 13:05:33 -0700 Subject: [PATCH 17/18] added checks on y too to new setkey tests --- inst/tests/tests.Rraw | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6b109cc2b..107890523 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5934,7 +5934,7 @@ test(1419.18, key(thisDT), "x1") ## testing secondary index retainment on assign (#2372) allIndicesValid <- function(DT){ - ## checks that the order of all indices is correct + ## checks that the order of all indices are correct for(idx in seq_along(indices(DT))){ index <- attr(attr(DT, "index"), paste0("__", indices(DT)[idx], collapse = "")) if(!length(index)) index <- seq_len(nrow(DT)) @@ -6031,7 +6031,7 @@ thisDT <- copy(DT)[, c("aaa", "b") := 2] test(1419.58, indices(thisDT), c("a", "ab")) test(1419.59, allIndicesValid(thisDT), TRUE) -## setkey on same col as index before +## setkey on same col as existing index, #2889 DT <- data.table(a =c(4,1,3,9,2,1,8,7,6,5), aaa = c(1,1,2,2,2,1,1,2,2,2)) setindex(DT, a) @@ -6041,21 +6041,23 @@ setkey(DT, a) test(1419.61, DT$a, c(1,1,2,3,4,5,6,7,8,9)) setkey(DT, NULL) setindex(DT, a) -test(1419.62, setkey(DT, a, verbose=TRUE), output="using existing index for a") +test(1419.62, setkey(DT, a, verbose=TRUE), data.table(a=c(1,1:9), aaa=c(1,1,2,2,1,2,2,2,1,2), key="a"), + output="using existing index for a") # checks also that the prior index a is dropped (because y is keyed with no index) -# check setkey incase of existing multiple indexes -DT <- data.table(a = c(3,3,4,4,5,6,1,1,7,2,2), - aaa = c(1,1,2,2,2,1,1,2,2,2), - bbb = c(1,1,2,0,1,1,1,0,1,1)) +# setkey picks correct index of multiple indexes (e.g. exact=TRUE is used in internals) +DT = data.table(a = c(3,3,4,4,5,6,1,1,7,2), + aaa = c(1,1,2,2,2,1,1,2,2,2), + bbb = c(1,1,2,0,1,1,1,0,1,1)) setkey(DT, a) -test(1419.63, DT$a, c(1,1,2,2,3,3,4,4,5,6,7)) +test(1419.63, DT$a, c(1,1,2,3,3,4,4,5,6,7)) setkey(DT, NULL) test(1419.64, setkey(DT, a, verbose=TRUE), output="forder took") setkey(DT, NULL) -setindex(DT, a) -setindex(DT, aaa) +setindex(DT, aaa, a) +setindex(DT, aaa) # this aaa not previous aaa_a should be used by setkey(DT,aaa); i.e. ensure no partial matching test(1419.65, allIndicesValid(DT), TRUE) -test(1419.66, setkey(DT, aaa, verbose=TRUE), output="using existing index for aaa") +test(1419.66, setkey(DT, aaa, verbose=TRUE), data.table(a=c(1,3,3,6,1,2,4,4,5,7), aaa=c(1,1,1,1,2,2,2,2,2,2), bbb=c(1,1,1,1,0,1,2,0,1,1), key="aaa"), + output="using existing index for aaa") # checks that all indexes are dropped (aaa_a too) # setnames updates secondary key DT = data.table(a=1:5,b=10:6) From bf59b635460d6dd59b65c65213c891858e532661 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 28 May 2019 14:16:18 -0700 Subject: [PATCH 18/18] uses getindex(), 2-space indentation, 2L back to 9L in test 1376 --- R/setkey.R | 36 ++++++++++++++---------------------- inst/tests/tests.Rraw | 10 +++++----- 2 files changed, 19 insertions(+), 27 deletions(-) diff --git a/R/setkey.R b/R/setkey.R index dda9a4972..eb0a4b89b 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -86,27 +86,19 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU if (!typeof(.xi) %chin% c("integer","logical","character","double")) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported as a key column type, currently.") } if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov - - # get existing index name if any + newkey = paste0(cols, collapse="__") - - # forder only if index is not present - if (!any(indices(x) == newkey)){ - if (verbose) { - tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R - # suppress needed for tests 644 and 645 in verbose mode - cat("forder took", tt["user.self"]+tt["sys.self"], "sec\n") - } else { - o <- forderv(x, cols, sort=TRUE, retGrp=FALSE) - } + if (!any(indices(x) == newkey)) { + if (verbose) { + tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R + # suppress needed for tests 644 and 645 in verbose mode + cat("forder took", tt["user.self"]+tt["sys.self"], "sec\n") + } else { + o = forderv(x, cols, sort=TRUE, retGrp=FALSE) + } } else { - # find the name of matching index - if (verbose){ - cat("using existing index for", newkey, "\n") - o <- attr(attributes(x)$index, which=newkey, exact = TRUE) - } else { - o <- attr(attributes(x)$index, which=newkey, exact = TRUE) - } + if (verbose) cat("setkey on columns ", brackify(cols), " using existing index '", newkey, "'\n", sep="") + o = getindex(x, newkey) } if (!physical) { if (is.null(attr(x,"index",exact=TRUE))) setattr(x, "index", integer()) @@ -175,7 +167,7 @@ setreordervec = function(x, order) .Call(Creorder, x, order) is.sorted = function(x, by=seq_along(x)) { if (is.list(x)) { - warning("Use 'if (length(o<-forderv(DT,by))) ...' for efficiency in one step, so you have o as well if not sorted.") + warning("Use 'if (length(o <- forderv(DT,by))) ...' for efficiency in one step, so you have o as well if not sorted.") # could pass through a flag for forderv to return early on first FALSE. But we don't need that internally # since internally we always then need ordering, an it's better in one step. Don't want inefficiency to creep in. # This is only here for user/debugging use to check/test valid keys; e.g. data.table:::is.sorted(DT,by) @@ -250,7 +242,7 @@ forder = function(x, ..., na.last=TRUE, decreasing=FALSE) } if (is.name(v)) { ix = chmatch(as.character(v), xcols, nomatch=0L) - if (ix != 0L) ans <- point(ans, i, x, ix) # see 'point' in data.table.R and C-version pointWrapper in assign.c - avoid copies + if (ix != 0L) ans = point(ans, i, x, ix) # see 'point' in data.table.R and C-version pointWrapper in assign.c - avoid copies else { v = as.call(list(as.name("list"), v)) ans = point(ans, i, eval(v, x, parent.frame()), 1L) @@ -273,7 +265,7 @@ forder = function(x, ..., na.last=TRUE, decreasing=FALSE) fsort = function(x, decreasing=FALSE, na.last=FALSE, internal=FALSE, verbose=FALSE, ...) { containsNAs = FALSE - if (typeof(x)=="double" && !decreasing && !(containsNAs<-anyNA(x))) { + if (typeof(x)=="double" && !decreasing && !(containsNAs <- anyNA(x))) { if (internal) stop("Internal code should not be being called on type double") return(.Call(Cfsort, x, verbose)) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 107890523..94870a592 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5668,13 +5668,13 @@ test(1376.06, indices(DT), c("b","a")) # 2 secondary keys of single columns test(1376.07, DT[a==7L,verbose=TRUE], DT[7L], output="Optimized subsetting with index 'a'") setkey(DT,b) test(1376.08, indices(DT), NULL) -test(1376.09, list(DT[a==2L], indices(DT)), list(DT[2L],"a")) # create indices for next test +test(1376.09, list(DT[a==2L], indices(DT)), list(DT[9L],"a")) # create indices for next test setindex(DT,NULL) test(1376.10, list(key(DT), indices(DT)), list("b", NULL)) options(datatable.auto.index = FALSE) -test(1376.11, list(DT[a==2L], indices(DT)), list(DT[2L],NULL)) +test(1376.11, list(DT[a==2L], indices(DT)), list(DT[9L],NULL)) options(datatable.auto.index = TRUE) -test(1376.12, list(DT[a==2L], indices(DT)), list(DT[2L],"a")) +test(1376.12, list(DT[a==2L], indices(DT)), list(DT[9L],"a")) # When i is FALSE and a column is being added by reference, for consistency with cases when i is not FALSE # we should still add the column. But we need to know what type it should be, so the user supplied RHS of := @@ -6042,7 +6042,7 @@ test(1419.61, DT$a, c(1,1,2,3,4,5,6,7,8,9)) setkey(DT, NULL) setindex(DT, a) test(1419.62, setkey(DT, a, verbose=TRUE), data.table(a=c(1,1:9), aaa=c(1,1,2,2,1,2,2,2,1,2), key="a"), - output="using existing index for a") # checks also that the prior index a is dropped (because y is keyed with no index) + output="setkey on columns [a] using existing index 'a'") # checks also that the prior index a is dropped (because y is keyed with no index) # setkey picks correct index of multiple indexes (e.g. exact=TRUE is used in internals) DT = data.table(a = c(3,3,4,4,5,6,1,1,7,2), @@ -6057,7 +6057,7 @@ setindex(DT, aaa, a) setindex(DT, aaa) # this aaa not previous aaa_a should be used by setkey(DT,aaa); i.e. ensure no partial matching test(1419.65, allIndicesValid(DT), TRUE) test(1419.66, setkey(DT, aaa, verbose=TRUE), data.table(a=c(1,3,3,6,1,2,4,4,5,7), aaa=c(1,1,1,1,2,2,2,2,2,2), bbb=c(1,1,1,1,0,1,2,0,1,1), key="aaa"), - output="using existing index for aaa") # checks that all indexes are dropped (aaa_a too) + output="setkey on columns [aaa] using existing index 'aaa'") # checks that all indexes are dropped (aaa_a too) # setnames updates secondary key DT = data.table(a=1:5,b=10:6)