diff --git a/DESCRIPTION b/DESCRIPTION index 4d5ecafd5..f8addc857 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -39,7 +39,8 @@ Authors@R: c( person("Michael","Quinn", role="ctb"), person("@javrucebo","", role="ctb"), person("@marc-outins","", role="ctb"), - person("Roy","Storey", role="ctb")) + person("Roy","Storey", role="ctb"), + person("Manish","Saraswat", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64, curl, R.utils, knitr, xts, nanotime, zoo, yaml diff --git a/NEWS.md b/NEWS.md index 1da3af114..d16965b32 100644 --- a/NEWS.md +++ b/NEWS.md @@ -122,6 +122,8 @@ 16. `rbind` and `rbindlist` of an item containing an ordered factor with levels containing an `NA` (as opposed to an NA integer) could segfault, [#3601](https://github.com/Rdatatable/data.table/issues/3601). This was a a regression in v1.12.2. Thanks to Damian Betebenner for reporting. +17. `setkey` now checks for existing index before doing `forder` [#3582](https://github.com/Rdatatable/data.table/pull/3582/). Executes `forder` only if the index does not exist. Thanks to @MichaelChirico for reporting and @saraswatmks for the PR. + #### NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. diff --git a/R/setkey.R b/R/setkey.R index 467024b97..eb0a4b89b 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -86,12 +86,19 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU if (!typeof(.xi) %chin% c("integer","logical","character","double")) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported as a key column type, currently.") } if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov - if (verbose) { - tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R - # suppress needed for tests 644 and 645 in verbose mode - cat("forder took", tt["user.self"]+tt["sys.self"], "sec\n") + + newkey = paste0(cols, collapse="__") + if (!any(indices(x) == newkey)) { + if (verbose) { + tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R + # suppress needed for tests 644 and 645 in verbose mode + cat("forder took", tt["user.self"]+tt["sys.self"], "sec\n") + } else { + o = forderv(x, cols, sort=TRUE, retGrp=FALSE) + } } else { - o = forderv(x, cols, sort=TRUE, retGrp=FALSE) + if (verbose) cat("setkey on columns ", brackify(cols), " using existing index '", newkey, "'\n", sep="") + o = getindex(x, newkey) } if (!physical) { if (is.null(attr(x,"index",exact=TRUE))) setattr(x, "index", integer()) @@ -160,7 +167,7 @@ setreordervec = function(x, order) .Call(Creorder, x, order) is.sorted = function(x, by=seq_along(x)) { if (is.list(x)) { - warning("Use 'if (length(o<-forderv(DT,by))) ...' for efficiency in one step, so you have o as well if not sorted.") + warning("Use 'if (length(o <- forderv(DT,by))) ...' for efficiency in one step, so you have o as well if not sorted.") # could pass through a flag for forderv to return early on first FALSE. But we don't need that internally # since internally we always then need ordering, an it's better in one step. Don't want inefficiency to creep in. # This is only here for user/debugging use to check/test valid keys; e.g. data.table:::is.sorted(DT,by) @@ -235,7 +242,7 @@ forder = function(x, ..., na.last=TRUE, decreasing=FALSE) } if (is.name(v)) { ix = chmatch(as.character(v), xcols, nomatch=0L) - if (ix != 0L) ans <- point(ans, i, x, ix) # see 'point' in data.table.R and C-version pointWrapper in assign.c - avoid copies + if (ix != 0L) ans = point(ans, i, x, ix) # see 'point' in data.table.R and C-version pointWrapper in assign.c - avoid copies else { v = as.call(list(as.name("list"), v)) ans = point(ans, i, eval(v, x, parent.frame()), 1L) @@ -258,7 +265,7 @@ forder = function(x, ..., na.last=TRUE, decreasing=FALSE) fsort = function(x, decreasing=FALSE, na.last=FALSE, internal=FALSE, verbose=FALSE, ...) { containsNAs = FALSE - if (typeof(x)=="double" && !decreasing && !(containsNAs<-anyNA(x))) { + if (typeof(x)=="double" && !decreasing && !(containsNAs <- anyNA(x))) { if (internal) stop("Internal code should not be being called on type double") return(.Call(Cfsort, x, verbose)) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fe4d38d0b..94870a592 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5934,7 +5934,7 @@ test(1419.18, key(thisDT), "x1") ## testing secondary index retainment on assign (#2372) allIndicesValid <- function(DT){ - ## checks that the order of all indices is correct + ## checks that the order of all indices are correct for(idx in seq_along(indices(DT))){ index <- attr(attr(DT, "index"), paste0("__", indices(DT)[idx], collapse = "")) if(!length(index)) index <- seq_len(nrow(DT)) @@ -6031,6 +6031,33 @@ thisDT <- copy(DT)[, c("aaa", "b") := 2] test(1419.58, indices(thisDT), c("a", "ab")) test(1419.59, allIndicesValid(thisDT), TRUE) +## setkey on same col as existing index, #2889 +DT <- data.table(a =c(4,1,3,9,2,1,8,7,6,5), + aaa = c(1,1,2,2,2,1,1,2,2,2)) +setindex(DT, a) +test(1419.60, allIndicesValid(DT), TRUE) +setindex(DT, NULL) +setkey(DT, a) +test(1419.61, DT$a, c(1,1,2,3,4,5,6,7,8,9)) +setkey(DT, NULL) +setindex(DT, a) +test(1419.62, setkey(DT, a, verbose=TRUE), data.table(a=c(1,1:9), aaa=c(1,1,2,2,1,2,2,2,1,2), key="a"), + output="setkey on columns [a] using existing index 'a'") # checks also that the prior index a is dropped (because y is keyed with no index) + +# setkey picks correct index of multiple indexes (e.g. exact=TRUE is used in internals) +DT = data.table(a = c(3,3,4,4,5,6,1,1,7,2), + aaa = c(1,1,2,2,2,1,1,2,2,2), + bbb = c(1,1,2,0,1,1,1,0,1,1)) +setkey(DT, a) +test(1419.63, DT$a, c(1,1,2,3,3,4,4,5,6,7)) +setkey(DT, NULL) +test(1419.64, setkey(DT, a, verbose=TRUE), output="forder took") +setkey(DT, NULL) +setindex(DT, aaa, a) +setindex(DT, aaa) # this aaa not previous aaa_a should be used by setkey(DT,aaa); i.e. ensure no partial matching +test(1419.65, allIndicesValid(DT), TRUE) +test(1419.66, setkey(DT, aaa, verbose=TRUE), data.table(a=c(1,3,3,6,1,2,4,4,5,7), aaa=c(1,1,1,1,2,2,2,2,2,2), bbb=c(1,1,1,1,0,1,2,0,1,1), key="aaa"), + output="setkey on columns [aaa] using existing index 'aaa'") # checks that all indexes are dropped (aaa_a too) # setnames updates secondary key DT = data.table(a=1:5,b=10:6)