# Patch summary (free text ahead of the first file header).
#
# 1. .ci/test_r_package.sh
#    When R_MAJOR_VERSION is "3", install {evaluate} 0.23 together with
#    {lattice} 0.20-41 from their CRAN Archive tarball URLs in a single
#    install.packages() call (repos = NULL).
# 2. R-package/R/lgb.Dataset.R
#    The "too large value in categorical_feature" check now runs only when
#    private$raw_data is non-NULL and private$used_indices is NULL, in addition
#    to the existing "raw_data is not a filename" condition.
# 3. R-package/tests/testthat/test_dataset.R
#    Adds a test that a Dataset constructed with free_raw_data = TRUE and integer
#    categorical_feature indices can be passed to lgb.cv(), and a test that
#    constructing a slice still raises the "too large value" error.
#
# NOTE(review): this patch was recovered from a whitespace-collapsed copy. The
# leading indentation of context lines is reconstructed from the code's nesting;
# verify against the target tree, or apply with `git apply --ignore-whitespace`.
diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh
index 66a3ecaa663d..0ed581b9bf75 100755
--- a/.ci/test_r_package.sh
+++ b/.ci/test_r_package.sh
@@ -106,10 +106,10 @@ if [[ $OS_NAME == "macos" ]]; then
         -target / || exit 1
 fi
 
-# fix for issue where CRAN was not returning {lattice} when using R 3.6
+# fix for issue where CRAN was not returning {lattice} and {evaluate} when using R 3.6
 # "Warning: dependency ‘lattice’ is not available"
 if [[ "${R_MAJOR_VERSION}" == "3" ]]; then
-    Rscript --vanilla -e "install.packages('https://cran.r-project.org/src/contrib/Archive/lattice/lattice_0.20-41.tar.gz', repos = NULL, lib = '${R_LIB_PATH}')"
+    Rscript --vanilla -e "install.packages(c('https://cran.r-project.org/src/contrib/Archive/lattice/lattice_0.20-41.tar.gz', 'https://cran.r-project.org/src/contrib/Archive/evaluate/evaluate_0.23.tar.gz'), repos = NULL, lib = '${R_LIB_PATH}')"
 else
     # {Matrix} needs {lattice}, so this needs to run before manually installing {Matrix}.
     # This should be unnecessary on R >=4.4.0
diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R
index 98b9505120de..8a4a7246b16c 100644
--- a/R-package/R/lgb.Dataset.R
+++ b/R-package/R/lgb.Dataset.R
@@ -170,7 +170,12 @@ Dataset <- R6::R6Class(
 
           # Check if more categorical features were output over the feature space
           data_is_not_filename <- !is.character(private$raw_data)
-          if (data_is_not_filename && max(private$categorical_feature) > ncol(private$raw_data)) {
+          if (
+            data_is_not_filename
+            && !is.null(private$raw_data)
+            && is.null(private$used_indices)
+            && max(private$categorical_feature) > ncol(private$raw_data)
+          ) {
             stop(
               "lgb.Dataset.construct: supplied a too large value in categorical_feature: "
               , max(private$categorical_feature)
diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R
index 65255a730017..cbd2e7e076f7 100644
--- a/R-package/tests/testthat/test_dataset.R
+++ b/R-package/tests/testthat/test_dataset.R
@@ -440,6 +440,35 @@ test_that("lgb.Dataset: should be able to run lgb.cv() immediately after using l
   expect_true(methods::is(bst, "lgb.CVBooster"))
 })
 
+test_that("lgb.Dataset: should be able to be used in lgb.cv() when constructed with categorical feature indices", {
+  data("mtcars")
+  y <- mtcars$mpg
+  x <- as.matrix(mtcars[, -1L])
+  categorical_feature <- which(names(mtcars) %in% c("cyl", "vs", "am", "gear", "carb")) - 1L
+  dtrain <- lgb.Dataset(
+    data = x
+    , label = y
+    , categorical_feature = categorical_feature
+    , free_raw_data = TRUE
+    , params = list(num_threads = .LGB_MAX_THREADS)
+  )
+  # constructing the Dataset frees the raw data
+  dtrain$construct()
+  params <- list(
+    objective = "regression"
+    , num_leaves = 2L
+    , verbose = .LGB_VERBOSITY
+    , num_threads = .LGB_MAX_THREADS
+  )
+  # cv should reuse the same categorical features without checking the indices
+  bst <- lgb.cv(params = params, data = dtrain, stratified = FALSE, nrounds = 1L)
+  expect_equal(
+    unlist(bst$boosters[[1L]]$booster$params$categorical_feature)
+    , categorical_feature - 1L  # 0-based
+  )
+})
+
+
 test_that("lgb.Dataset: should be able to use and retrieve long feature names", {
   # set one feature to a value longer than the default buffer size used
   # in LGBM_DatasetGetFeatureNames_R
@@ -621,3 +650,12 @@ test_that("lgb.Dataset can be constructed with categorical features and without
     lgb.Dataset(raw_mat, categorical_feature = 2L)$construct()
   }, regexp = "supplied a too large value in categorical_feature: 2 but only 1 features")
 })
+
+test_that("lgb.Dataset.slice fails with a categorical feature index greater than the number of features", {
+  data <- matrix(runif(100L), nrow = 50L, ncol = 2L)
+  ds <- lgb.Dataset(data = data, categorical_feature = 3L)
+  subset <- ds$slice(1L:20L)
+  expect_error({
+    subset$construct()
+  }, regexp = "supplied a too large value in categorical_feature: 3 but only 2 features")
+})