From 2aab1031c730a4a571fbf2cc8f91685c6664f4a9 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 4 May 2021 23:09:23 -0500 Subject: [PATCH 01/14] [R-package] move creation of character vectors in some methods to C++ side --- R-package/R/lgb.Dataset.R | 23 +--------------------- R-package/src/lightgbm_R.cpp | 38 +++++++++++++++++++++--------------- R-package/src/lightgbm_R.h | 6 +----- 3 files changed, 24 insertions(+), 43 deletions(-) diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index 70c9f09f4465..980dde6f9fca 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -369,31 +369,10 @@ Dataset <- R6::R6Class( # Check for handle if (!lgb.is.null.handle(x = private$handle)) { - - # Get feature names and write them - buf_len <- as.integer(1024L * 1024L) - act_len <- 0L - buf <- raw(buf_len) - .Call( + private$colnames <- .Call( LGBM_DatasetGetFeatureNames_R , private$handle - , buf_len - , act_len - , buf ) - if (act_len > buf_len) { - buf_len <- act_len - buf <- raw(buf_len) - .Call( - LGBM_DatasetGetFeatureNames_R - , private$handle - , buf_len - , act_len - , buf - ) - } - cnames <- lgb.encode.char(arr = buf, len = act_len) - private$colnames <- as.character(base::strsplit(cnames, "\t")[[1L]]) return(private$colnames) } else if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) { diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index db9307875128..82ac8b52aadf 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -153,17 +153,15 @@ SEXP LGBM_DatasetSetFeatureNames_R(LGBM_SE handle, R_API_END(); } -SEXP LGBM_DatasetGetFeatureNames_R(LGBM_SE handle, - SEXP buf_len, - SEXP actual_len, - LGBM_SE feature_names) { +SEXP LGBM_DatasetGetFeatureNames_R(LGBM_SE handle) { + SEXP feature_names; R_API_BEGIN(); - int len = 0; - CHECK_CALL(LGBM_DatasetGetNumFeature(R_GET_PTR(handle), &len)); + int num_features = 0; + CHECK_CALL(LGBM_DatasetGetNumFeature(R_GET_PTR(handle), &num_features)); const size_t reserved_string_size = 256; - std::vector> names(len); - std::vector ptr_names(len); - for (int i = 0; i < len; ++i) { + std::vector> names(num_features); + std::vector ptr_names(num_features); + for (int i = 0; i < num_features; ++i) { names[i].resize(reserved_string_size); ptr_names[i] = names[i].data(); } @@ -172,13 +170,21 @@ SEXP LGBM_DatasetGetFeatureNames_R(LGBM_SE handle, CHECK_CALL( LGBM_DatasetGetFeatureNames( R_GET_PTR(handle), - len, &out_len, - reserved_string_size, &required_string_size, - ptr_names.data())); - CHECK_EQ(len, out_len); + num_features, + &out_len, + reserved_string_size, + &required_string_size, + ptr_names.data() + ) + ); + CHECK_EQ(num_features, out_len); CHECK_GE(reserved_string_size, required_string_size); - auto merge_str = Join(ptr_names, "\t"); - EncodeChar(feature_names, merge_str.c_str(), buf_len, actual_len, merge_str.size() + 1); + feature_names = PROTECT(Rf_allocVector(STRSXP, num_features)); + for (int i = 0; i < num_features; ++i) { + SET_STRING_ELT(feature_names, i, Rf_mkChar(ptr_names[i])); + } + UNPROTECT(1); + return feature_names; R_API_END(); } @@ -652,7 +658,7 @@ static const R_CallMethodDef CallEntries[] = { {"LGBM_DatasetCreateFromMat_R" , (DL_FUNC) &LGBM_DatasetCreateFromMat_R , 6}, {"LGBM_DatasetGetSubset_R" , (DL_FUNC) &LGBM_DatasetGetSubset_R , 5}, {"LGBM_DatasetSetFeatureNames_R" , (DL_FUNC) &LGBM_DatasetSetFeatureNames_R , 2}, - {"LGBM_DatasetGetFeatureNames_R" , (DL_FUNC) &LGBM_DatasetGetFeatureNames_R , 4}, + {"LGBM_DatasetGetFeatureNames_R" , (DL_FUNC) &LGBM_DatasetGetFeatureNames_R , 1}, {"LGBM_DatasetSaveBinary_R" , (DL_FUNC) &LGBM_DatasetSaveBinary_R , 2}, {"LGBM_DatasetFree_R" , (DL_FUNC) &LGBM_DatasetFree_R , 1}, {"LGBM_DatasetSetField_R" , (DL_FUNC) &LGBM_DatasetSetField_R , 4}, diff --git a/R-package/src/lightgbm_R.h b/R-package/src/lightgbm_R.h index 3b46f6e17b27..9cf875f4383f 100644 --- a/R-package/src/lightgbm_R.h +++ b/R-package/src/lightgbm_R.h @@ -114,14 +114,10 @@ LIGHTGBM_C_EXPORT SEXP LGBM_DatasetSetFeatureNames_R( /*! * \brief save feature names to Dataset * \param handle handle -* \param feature_names feature names * \return 0 when succeed, -1 when failure happens */ LIGHTGBM_C_EXPORT SEXP LGBM_DatasetGetFeatureNames_R( - LGBM_SE handle, - SEXP buf_len, - SEXP actual_len, - LGBM_SE feature_names + LGBM_SE handle ); /*! From 3d6e9895eb3be4c948bfcfe72268f1000c92b82b Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 4 May 2021 23:33:13 -0500 Subject: [PATCH 02/14] convert LGBM_BoosterGetEvalNames_R --- R-package/R/lgb.Booster.R | 29 ++++----------------------- R-package/src/lightgbm_R.cpp | 38 +++++++++++++++++++++--------------- R-package/src/lightgbm_R.h | 7 ++----- 3 files changed, 28 insertions(+), 46 deletions(-) diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index a3a6ae90d373..2ae1bbf52dc8 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -666,41 +666,20 @@ Booster <- R6::R6Class( # Check for evaluation names emptiness if (is.null(private$eval_names)) { - - # Get evaluation names - buf_len <- as.integer(1024L * 1024L) - act_len <- 0L - buf <- raw(buf_len) - .Call( + eval_names <- .Call( LGBM_BoosterGetEvalNames_R , private$handle - , buf_len - , act_len - , buf ) - if (act_len > buf_len) { - buf_len <- act_len - buf <- raw(buf_len) - .Call( - LGBM_BoosterGetEvalNames_R - , private$handle - , buf_len - , act_len - , buf - ) - } - names <- lgb.encode.char(arr = buf, len = act_len) # Check names' length - if (nchar(names) > 0L) { + if (length(eval_names) > 0L) { # Parse and store privately names - names <- strsplit(names, "\t")[[1L]] - private$eval_names <- names + private$eval_names <- eval_names # some metrics don't map cleanly to metric names, for example "ndcg@1" is just the # ndcg metric evaluated at the first "query result" in learning-to-rank - metric_names <- gsub("@.*", "", names) + metric_names <- gsub("@.*", "", eval_names) private$higher_better_inner_eval <- .METRICS_HIGHER_BETTER()[metric_names] } diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 82ac8b52aadf..077b9ba8fd0e 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -438,18 +438,16 @@ SEXP LGBM_BoosterGetLowerBoundValue_R(LGBM_SE handle, R_API_END(); } -SEXP LGBM_BoosterGetEvalNames_R(LGBM_SE handle, - SEXP buf_len, - SEXP actual_len, - LGBM_SE eval_names) { +SEXP LGBM_BoosterGetEvalNames_R(LGBM_SE handle) { + SEXP eval_names; R_API_BEGIN(); - int len; - CHECK_CALL(LGBM_BoosterGetEvalCounts(R_GET_PTR(handle), &len)); + int num_eval_sets; + CHECK_CALL(LGBM_BoosterGetEvalCounts(R_GET_PTR(handle), &num_eval_sets)); const size_t reserved_string_size = 128; - std::vector> names(len); - std::vector ptr_names(len); - for (int i = 0; i < len; ++i) { + std::vector> names(num_eval_sets); + std::vector ptr_names(num_eval_sets); + for (int i = 0; i < num_eval_sets; ++i) { names[i].resize(reserved_string_size); ptr_names[i] = names[i].data(); } @@ -459,13 +457,21 @@ SEXP LGBM_BoosterGetEvalNames_R(LGBM_SE handle, CHECK_CALL( LGBM_BoosterGetEvalNames( R_GET_PTR(handle), - len, &out_len, - reserved_string_size, &required_string_size, - ptr_names.data())); - CHECK_EQ(out_len, len); + num_eval_sets, + &out_len, + reserved_string_size, + &required_string_size, + ptr_names.data() + ) + ); + CHECK_EQ(out_len, num_eval_sets); CHECK_GE(reserved_string_size, required_string_size); - auto merge_names = Join(ptr_names, "\t"); - EncodeChar(eval_names, merge_names.c_str(), buf_len, actual_len, merge_names.size() + 1); + eval_names = PROTECT(Rf_allocVector(STRSXP, num_eval_sets)); + for (int i = 0; i < num_eval_sets; ++i) { + SET_STRING_ELT(eval_names, i, Rf_mkChar(ptr_names[i])); + } + UNPROTECT(1); + return eval_names; R_API_END(); } @@ -682,7 +688,7 @@ static const R_CallMethodDef CallEntries[] = { {"LGBM_BoosterGetCurrentIteration_R", (DL_FUNC) &LGBM_BoosterGetCurrentIteration_R, 2}, {"LGBM_BoosterGetUpperBoundValue_R" , (DL_FUNC) &LGBM_BoosterGetUpperBoundValue_R , 2}, {"LGBM_BoosterGetLowerBoundValue_R" , (DL_FUNC) &LGBM_BoosterGetLowerBoundValue_R , 2}, - {"LGBM_BoosterGetEvalNames_R" , (DL_FUNC) &LGBM_BoosterGetEvalNames_R , 4}, + {"LGBM_BoosterGetEvalNames_R" , (DL_FUNC) &LGBM_BoosterGetEvalNames_R , 1}, {"LGBM_BoosterGetEval_R" , (DL_FUNC) &LGBM_BoosterGetEval_R , 3}, {"LGBM_BoosterGetNumPredict_R" , (DL_FUNC) &LGBM_BoosterGetNumPredict_R , 3}, {"LGBM_BoosterGetPredict_R" , (DL_FUNC) &LGBM_BoosterGetPredict_R , 3}, diff --git a/R-package/src/lightgbm_R.h b/R-package/src/lightgbm_R.h index 9cf875f4383f..89f514999d19 100644 --- a/R-package/src/lightgbm_R.h +++ b/R-package/src/lightgbm_R.h @@ -385,14 +385,11 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterGetLowerBoundValue_R( /*! * \brief Get Name of eval -* \param eval_names eval names +* \param handle Handle of booster * \return 0 when succeed, -1 when failure happens */ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterGetEvalNames_R( - LGBM_SE handle, - SEXP buf_len, - SEXP actual_len, - LGBM_SE eval_names + LGBM_SE handle ); /*! From 63856c2a63eaa803fd4c0fe6b028e301c2b0aa45 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 5 May 2021 00:37:04 -0500 Subject: [PATCH 03/14] convert LGBM_BoosterDumpModel_R and LGBM_BoosterSaveModelToString_R --- R-package/R/lgb.Booster.R | 55 ++------------------- R-package/R/utils.R | 7 --- R-package/src/R_object_helper.h | 2 - R-package/src/lightgbm_R.cpp | 70 +++++++++++++++------------ R-package/src/lightgbm_R.h | 11 +---- R-package/tests/testthat/test_utils.R | 9 ---- 6 files changed, 46 insertions(+), 108 deletions(-) diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index 2ae1bbf52dc8..acee33f176f0 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -466,40 +466,15 @@ Booster <- R6::R6Class( num_iteration <- self$best_iter } - # Create buffer - buf_len <- as.integer(1024L * 1024L) - act_len <- 0L - buf <- raw(buf_len) - # Call buffer - .Call( + model_str <- .Call( LGBM_BoosterSaveModelToString_R , private$handle , as.integer(num_iteration) , as.integer(feature_importance_type) - , buf_len - , act_len - , buf ) - # Check for buffer content - if (act_len > buf_len) { - buf_len <- act_len - buf <- raw(buf_len) - .Call( - LGBM_BoosterSaveModelToString_R - , private$handle - , as.integer(num_iteration) - , as.integer(feature_importance_type) - , buf_len - , act_len - , buf - ) - } - - return( - lgb.encode.char(arr = buf, len = act_len) - ) + return(model_str) }, @@ -511,36 +486,14 @@ Booster <- R6::R6Class( num_iteration <- self$best_iter } - buf_len <- as.integer(1024L * 1024L) - act_len <- 0L - buf <- raw(buf_len) - .Call( + model_str <- .Call( LGBM_BoosterDumpModel_R , private$handle , as.integer(num_iteration) , as.integer(feature_importance_type) - , buf_len - , act_len - , buf ) - if (act_len > buf_len) { - buf_len <- act_len - buf <- raw(buf_len) - .Call( - LGBM_BoosterDumpModel_R - , private$handle - , as.integer(num_iteration) - , as.integer(feature_importance_type) - , buf_len - , act_len - , buf - ) - } - - return( - lgb.encode.char(arr = buf, len = act_len) - ) + return(model_str) }, diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 013e2d4a9f2f..a0f47efd2ef6 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -18,13 +18,6 @@ lgb.is.null.handle <- function(x) { return(is.null(x) || is.na(x)) } -lgb.encode.char <- function(arr, len) { - if (!is.raw(arr)) { - stop("lgb.encode.char: Can only encode from raw type") - } - return(rawToChar(arr[seq_len(len)])) -} - # [description] Get the most recent error stored on the C++ side and raise it # as an R error. lgb.last_error <- function() { diff --git a/R-package/src/R_object_helper.h b/R-package/src/R_object_helper.h index b961dc76c06e..986d7756d828 100644 --- a/R-package/src/R_object_helper.h +++ b/R-package/src/R_object_helper.h @@ -96,8 +96,6 @@ typedef union { VECTOR_SER s; double align; } SEXPREC_ALIGN; #define DATAPTR(x) ((reinterpret_cast(x)) + 1) -#define R_CHAR_PTR(x) (reinterpret_castDATAPTR(x)) - #define R_IS_NULL(x) ((*reinterpret_cast(x)).sxpinfo.type == 0) // 64bit pointer diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 077b9ba8fd0e..45ac9b3c5fdf 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -39,23 +39,9 @@ return R_NilValue; \ } -using LightGBM::Common::Join; using LightGBM::Common::Split; using LightGBM::Log; -LGBM_SE EncodeChar(LGBM_SE dest, const char* src, SEXP buf_len, SEXP actual_len, size_t str_len) { - if (str_len > INT32_MAX) { - Log::Fatal("Don't support large string in R-package"); - } - INTEGER(actual_len)[0] = static_cast(str_len); - if (Rf_asInteger(buf_len) < static_cast(str_len)) { - return dest; - } - auto ptr = R_CHAR_PTR(dest); - std::memcpy(ptr, src, str_len); - return dest; -} - SEXP LGBM_GetLastError_R() { SEXP out; out = PROTECT(Rf_allocVector(STRSXP, 1)); @@ -628,31 +614,55 @@ SEXP LGBM_BoosterSaveModel_R(LGBM_SE handle, SEXP LGBM_BoosterSaveModelToString_R(LGBM_SE handle, SEXP num_iteration, - SEXP feature_importance_type, - SEXP buffer_len, - SEXP actual_len, - LGBM_SE out_str) { + SEXP feature_importance_type) { + SEXP model_str; R_API_BEGIN(); int64_t out_len = 0; - int64_t buf_len = static_cast(Rf_asInteger(buffer_len)); + int64_t buf_len = 1024 * 1024; std::vector inner_char_buf(buf_len); - CHECK_CALL(LGBM_BoosterSaveModelToString(R_GET_PTR(handle), 0, Rf_asInteger(num_iteration), Rf_asInteger(feature_importance_type), buf_len, &out_len, inner_char_buf.data())); - EncodeChar(out_str, inner_char_buf.data(), buffer_len, actual_len, static_cast(out_len)); + CHECK_CALL(LGBM_BoosterSaveModelToString( + R_GET_PTR(handle), + 0, + Rf_asInteger(num_iteration), + Rf_asInteger(feature_importance_type), + buf_len, + &out_len, + inner_char_buf.data() + )); + + model_str = PROTECT(Rf_allocVector(STRSXP, 1)); + SET_STRING_ELT(model_str, 0, Rf_mkChar(inner_char_buf.data())); + UNPROTECT(1); + return model_str; + + // EncodeChar(out_str, inner_char_buf.data(), buffer_len, actual_len, static_cast(out_len)); R_API_END(); } SEXP LGBM_BoosterDumpModel_R(LGBM_SE handle, SEXP num_iteration, - SEXP feature_importance_type, - SEXP buffer_len, - SEXP actual_len, - LGBM_SE out_str) { + SEXP feature_importance_type) { + SEXP model_str; R_API_BEGIN(); int64_t out_len = 0; - int64_t buf_len = static_cast(Rf_asInteger(buffer_len)); + int64_t buf_len = 1024 * 1024; std::vector inner_char_buf(buf_len); - CHECK_CALL(LGBM_BoosterDumpModel(R_GET_PTR(handle), 0, Rf_asInteger(num_iteration), Rf_asInteger(feature_importance_type), buf_len, &out_len, inner_char_buf.data())); - EncodeChar(out_str, inner_char_buf.data(), buffer_len, actual_len, static_cast(out_len)); + CHECK_CALL(LGBM_BoosterDumpModel( + R_GET_PTR(handle), + 0, + Rf_asInteger(num_iteration), + Rf_asInteger(feature_importance_type), + buf_len, + &out_len, + inner_char_buf.data() + )); + + model_str = PROTECT(Rf_allocVector(STRSXP, 1)); + SET_STRING_ELT(model_str, 0, Rf_mkChar(inner_char_buf.data())); + UNPROTECT(1); + return model_str; + + // EncodeChar(out_str, inner_char_buf.data(), buffer_len, actual_len, static_cast(out_len)); R_API_END(); } @@ -697,8 +707,8 @@ static const R_CallMethodDef CallEntries[] = { {"LGBM_BoosterPredictForCSC_R" , (DL_FUNC) &LGBM_BoosterPredictForCSC_R , 14}, {"LGBM_BoosterPredictForMat_R" , (DL_FUNC) &LGBM_BoosterPredictForMat_R , 11}, {"LGBM_BoosterSaveModel_R" , (DL_FUNC) &LGBM_BoosterSaveModel_R , 4}, - {"LGBM_BoosterSaveModelToString_R" , (DL_FUNC) &LGBM_BoosterSaveModelToString_R , 6}, - {"LGBM_BoosterDumpModel_R" , (DL_FUNC) &LGBM_BoosterDumpModel_R , 6}, + {"LGBM_BoosterSaveModelToString_R" , (DL_FUNC) &LGBM_BoosterSaveModelToString_R , 3}, + {"LGBM_BoosterDumpModel_R" , (DL_FUNC) &LGBM_BoosterDumpModel_R , 3}, {NULL, NULL, 0} }; diff --git a/R-package/src/lightgbm_R.h b/R-package/src/lightgbm_R.h index 89f514999d19..b8c8e2e53b3c 100644 --- a/R-package/src/lightgbm_R.h +++ b/R-package/src/lightgbm_R.h @@ -559,16 +559,12 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterSaveModel_R( * \brief create string containing model * \param handle handle * \param num_iteration, <= 0 means save all -* \param out_str string of model * \return 0 when succeed, -1 when failure happens */ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterSaveModelToString_R( LGBM_SE handle, SEXP num_iteration, - SEXP feature_importance_type, - SEXP buffer_len, - SEXP actual_len, - LGBM_SE out_str + SEXP feature_importance_type ); /*! @@ -581,10 +577,7 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterSaveModelToString_R( LIGHTGBM_C_EXPORT SEXP LGBM_BoosterDumpModel_R( LGBM_SE handle, SEXP num_iteration, - SEXP feature_importance_type, - SEXP buffer_len, - SEXP actual_len, - LGBM_SE out_str + SEXP feature_importance_type ); #endif // LIGHTGBM_R_H_ diff --git a/R-package/tests/testthat/test_utils.R b/R-package/tests/testthat/test_utils.R index 228de36366a5..491ded72d994 100644 --- a/R-package/tests/testthat/test_utils.R +++ b/R-package/tests/testthat/test_utils.R @@ -1,12 +1,3 @@ -context("lgb.encode.char") - -test_that("lgb.encode.char throws an informative error if it is passed a non-raw input", { - x <- "some-string" - expect_error({ - lgb.encode.char(x) - }, regexp = "Can only encode from raw type") -}) - context("lgb.check.r6.class") test_that("lgb.check.r6.class() should return FALSE for NULL input", { From 826ae2216fe20cfa24407ab01400a7107fc360f4 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 5 May 2021 00:44:02 -0500 Subject: [PATCH 04/14] remove debugging code --- R-package/src/lightgbm_R.cpp | 78 +++++++++++------------------------- 1 file changed, 24 insertions(+), 54 deletions(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 45ac9b3c5fdf..d47edf280401 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -142,12 +142,12 @@ SEXP LGBM_DatasetSetFeatureNames_R(LGBM_SE handle, SEXP LGBM_DatasetGetFeatureNames_R(LGBM_SE handle) { SEXP feature_names; R_API_BEGIN(); - int num_features = 0; - CHECK_CALL(LGBM_DatasetGetNumFeature(R_GET_PTR(handle), &num_features)); + int len = 0; + CHECK_CALL(LGBM_DatasetGetNumFeature(R_GET_PTR(handle), &len)); const size_t reserved_string_size = 256; - std::vector> names(num_features); - std::vector ptr_names(num_features); - for (int i = 0; i < num_features; ++i) { + std::vector> names(len); + std::vector ptr_names(len); + for (int i = 0; i < len; ++i) { names[i].resize(reserved_string_size); ptr_names[i] = names[i].data(); } @@ -156,17 +156,13 @@ SEXP LGBM_DatasetGetFeatureNames_R(LGBM_SE handle) { CHECK_CALL( LGBM_DatasetGetFeatureNames( R_GET_PTR(handle), - num_features, - &out_len, - reserved_string_size, - &required_string_size, - ptr_names.data() - ) - ); - CHECK_EQ(num_features, out_len); + len, &out_len, + reserved_string_size, &required_string_size, + ptr_names.data())); + CHECK_EQ(len, out_len); CHECK_GE(reserved_string_size, required_string_size); - feature_names = PROTECT(Rf_allocVector(STRSXP, num_features)); - for (int i = 0; i < num_features; ++i) { + feature_names = PROTECT(Rf_allocVector(STRSXP, len)); + for (int i = 0; i < len; ++i) { SET_STRING_ELT(feature_names, i, Rf_mkChar(ptr_names[i])); } UNPROTECT(1); @@ -427,13 +423,13 @@ SEXP LGBM_BoosterGetLowerBoundValue_R(LGBM_SE handle, SEXP LGBM_BoosterGetEvalNames_R(LGBM_SE handle) { SEXP eval_names; R_API_BEGIN(); - int num_eval_sets; - CHECK_CALL(LGBM_BoosterGetEvalCounts(R_GET_PTR(handle), &num_eval_sets)); + int len; + CHECK_CALL(LGBM_BoosterGetEvalCounts(R_GET_PTR(handle), &len)); const size_t reserved_string_size = 128; - std::vector> names(num_eval_sets); - std::vector ptr_names(num_eval_sets); - for (int i = 0; i < num_eval_sets; ++i) { + std::vector> names(len); + std::vector ptr_names(len); + for (int i = 0; i < len; ++i) { names[i].resize(reserved_string_size); ptr_names[i] = names[i].data(); } @@ -443,17 +439,13 @@ SEXP LGBM_BoosterGetEvalNames_R(LGBM_SE handle) { CHECK_CALL( LGBM_BoosterGetEvalNames( R_GET_PTR(handle), - num_eval_sets, - &out_len, - reserved_string_size, - &required_string_size, - ptr_names.data() - ) - ); - CHECK_EQ(out_len, num_eval_sets); + len, &out_len, + reserved_string_size, &required_string_size, + ptr_names.data())); + CHECK_EQ(out_len, len); CHECK_GE(reserved_string_size, required_string_size); - eval_names = PROTECT(Rf_allocVector(STRSXP, num_eval_sets)); - for (int i = 0; i < num_eval_sets; ++i) { + eval_names = PROTECT(Rf_allocVector(STRSXP, len)); + for (int i = 0; i < len; ++i) { SET_STRING_ELT(eval_names, i, Rf_mkChar(ptr_names[i])); } UNPROTECT(1); @@ -620,22 +612,11 @@ SEXP LGBM_BoosterSaveModelToString_R(LGBM_SE handle, int64_t out_len = 0; int64_t buf_len = 1024 * 1024; std::vector inner_char_buf(buf_len); - CHECK_CALL(LGBM_BoosterSaveModelToString( - R_GET_PTR(handle), - 0, - Rf_asInteger(num_iteration), - Rf_asInteger(feature_importance_type), - buf_len, - &out_len, - inner_char_buf.data() - )); - + CHECK_CALL(LGBM_BoosterSaveModelToString(R_GET_PTR(handle), 0, Rf_asInteger(num_iteration), Rf_asInteger(feature_importance_type), buf_len, &out_len, inner_char_buf.data())); model_str = PROTECT(Rf_allocVector(STRSXP, 1)); SET_STRING_ELT(model_str, 0, Rf_mkChar(inner_char_buf.data())); UNPROTECT(1); return model_str; - - // EncodeChar(out_str, inner_char_buf.data(), buffer_len, actual_len, static_cast(out_len)); R_API_END(); } @@ -647,22 +628,11 @@ SEXP LGBM_BoosterDumpModel_R(LGBM_SE handle, int64_t out_len = 0; int64_t buf_len = 1024 * 1024; std::vector inner_char_buf(buf_len); - CHECK_CALL(LGBM_BoosterDumpModel( - R_GET_PTR(handle), - 0, - Rf_asInteger(num_iteration), - Rf_asInteger(feature_importance_type), - buf_len, - &out_len, - inner_char_buf.data() - )); - + CHECK_CALL(LGBM_BoosterDumpModel(R_GET_PTR(handle), 0, Rf_asInteger(num_iteration), Rf_asInteger(feature_importance_type), buf_len, &out_len, inner_char_buf.data())); model_str = PROTECT(Rf_allocVector(STRSXP, 1)); SET_STRING_ELT(model_str, 0, Rf_mkChar(inner_char_buf.data())); UNPROTECT(1); return model_str; - - // EncodeChar(out_str, inner_char_buf.data(), buffer_len, actual_len, static_cast(out_len)); R_API_END(); } From 1f07b058e2498b39122ab6c366149bbedb24c1e3 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 5 May 2021 09:05:34 -0500 Subject: [PATCH 05/14] update docs --- R-package/src/lightgbm_R.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/R-package/src/lightgbm_R.h b/R-package/src/lightgbm_R.h index b8c8e2e53b3c..fc6a56d5f5bb 100644 --- a/R-package/src/lightgbm_R.h +++ b/R-package/src/lightgbm_R.h @@ -112,9 +112,9 @@ LIGHTGBM_C_EXPORT SEXP LGBM_DatasetSetFeatureNames_R( ); /*! -* \brief save feature names to Dataset -* \param handle handle -* \return 0 when succeed, -1 when failure happens +* \brief get feature names from Dataset +* \param handle Dataset handle +* \return an R character vector with feature names from the Dataset or NULL if no feature names */ LIGHTGBM_C_EXPORT SEXP LGBM_DatasetGetFeatureNames_R( LGBM_SE handle @@ -557,9 +557,10 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterSaveModel_R( /*! * \brief create string containing model -* \param handle handle +* \param handle Booster handle * \param num_iteration, <= 0 means save all -* \return 0 when succeed, -1 when failure happens +* \param feature_importance_type type of feature importance, 0: split, 1: gain +* \return length-1 R character vector with model string */ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterSaveModelToString_R( LGBM_SE handle, @@ -569,10 +570,10 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterSaveModelToString_R( /*! * \brief dump model to json -* \param handle handle +* \param handle Booster handle * \param num_iteration, <= 0 means save all -* \param out_str json format string of model -* \return 0 when succeed, -1 when failure happens +* \param feature_importance_type type of feature importance, 0: split, 1: gain +* \return length-1 R character vector with model JSON */ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterDumpModel_R( LGBM_SE handle, From f84b84d86f57f2212558e9dddfbf185b21a9b12f Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 5 May 2021 09:14:16 -0500 Subject: [PATCH 06/14] remove comment --- R-package/R/lgb.Booster.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index acee33f176f0..960f349c3c57 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -466,7 +466,6 @@ Booster <- R6::R6Class( num_iteration <- self$best_iter } - # Call buffer model_str <- .Call( LGBM_BoosterSaveModelToString_R , private$handle From 147e1c29539fc0394680968a7c2c51b6f8592d68 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 6 May 2021 20:35:41 -0500 Subject: [PATCH 07/14] add handling for larger model strings --- R-package/src/lightgbm_R.cpp | 10 ++++++++++ R-package/src/lightgbm_R.h | 8 ++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index d47edf280401..eb0bfaa8259e 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -613,6 +613,11 @@ SEXP LGBM_BoosterSaveModelToString_R(LGBM_SE handle, int64_t buf_len = 1024 * 1024; std::vector inner_char_buf(buf_len); CHECK_CALL(LGBM_BoosterSaveModelToString(R_GET_PTR(handle), 0, Rf_asInteger(num_iteration), Rf_asInteger(feature_importance_type), buf_len, &out_len, inner_char_buf.data())); + // if the model string was larger than the initial buffer, allocate a bigger buffer and try again + if (out_len > buf_len) { + buf_len = out_len; + CHECK_CALL(LGBM_BoosterSaveModelToString(R_GET_PTR(handle), 0, Rf_asInteger(num_iteration), Rf_asInteger(feature_importance_type), buf_len, &out_len, inner_char_buf.data())); + } model_str = PROTECT(Rf_allocVector(STRSXP, 1)); SET_STRING_ELT(model_str, 0, Rf_mkChar(inner_char_buf.data())); UNPROTECT(1); @@ -629,6 +634,11 @@ SEXP LGBM_BoosterDumpModel_R(LGBM_SE handle, int64_t buf_len = 1024 * 1024; std::vector inner_char_buf(buf_len); CHECK_CALL(LGBM_BoosterDumpModel(R_GET_PTR(handle), 0, Rf_asInteger(num_iteration), Rf_asInteger(feature_importance_type), buf_len, &out_len, inner_char_buf.data())); + // if the model string was larger than the initial buffer, allocate a bigger buffer and try again + if (out_len > buf_len) { + buf_len = out_len; + CHECK_CALL(LGBM_BoosterDumpModel(R_GET_PTR(handle), 0, Rf_asInteger(num_iteration), Rf_asInteger(feature_importance_type), buf_len, &out_len, inner_char_buf.data())); + } model_str = PROTECT(Rf_allocVector(STRSXP, 1)); SET_STRING_ELT(model_str, 0, Rf_mkChar(inner_char_buf.data())); UNPROTECT(1); diff --git a/R-package/src/lightgbm_R.h b/R-package/src/lightgbm_R.h index 047c2503a609..115fdb22241d 100644 --- a/R-package/src/lightgbm_R.h +++ b/R-package/src/lightgbm_R.h @@ -101,7 +101,7 @@ LIGHTGBM_C_EXPORT SEXP LGBM_DatasetGetSubset_R( * \brief save feature names to Dataset * \param handle handle * \param feature_names feature names -* \return R NULL value +* \return R character vector of feature names */ LIGHTGBM_C_EXPORT SEXP LGBM_DatasetSetFeatureNames_R( LGBM_SE handle, @@ -385,7 +385,7 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterGetLowerBoundValue_R( /*! * \brief Get Name of eval * \param handle Handle of booster -* \return 0 when succeed, -1 when failure happens +* \return R character vector with names of eval sets */ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterGetEvalNames_R( LGBM_SE handle @@ -579,7 +579,7 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterSaveModel_R( * \param handle Booster handle * \param num_iteration, <= 0 means save all * \param feature_importance_type type of feature importance, 0: split, 1: gain -* \return length-1 R character vector with model string +* \return R character vector (length=1) with model string */ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterSaveModelToString_R( LGBM_SE handle, @@ -592,7 +592,7 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterSaveModelToString_R( * \param handle Booster handle * \param num_iteration, <= 0 means save all * \param feature_importance_type type of feature importance, 0: split, 1: gain -* \return length-1 R character vector with model JSON +* \return R character vector (length=1) with model JSON */ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterDumpModel_R( LGBM_SE handle, From c272765079ac1625dbac2c32eb6bc8162b42882e Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 6 May 2021 20:57:09 -0500 Subject: [PATCH 08/14] handle large strings in feature and eval names --- R-package/src/lightgbm_R.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index eb0bfaa8259e..42b7eaee4a6b 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -159,6 +159,18 @@ SEXP LGBM_DatasetGetFeatureNames_R(LGBM_SE handle) { len, &out_len, reserved_string_size, &required_string_size, ptr_names.data())); + // if any feature names were larger than allocated size, + // allow for a larger size and try again + if (required_string_size > reserved_string_size) { + CHECK_CALL( + LGBM_DatasetGetFeatureNames( + R_GET_PTR(handle), + len, + &out_len, + required_string_size, + &required_string_size, + ptr_names.data())); + } CHECK_EQ(len, out_len); CHECK_GE(reserved_string_size, required_string_size); feature_names = PROTECT(Rf_allocVector(STRSXP, len)); @@ -442,6 +454,18 @@ SEXP LGBM_BoosterGetEvalNames_R(LGBM_SE handle) { len, &out_len, reserved_string_size, &required_string_size, ptr_names.data())); + // if any eval names were larger than allocated size, + // allow for a larger size and try again + if (required_string_size > reserved_string_size) { + CHECK_CALL( + LGBM_BoosterGetEvalNames( + R_GET_PTR(handle), + len, + &out_len, + required_string_size, + &required_string_size, + ptr_names.data())); + } CHECK_EQ(out_len, len); CHECK_GE(reserved_string_size, required_string_size); eval_names = PROTECT(Rf_allocVector(STRSXP, len)); From 31ce4bd0fd20ab48cf042c548f932ba85bcd286a Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 6 May 2021 23:30:55 -0500 Subject: [PATCH 09/14] got long feature names working --- R-package/src/lightgbm_R.cpp | 11 ++++++++++- R-package/tests/testthat/test_dataset.R | 18 ++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 42b7eaee4a6b..f472123b64e5 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -162,6 +162,10 @@ SEXP LGBM_DatasetGetFeatureNames_R(LGBM_SE handle) { // if any feature names were larger than allocated size, // allow for a larger size and try again if (required_string_size > reserved_string_size) { + for (int i = 0; i < len; ++i) { + names[i].resize(required_string_size); + ptr_names[i] = names[i].data(); + } CHECK_CALL( LGBM_DatasetGetFeatureNames( R_GET_PTR(handle), @@ -172,7 +176,6 @@ SEXP LGBM_DatasetGetFeatureNames_R(LGBM_SE handle) { ptr_names.data())); } CHECK_EQ(len, out_len); - CHECK_GE(reserved_string_size, required_string_size); feature_names = PROTECT(Rf_allocVector(STRSXP, len)); for (int i = 0; i < len; ++i) { SET_STRING_ELT(feature_names, i, Rf_mkChar(ptr_names[i])); @@ -457,6 +460,12 @@ SEXP LGBM_BoosterGetEvalNames_R(LGBM_SE handle) { // if any eval names were larger than allocated size, // allow for a larger size and try again if (required_string_size > reserved_string_size) { + std::vector> names(len); + std::vector ptr_names(len); + for (int i = 0; i < len; ++i) { + names[i].resize(required_string_size); + ptr_names[i] = names[i].data(); + } CHECK_CALL( LGBM_BoosterGetEvalNames( R_GET_PTR(handle), diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index 6d3fb188e7b4..a815fda5bcab 100644 --- a/R-package/tests/testthat/test_dataset.R +++ b/R-package/tests/testthat/test_dataset.R @@ -278,3 +278,21 @@ test_that("lgb.Dataset: should be able to run lgb.cv() immediately after using l expect_is(bst, "lgb.CVBooster") }) + +test_that("lgb.Dataset: should be able to use and retrieve long feature names", { + # set one feature to a value longer than the default buffer size used + # in LGBM_DatasetGetFeatureNames_R + feature_names <- names(iris) + long_name <- paste0(rep("a", 1e3), collapse = "") + feature_names[1L] <- long_name + names(iris) <- feature_names + # check that feature name survived the trip from R to C++ and back + dtrain <- lgb.Dataset( + data = as.matrix(iris[, -5L]) + , label = as.numeric(iris$Species) - 1L + ) + dtrain$construct() + col_names <- dtrain$get_colnames() + expect_equal(col_names[1L], long_name) + expect_equal(nchar(col_names[1L]), 1e3) +}) From 1ca0f5ee23f7cc1323b3e57f807c36e462d8be5d Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 7 May 2021 11:33:38 -0500 Subject: [PATCH 10/14] more fixes --- R-package/src/lightgbm_R.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index f472123b64e5..6529bf44af77 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -460,8 +460,6 @@ SEXP LGBM_BoosterGetEvalNames_R(LGBM_SE handle) { // if any eval names were larger than allocated size, // allow for a larger size and try again if (required_string_size > reserved_string_size) { - std::vector> names(len); - std::vector ptr_names(len); for (int i = 0; i < len; ++i) { names[i].resize(required_string_size); ptr_names[i] = names[i].data(); @@ -476,7 +474,6 @@ SEXP LGBM_BoosterGetEvalNames_R(LGBM_SE handle) { ptr_names.data())); } CHECK_EQ(out_len, len); - CHECK_GE(reserved_string_size, required_string_size); eval_names = PROTECT(Rf_allocVector(STRSXP, len)); for (int i = 0; i < len; ++i) { SET_STRING_ELT(eval_names, i, Rf_mkChar(ptr_names[i])); From d5a47aae01e6611530a6fc54471c523cd9611a79 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 7 May 2021 12:15:22 -0500 Subject: [PATCH 11/14] linting --- R-package/tests/testthat/test_dataset.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index a815fda5bcab..3380b831aa12 100644 --- a/R-package/tests/testthat/test_dataset.R +++ b/R-package/tests/testthat/test_dataset.R @@ -283,7 +283,7 @@ test_that("lgb.Dataset: should be able to use and retrieve long feature names", # set one feature to a value longer than the default buffer size used # in LGBM_DatasetGetFeatureNames_R feature_names <- names(iris) - long_name <- paste0(rep("a", 1e3), collapse = "") + long_name <- paste0(rep("a", 1000L), collapse = "") feature_names[1L] <- long_name names(iris) <- feature_names # check that feature name survived the trip from R to C++ and back @@ -294,5 +294,5 @@ test_that("lgb.Dataset: should be able to use and retrieve long feature names", dtrain$construct() col_names <- dtrain$get_colnames() expect_equal(col_names[1L], long_name) - expect_equal(nchar(col_names[1L]), 1e3) + expect_equal(nchar(col_names[1L]), 1000L) }) From 179bcec74521eea8d33beb671e23bdcff13701d5 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 7 May 2021 13:05:00 -0500 Subject: [PATCH 12/14] resize --- R-package/src/lightgbm_R.cpp | 8 +-- R-package/tests/testthat/test_lgb.Booster.R | 62 +++++++++++++++++++++ 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 6529bf44af77..fbdb5c371ae8 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -645,8 +645,8 @@ SEXP LGBM_BoosterSaveModelToString_R(LGBM_SE handle, CHECK_CALL(LGBM_BoosterSaveModelToString(R_GET_PTR(handle), 0, Rf_asInteger(num_iteration), Rf_asInteger(feature_importance_type), buf_len, &out_len, inner_char_buf.data())); // if the model string was larger than the initial buffer, allocate a bigger buffer and try again if (out_len > buf_len) { - buf_len = out_len; - CHECK_CALL(LGBM_BoosterSaveModelToString(R_GET_PTR(handle), 0, Rf_asInteger(num_iteration), Rf_asInteger(feature_importance_type), buf_len, &out_len, inner_char_buf.data())); + inner_char_buf.resize(out_len); + CHECK_CALL(LGBM_BoosterSaveModelToString(R_GET_PTR(handle), 0, Rf_asInteger(num_iteration), Rf_asInteger(feature_importance_type), out_len, &out_len, inner_char_buf.data())); } model_str = PROTECT(Rf_allocVector(STRSXP, 1)); SET_STRING_ELT(model_str, 0, Rf_mkChar(inner_char_buf.data())); @@ -666,8 +666,8 @@ SEXP LGBM_BoosterDumpModel_R(LGBM_SE handle, CHECK_CALL(LGBM_BoosterDumpModel(R_GET_PTR(handle), 0, Rf_asInteger(num_iteration), Rf_asInteger(feature_importance_type), buf_len, &out_len, inner_char_buf.data())); // if the model string was larger than the initial buffer, allocate a bigger buffer and try again if (out_len > buf_len) { - buf_len = out_len; - CHECK_CALL(LGBM_BoosterDumpModel(R_GET_PTR(handle), 0, Rf_asInteger(num_iteration), Rf_asInteger(feature_importance_type), buf_len, &out_len, inner_char_buf.data())); + inner_char_buf.resize(out_len); + CHECK_CALL(LGBM_BoosterDumpModel(R_GET_PTR(handle), 0, Rf_asInteger(num_iteration), Rf_asInteger(feature_importance_type), out_len, &out_len, inner_char_buf.data())); } model_str = PROTECT(Rf_allocVector(STRSXP, 1)); SET_STRING_ELT(model_str, 0, Rf_mkChar(inner_char_buf.data())); diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 2b9e78b9ea87..bd6281e64189 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -240,6 +240,68 @@ test_that("Loading a Booster from a string works", { expect_identical(pred, pred2) }) +test_that("Saving a large model to string should work", { + set.seed(708L) + data(agaricus.train, package = "lightgbm") + train <- agaricus.train + bst <- lightgbm( + data = as.matrix(train$data) + , label = train$label + , num_leaves = 100L + , learning_rate = 0.01 + , nrounds = 500L + , objective = "binary" + , save_name = tempfile(fileext = ".model") + , verbose = -1L + ) + + pred <- predict(bst, train$data) + model_string <- bst$save_model_to_string() + + # make sure this test is still producing a model bigger than the default + # buffer size used in LGBM_BoosterSaveModelToString_R + expect_gt(nchar(model_string), 1024L * 1024L) + + # finalize the booster and destroy it so you know we aren't cheating + bst$finalize() + expect_null(bst$.__enclos_env__$private$handle) + rm(bst) + + # make sure a new model can be created from this string, and that it + # produces expected results + bst2 <- lgb.load( + model_str = model_string + ) + pred2 <- predict(bst2, train$data) + expect_identical(pred, pred2) +}) + +test_that("Saving a large model to JSON should work", { + set.seed(708L) + data(agaricus.train, package = "lightgbm") + train <- agaricus.train + bst <- lightgbm( + data = as.matrix(train$data) + , label = train$label + , num_leaves = 100L + , learning_rate = 0.01 + , nrounds = 200L + , objective = "binary" + , save_name = tempfile(fileext = ".model") + , verbose = -1L + ) + + model_json <- bst$dump_model() + + # make sure this test is still producing a model bigger than the default + # buffer size used in LGBM_BoosterSaveModelToString + expect_gt(nchar(model_json), 1024L * 1024L) + + # check that it is valid JSON that looks like a LightGBM model + model_list <- jsonlite::fromJSON(model_json) + expect_equal(model_list[["objective"]], "binary sigmoid:1") +}) + test_that("If a string and a file are both passed to lgb.load() the file is used model_str is totally ignored", { set.seed(708L) data(agaricus.train, package = "lightgbm") From d5f51784b1ed3b4968b4828a62e7779e63614716 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 8 May 2021 23:28:51 -0500 Subject: [PATCH 13/14] Apply suggestions from code review Co-authored-by: Nikita Titov --- R-package/src/lightgbm_R.h | 6 +++--- R-package/tests/testthat/test_lgb.Booster.R | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R-package/src/lightgbm_R.h b/R-package/src/lightgbm_R.h index 115fdb22241d..28117c414658 100644 --- a/R-package/src/lightgbm_R.h +++ b/R-package/src/lightgbm_R.h @@ -383,9 +383,9 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterGetLowerBoundValue_R( ); /*! -* \brief Get Name of eval +* \brief Get names of eval metrics * \param handle Handle of booster -* \return R character vector with names of eval sets +* \return R character vector with names of eval metrics */ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterGetEvalNames_R( LGBM_SE handle @@ -588,7 +588,7 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterSaveModelToString_R( ); /*! -* \brief dump model to json +* \brief dump model to JSON * \param handle Booster handle * \param num_iteration, <= 0 means save all * \param feature_importance_type type of feature importance, 0: split, 1: gain diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index bd6281e64189..799681488834 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -294,7 +294,7 @@ test_that("Saving a large model to JSON should work", { model_json <- bst$dump_model() # make sure this test is still producing a model bigger than the default - # buffer size used in LGBM_BoosterSaveModelToString + # buffer size used in LGBM_BoosterDumpModel_R expect_gt(nchar(model_json), 1024L * 1024L) # check that it is valid JSON that looks like a LightGBM model From 6430f62dfc683449f6ac01d51a67505c7947efbc Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 8 May 2021 23:42:01 -0500 Subject: [PATCH 14/14] stricter test --- R-package/src/lightgbm_R.cpp | 12 ++++++++---- R-package/tests/testthat/test_lgb.Booster.R | 6 ++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index fbdb5c371ae8..3b9cc16c11ae 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -641,12 +641,14 @@ SEXP LGBM_BoosterSaveModelToString_R(LGBM_SE handle, R_API_BEGIN(); int64_t out_len = 0; int64_t buf_len = 1024 * 1024; + int64_t num_iter = Rf_asInteger(num_iteration); + int64_t importance_type = Rf_asInteger(feature_importance_type); std::vector inner_char_buf(buf_len); - CHECK_CALL(LGBM_BoosterSaveModelToString(R_GET_PTR(handle), 0, Rf_asInteger(num_iteration), Rf_asInteger(feature_importance_type), buf_len, &out_len, inner_char_buf.data())); + CHECK_CALL(LGBM_BoosterSaveModelToString(R_GET_PTR(handle), 0, num_iter, importance_type, buf_len, &out_len, inner_char_buf.data())); // if the model string was larger than the initial buffer, allocate a bigger buffer and try again if (out_len > buf_len) { inner_char_buf.resize(out_len); - CHECK_CALL(LGBM_BoosterSaveModelToString(R_GET_PTR(handle), 0, Rf_asInteger(num_iteration), Rf_asInteger(feature_importance_type), out_len, &out_len, inner_char_buf.data())); + CHECK_CALL(LGBM_BoosterSaveModelToString(R_GET_PTR(handle), 0, num_iter, importance_type, out_len, &out_len, inner_char_buf.data())); } model_str = PROTECT(Rf_allocVector(STRSXP, 1)); SET_STRING_ELT(model_str, 0, Rf_mkChar(inner_char_buf.data())); @@ -662,12 +664,14 @@ SEXP LGBM_BoosterDumpModel_R(LGBM_SE handle, R_API_BEGIN(); int64_t out_len = 0; int64_t buf_len = 1024 * 1024; + int64_t num_iter = Rf_asInteger(num_iteration); + int64_t importance_type = Rf_asInteger(feature_importance_type); std::vector inner_char_buf(buf_len); - CHECK_CALL(LGBM_BoosterDumpModel(R_GET_PTR(handle), 0, Rf_asInteger(num_iteration), Rf_asInteger(feature_importance_type), buf_len, &out_len, inner_char_buf.data())); + CHECK_CALL(LGBM_BoosterDumpModel(R_GET_PTR(handle), 0, num_iter, importance_type, buf_len, &out_len, inner_char_buf.data())); // if the model string was larger than the initial buffer, allocate a bigger buffer and try again if (out_len > buf_len) { inner_char_buf.resize(out_len); - CHECK_CALL(LGBM_BoosterDumpModel(R_GET_PTR(handle), 0, Rf_asInteger(num_iteration), Rf_asInteger(feature_importance_type), out_len, &out_len, inner_char_buf.data())); + CHECK_CALL(LGBM_BoosterDumpModel(R_GET_PTR(handle), 0, num_iter, importance_type, out_len, &out_len, inner_char_buf.data())); } model_str = PROTECT(Rf_allocVector(STRSXP, 1)); SET_STRING_ELT(model_str, 0, Rf_mkChar(inner_char_buf.data())); diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 799681488834..4f39a036590c 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -256,6 +256,8 @@ test_that("Saving a large model to string should work", { ) pred <- predict(bst, train$data) + pred_leaf_indx <- predict(bst, train$data, predleaf = TRUE) + pred_raw_score <- predict(bst, train$data, rawscore = TRUE) model_string <- bst$save_model_to_string() # make sure this test is still producing a model bigger than the default @@ -273,7 +275,11 @@ test_that("Saving a large model to string should work", { model_str = model_string ) pred2 <- predict(bst2, train$data) + pred2_leaf_indx <- predict(bst2, train$data, predleaf = TRUE) + pred2_raw_score <- predict(bst2, train$data, rawscore = TRUE) expect_identical(pred, pred2) + expect_identical(pred_leaf_indx, pred2_leaf_indx) + expect_identical(pred_raw_score, pred2_raw_score) }) test_that("Saving a large model to JSON should work", {