diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R
index d64e82fd894e..70dc5c6491f6 100644
--- a/R-package/R/lgb.Booster.R
+++ b/R-package/R/lgb.Booster.R
@@ -483,6 +483,7 @@ Booster <- R6::R6Class(

     # Predict on new data
     predict = function(data,
+                       start_iteration = NULL,
                        num_iteration = NULL,
                        rawscore = FALSE,
                        predleaf = FALSE,
@@ -494,10 +495,14 @@ Booster <- R6::R6Class(
       if (is.null(num_iteration)) {
         num_iteration <- self$best_iter
       }
+      # If start_iteration is missing, predict from the first iteration
+      if (is.null(start_iteration)) {
+        start_iteration <- 0L
+      }

       # Predict on new data
       predictor <- Predictor$new(private$handle, ...)
-      predictor$predict(data, num_iteration, rawscore, predleaf, predcontrib, header, reshape)
+      predictor$predict(data, start_iteration, num_iteration, rawscore, predleaf, predcontrib, header, reshape)

     },
@@ -698,7 +703,14 @@ Booster <- R6::R6Class(
 #' @description Predicted values based on class \code{lgb.Booster}
 #' @param object Object of class \code{lgb.Booster}
 #' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
-#' @param num_iteration number of iteration want to predict with, NULL or <= 0 means use best iteration
+#' @param start_iteration int or None, optional (default=None)
+#'        Start index of the iteration to predict.
+#'        If None or <= 0, starts from the first iteration.
+#' @param num_iteration int or None, optional (default=None)
+#'        Limit number of iterations in the prediction.
+#'        If None, if the best iteration exists and start_iteration is None or <= 0, the
+#'        best iteration is used; otherwise, all iterations from start_iteration are used.
+#'        If <= 0, all iterations from start_iteration are used (no limits).
 #' @param rawscore whether the prediction should be returned in the form of original untransformed
 #'        sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE}
 #'        for logistic regression would result in predictions for log-odds instead of probabilities.
@@ -740,6 +752,7 @@ Booster <- R6::R6Class(
 #' @export
 predict.lgb.Booster <- function(object,
                                 data,
+                                start_iteration = NULL,
                                 num_iteration = NULL,
                                 rawscore = FALSE,
                                 predleaf = FALSE,
@@ -756,6 +769,7 @@ predict.lgb.Booster <- function(object,
   # Return booster predictions
   object$predict(
     data
+    , start_iteration
     , num_iteration
     , rawscore
     , predleaf
diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R
index fa2b14c94614..bdada64f576d 100644
--- a/R-package/R/lgb.Predictor.R
+++ b/R-package/R/lgb.Predictor.R
@@ -76,6 +76,7 @@ Predictor <- R6::R6Class(

     # Predict from data
     predict = function(data,
+                       start_iteration = NULL,
                        num_iteration = NULL,
                        rawscore = FALSE,
                        predleaf = FALSE,
@@ -87,6 +88,10 @@ Predictor <- R6::R6Class(
       if (is.null(num_iteration)) {
         num_iteration <- -1L
       }
+      # If start_iteration is missing, set it to 0 (start from the first iteration)
+      if (is.null(start_iteration)) {
+        start_iteration <- 0L
+      }

       # Set temporary variable
       num_row <- 0L
@@ -108,6 +113,7 @@ Predictor <- R6::R6Class(
           , as.integer(rawscore)
           , as.integer(predleaf)
           , as.integer(predcontrib)
+          , as.integer(start_iteration)
           , as.integer(num_iteration)
           , private$params
           , lgb.c_str(tmp_filename)
@@ -134,6 +140,7 @@ Predictor <- R6::R6Class(
           , as.integer(rawscore)
           , as.integer(predleaf)
           , as.integer(predcontrib)
+          , as.integer(start_iteration)
           , as.integer(num_iteration)
         )
@@ -156,6 +163,7 @@ Predictor <- R6::R6Class(
           , as.integer(rawscore)
           , as.integer(predleaf)
           , as.integer(predcontrib)
+          , as.integer(start_iteration)
           , as.integer(num_iteration)
           , private$params
         )
@@ -178,6 +186,7 @@ Predictor <- R6::R6Class(
           , as.integer(rawscore)
           , as.integer(predleaf)
           , as.integer(predcontrib)
+          , as.integer(start_iteration)
           , as.integer(num_iteration)
           , private$params
         )
diff --git a/R-package/man/predict.lgb.Booster.Rd b/R-package/man/predict.lgb.Booster.Rd
index 395c2d45ea37..3f56d0886648 100644
--- a/R-package/man/predict.lgb.Booster.Rd
+++ b/R-package/man/predict.lgb.Booster.Rd
@@ -7,6 +7,7 @@
 \method{predict}{lgb.Booster}(
   object,
   data,
+  start_iteration = NULL,
   num_iteration = NULL,
   rawscore = FALSE,
   predleaf = FALSE,
@@ -21,7 +22,15 @@
 \item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename}

-\item{num_iteration}{number of iteration want to predict with, NULL or <= 0 means use best iteration}
+\item{start_iteration}{int or None, optional (default=None)
+Start index of the iteration to predict.
+If None or <= 0, starts from the first iteration.}
+
+\item{num_iteration}{int or None, optional (default=None)
+Limit number of iterations in the prediction.
+If None, if the best iteration exists and start_iteration is None or <= 0, the
+best iteration is used; otherwise, all iterations from start_iteration are used.
+If <= 0, all iterations from start_iteration are used (no limits).}

 \item{rawscore}{whether the prediction should be returned in the form of original untransformed
 sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE}
for logistic regression would result in predictions for log-odds instead of probabilities.}
diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp
index 8aff687378dd..f6dc82e9bd04 100644
--- a/R-package/src/lightgbm_R.cpp
+++ b/R-package/src/lightgbm_R.cpp
@@ -541,6 +541,7 @@ LGBM_SE LGBM_BoosterPredictForFile_R(LGBM_SE handle,
     LGBM_SE is_rawscore,
     LGBM_SE is_leafidx,
     LGBM_SE is_predcontrib,
+    LGBM_SE start_iteration,
     LGBM_SE num_iteration,
     LGBM_SE parameter,
     LGBM_SE result_filename,
@@ -548,7 +549,7 @@ LGBM_SE LGBM_BoosterPredictForFile_R(LGBM_SE handle,
   R_API_BEGIN();
   int pred_type = GetPredictType(is_rawscore, is_leafidx, is_predcontrib);
   CHECK_CALL(LGBM_BoosterPredictForFile(R_GET_PTR(handle), R_CHAR_PTR(data_filename),
-    R_AS_INT(data_has_header), pred_type, R_AS_INT(num_iteration), R_CHAR_PTR(parameter),
+    R_AS_INT(data_has_header), pred_type, R_AS_INT(start_iteration), R_AS_INT(num_iteration), R_CHAR_PTR(parameter),
     R_CHAR_PTR(result_filename)));
   R_API_END();
 }
@@ -558,6 +559,7 @@ LGBM_SE LGBM_BoosterCalcNumPredict_R(LGBM_SE handle,
     LGBM_SE is_rawscore,
     LGBM_SE is_leafidx,
     LGBM_SE is_predcontrib,
+    LGBM_SE start_iteration,
     LGBM_SE num_iteration,
     LGBM_SE out_len,
     LGBM_SE call_state) {
@@ -565,7 +567,7 @@ LGBM_SE LGBM_BoosterCalcNumPredict_R(LGBM_SE handle,
   int pred_type = GetPredictType(is_rawscore, is_leafidx, is_predcontrib);
   int64_t len = 0;
   CHECK_CALL(LGBM_BoosterCalcNumPredict(R_GET_PTR(handle), R_AS_INT(num_row),
-    pred_type, R_AS_INT(num_iteration), &len));
+    pred_type, R_AS_INT(start_iteration), R_AS_INT(num_iteration), &len));
   R_INT_PTR(out_len)[0] = static_cast<int>(len);
   R_API_END();
 }
@@ -580,6 +582,7 @@ LGBM_SE LGBM_BoosterPredictForCSC_R(LGBM_SE handle,
     LGBM_SE is_rawscore,
     LGBM_SE is_leafidx,
     LGBM_SE is_predcontrib,
+    LGBM_SE start_iteration,
     LGBM_SE num_iteration,
     LGBM_SE parameter,
     LGBM_SE out_result,
@@ -599,7 +602,7 @@ LGBM_SE LGBM_BoosterPredictForCSC_R(LGBM_SE handle,
   CHECK_CALL(LGBM_BoosterPredictForCSC(R_GET_PTR(handle),
     p_indptr, C_API_DTYPE_INT32, p_indices,
     p_data, C_API_DTYPE_FLOAT64, nindptr, ndata,
-    nrow, pred_type, R_AS_INT(num_iteration), R_CHAR_PTR(parameter), &out_len, ptr_ret));
+    nrow, pred_type, R_AS_INT(start_iteration), R_AS_INT(num_iteration), R_CHAR_PTR(parameter), &out_len, ptr_ret));
   R_API_END();
 }
@@ -610,6 +613,7 @@ LGBM_SE LGBM_BoosterPredictForMat_R(LGBM_SE handle,
     LGBM_SE is_rawscore,
     LGBM_SE is_leafidx,
     LGBM_SE is_predcontrib,
+    LGBM_SE start_iteration,
     LGBM_SE num_iteration,
     LGBM_SE parameter,
     LGBM_SE out_result,
@@ -625,7 +629,7 @@ LGBM_SE LGBM_BoosterPredictForMat_R(LGBM_SE handle,
   int64_t out_len;
   CHECK_CALL(LGBM_BoosterPredictForMat(R_GET_PTR(handle),
     p_mat, C_API_DTYPE_FLOAT64, nrow, ncol, COL_MAJOR,
-    pred_type, R_AS_INT(num_iteration), R_CHAR_PTR(parameter), &out_len, ptr_ret));
+    pred_type, R_AS_INT(start_iteration), R_AS_INT(num_iteration), R_CHAR_PTR(parameter), &out_len, ptr_ret));
   R_API_END();
 }
@@ -706,10 +710,10 @@ static const R_CallMethodDef CallEntries[] = {
   {"LGBM_BoosterGetEval_R"          , (DL_FUNC) &LGBM_BoosterGetEval_R          , 4},
   {"LGBM_BoosterGetNumPredict_R"    , (DL_FUNC) &LGBM_BoosterGetNumPredict_R    , 4},
   {"LGBM_BoosterGetPredict_R"       , (DL_FUNC) &LGBM_BoosterGetPredict_R       , 4},
-  {"LGBM_BoosterPredictForFile_R"   , (DL_FUNC) &LGBM_BoosterPredictForFile_R   , 10},
-  {"LGBM_BoosterCalcNumPredict_R"   , (DL_FUNC) &LGBM_BoosterCalcNumPredict_R   , 8},
-  {"LGBM_BoosterPredictForCSC_R"    , (DL_FUNC) &LGBM_BoosterPredictForCSC_R    , 14},
-  {"LGBM_BoosterPredictForMat_R"    , (DL_FUNC) &LGBM_BoosterPredictForMat_R    , 11},
+  {"LGBM_BoosterPredictForFile_R"   , (DL_FUNC) &LGBM_BoosterPredictForFile_R   , 11},
+  {"LGBM_BoosterCalcNumPredict_R"   , (DL_FUNC) &LGBM_BoosterCalcNumPredict_R   , 9},
+  {"LGBM_BoosterPredictForCSC_R"    , (DL_FUNC) &LGBM_BoosterPredictForCSC_R    , 15},
+  {"LGBM_BoosterPredictForMat_R"    , (DL_FUNC) &LGBM_BoosterPredictForMat_R    , 12},
   {"LGBM_BoosterSaveModel_R"        , (DL_FUNC) &LGBM_BoosterSaveModel_R        , 5},
   {"LGBM_BoosterSaveModelToString_R", (DL_FUNC) &LGBM_BoosterSaveModelToString_R, 7},
   {"LGBM_BoosterDumpModel_R"        , (DL_FUNC) &LGBM_BoosterDumpModel_R        , 7},
diff --git a/R-package/src/lightgbm_R.h b/R-package/src/lightgbm_R.h
index ff4b2cf9fb0e..2b0ddf0cc9f1 100644
--- a/R-package/src/lightgbm_R.h
+++ b/R-package/src/lightgbm_R.h
@@ -489,6 +489,7 @@ LIGHTGBM_C_EXPORT LGBM_SE LGBM_BoosterPredictForFile_R(
   LGBM_SE is_rawscore,
   LGBM_SE is_leafidx,
   LGBM_SE is_predcontrib,
+  LGBM_SE start_iteration,
   LGBM_SE num_iteration,
   LGBM_SE parameter,
   LGBM_SE result_filename,
@@ -511,6 +512,7 @@ LIGHTGBM_C_EXPORT LGBM_SE LGBM_BoosterCalcNumPredict_R(
   LGBM_SE is_rawscore,
   LGBM_SE is_leafidx,
   LGBM_SE is_predcontrib,
+  LGBM_SE start_iteration,
   LGBM_SE num_iteration,
   LGBM_SE out_len,
   LGBM_SE call_state
@@ -545,6 +547,7 @@ LIGHTGBM_C_EXPORT LGBM_SE LGBM_BoosterPredictForCSC_R(
   LGBM_SE is_rawscore,
   LGBM_SE is_leafidx,
   LGBM_SE is_predcontrib,
+  LGBM_SE start_iteration,
   LGBM_SE num_iteration,
   LGBM_SE parameter,
   LGBM_SE out_result,
@@ -574,6 +577,7 @@ LIGHTGBM_C_EXPORT LGBM_SE LGBM_BoosterPredictForMat_R(
   LGBM_SE is_rawscore,
   LGBM_SE is_leafidx,
   LGBM_SE is_predcontrib,
+  LGBM_SE start_iteration,
   LGBM_SE num_iteration,
   LGBM_SE parameter,
   LGBM_SE out_result,
diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R
index c44f0497e4c3..d193cd87712e 100644
--- a/R-package/tests/testthat/test_Predictor.R
+++ b/R-package/tests/testthat/test_Predictor.R
@@ -17,3 +17,63 @@ test_that("predictions do not fail for integer input", {
   pred_double <- predict(fit, X_double)
   expect_equal(pred_integer, pred_double)
 })
+
+test_that("start_iteration works correctly", {
+  set.seed(708L)
+  data(agaricus.train, package = "lightgbm")
+  data(agaricus.test, package = "lightgbm")
+  train <- agaricus.train
+  test <- agaricus.test
+  dtrain <- lgb.Dataset(
+    agaricus.train$data
+    , label = agaricus.train$label
+  )
+  dtest <- lgb.Dataset.create.valid(
+    dtrain
+    , agaricus.test$data
+    , label = agaricus.test$label
+  )
+  bst <- lightgbm(
+    data = as.matrix(train$data)
+    , label = train$label
+    , num_leaves = 4L
+    , learning_rate = 0.6
+    , nrounds = 100L
+    , objective = "binary"
+    , save_name = tempfile(fileext = ".model")
+    , valids = list("test" = dtest)
+    , early_stopping_rounds = 2L
+  )
+  expect_true(lgb.is.Booster(bst))
+  pred1 <- predict(bst, data = test$data, rawscore = TRUE)
+  pred_contrib1 <- predict(bst, test$data, predcontrib = TRUE)
+  pred2 <- rep(0.0, length(pred1))
+  pred_contrib2 <- rep(0.0, length(pred_contrib1))
+  step <- 11L
+  end_iter <- 99L
+  if (bst$best_iter != -1L) {
+    end_iter <- bst$best_iter - 1L
+  }
+  start_iters <- seq(0L, end_iter, by = step)
+  for (start_iter in start_iters) {
+    n_iter <- min(c(end_iter - start_iter + 1L, step))
+    inc_pred <- predict(bst, test$data
+      , start_iteration = start_iter
+      , num_iteration = n_iter
+      , rawscore = TRUE
+    )
+    inc_pred_contrib <- bst$predict(test$data
+      , start_iteration = start_iter
+      , num_iteration = n_iter
+      , predcontrib = TRUE
+    )
+    pred2 <- pred2 + inc_pred
+    pred_contrib2 <- pred_contrib2 + inc_pred_contrib
+  }
+  expect_equal(pred2, pred1)
+  expect_equal(pred_contrib2, pred_contrib1)
+
+  pred_leaf1 <- predict(bst, test$data, predleaf = TRUE)
+  pred_leaf2 <- predict(bst, test$data, start_iteration = 0L, num_iteration = end_iter + 1L, predleaf = TRUE)
+  expect_equal(pred_leaf1, pred_leaf2)
+})
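The additivity property the R test above relies on can be sketched in Python as well (illustrative only; toy data, and it assumes a LightGBM build that includes this change so that `Booster.predict()` accepts `start_iteration`):

```python
import numpy as np
import lightgbm as lgb

# Toy data, purely illustrative
rng = np.random.RandomState(708)
X = rng.rand(500, 10)
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)

bst = lgb.train({"objective": "binary", "verbose": -1},
                lgb.Dataset(X, label=y), num_boost_round=20)

full = bst.predict(X, raw_score=True)  # all 20 iterations in one call
# Raw scores are additive over disjoint iteration ranges: [0, 5), [5, 10), ...
parts = sum(bst.predict(X, start_iteration=s, num_iteration=5, raw_score=True)
            for s in range(0, 20, 5))
np.testing.assert_allclose(parts, full)
```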
diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 483bbcd711e1..14d7a8098cf8 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -767,6 +767,14 @@ Dataset Parameters
 Predict Parameters
 ~~~~~~~~~~~~~~~~~~

+-  ``start_iteration_predict`` :raw-html:`🔗︎`, default = ``0``, type = int
+
+   -  used only in ``prediction`` task
+
+   -  used to specify from which iteration to start the prediction
+
+   -  ``<= 0`` means from the first iteration
+
 -  ``num_iteration_predict`` :raw-html:`🔗︎`, default = ``-1``, type = int

    -  used only in ``prediction`` task
diff --git a/include/LightGBM/boosting.h b/include/LightGBM/boosting.h
index 92f427655106..ca198d37671e 100644
--- a/include/LightGBM/boosting.h
+++ b/include/LightGBM/boosting.h
@@ -123,7 +123,7 @@ class LIGHTGBM_EXPORT Boosting {
   */
   virtual void GetPredictAt(int data_idx, double* result, int64_t* out_len) = 0;

-  virtual int NumPredictOneRow(int num_iteration, bool is_pred_leaf, bool is_pred_contrib) const = 0;
+  virtual int NumPredictOneRow(int start_iteration, int num_iteration, bool is_pred_leaf, bool is_pred_contrib) const = 0;

   /*!
   * \brief Prediction for one record, not sigmoid transform
@@ -284,10 +284,11 @@ class LIGHTGBM_EXPORT Boosting {
   /*!
   * \brief Initial work for the prediction
+  * \param start_iteration Start index of the iteration to predict
   * \param num_iteration number of used iteration
   * \param is_pred_contrib
   */
-  virtual void InitPredict(int num_iteration, bool is_pred_contrib) = 0;
+  virtual void InitPredict(int start_iteration, int num_iteration, bool is_pred_contrib) = 0;

   /*!
   * \brief Name of submodel
diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h
index 86e5a086b7ff..ea34343362b2 100644
--- a/include/LightGBM/c_api.h
+++ b/include/LightGBM/c_api.h
@@ -675,6 +675,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterGetPredict(BoosterHandle handle,
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iterations for prediction, <= 0 means no limit
  * \param parameter Other parameters for prediction, e.g. early stopping for prediction
 * \param result_filename Filename of result file in which predictions will be written
@@ -684,6 +685,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForFile(BoosterHandle handle,
                                                  const char* data_filename,
                                                  int data_has_header,
                                                  int predict_type,
+                                                 int start_iteration,
                                                  int num_iteration,
                                                  const char* parameter,
                                                  const char* result_filename);
@@ -697,6 +699,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForFile(BoosterHandle handle,
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iterations for prediction, <= 0 means no limit
  * \param[out] out_len Length of prediction
  * \return 0 when succeed, -1 when failure happens
@@ -704,6 +707,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForFile(BoosterHandle handle,
 LIGHTGBM_C_EXPORT int LGBM_BoosterCalcNumPredict(BoosterHandle handle,
                                                  int num_row,
                                                  int predict_type,
+                                                 int start_iteration,
                                                  int num_iteration,
                                                  int64_t* out_len);
@@ -736,6 +740,7 @@ LIGHTGBM_C_EXPORT int LGBM_FastConfigFree(FastConfigHandle fastConfig);
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iterations for prediction, <= 0 means no limit
  * \param parameter Other parameters for prediction, e.g. early stopping for prediction
  * \param[out] out_len Length of output result
@@ -752,6 +757,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSR(BoosterHandle handle,
                                                 int64_t nelem,
                                                 int64_t num_col,
                                                 int predict_type,
+                                                int start_iteration,
                                                 int num_iteration,
                                                 const char* parameter,
                                                 int64_t* out_len,
@@ -775,6 +781,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSR(BoosterHandle handle,
  * \param num_col_or_row Number of columns for CSR or number of rows for CSC
  * \param predict_type What should be predicted, only feature contributions supported currently
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iterations for prediction, <= 0 means no limit
  * \param parameter Other parameters for prediction, e.g. early stopping for prediction
  * \param matrix_type Type of matrix input and output, can be ``C_API_MATRIX_TYPE_CSR`` or ``C_API_MATRIX_TYPE_CSC``
@@ -794,6 +801,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictSparseOutput(BoosterHandle handle,
                                                       int64_t nelem,
                                                       int64_t num_col_or_row,
                                                       int predict_type,
+                                                      int start_iteration,
                                                       int num_iteration,
                                                       const char* parameter,
                                                       int matrix_type,
@@ -835,6 +843,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterFreePredictSparse(void* indptr, int32_t* indic
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iterations for prediction, <= 0 means no limit
  * \param parameter Other parameters for prediction, e.g. early stopping for prediction
 * \param[out] out_len Length of output result
@@ -851,6 +860,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle,
                                                          int64_t nelem,
                                                          int64_t num_col,
                                                          int predict_type,
+                                                         int start_iteration,
                                                          int num_iteration,
                                                          const char* parameter,
                                                          int64_t* out_len,
@@ -867,6 +877,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle,
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iterations for prediction, <= 0 means no limit
  * \param data_type Type of ``data`` pointer, can be ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64``
  * \param num_col Number of columns
@@ -876,6 +887,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle,
 LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRowFastInit(BoosterHandle handle,
                                                                  const int predict_type,
+                                                                 const int start_iteration,
                                                                  const int num_iteration,
                                                                  const int data_type,
                                                                  const int64_t num_col,
@@ -944,6 +956,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRowFast(FastConfigHandle fa
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iteration for prediction, <= 0 means no limit
  * \param parameter Other parameters for prediction, e.g. early stopping for prediction
  * \param[out] out_len Length of output result
@@ -960,6 +973,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSC(BoosterHandle handle,
                                                 int64_t nelem,
                                                 int64_t num_row,
                                                 int predict_type,
+                                                int start_iteration,
                                                 int num_iteration,
                                                 const char* parameter,
                                                 int64_t* out_len,
@@ -983,6 +997,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSC(BoosterHandle handle,
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iteration for prediction, <= 0 means no limit
  * \param parameter Other parameters for prediction, e.g. early stopping for prediction
  * \param[out] out_len Length of output result
@@ -996,6 +1011,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMat(BoosterHandle handle,
                                                 int32_t ncol,
                                                 int is_row_major,
                                                 int predict_type,
+                                                int start_iteration,
                                                 int num_iteration,
                                                 const char* parameter,
                                                 int64_t* out_len,
@@ -1019,6 +1035,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMat(BoosterHandle handle,
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iteration for prediction, <= 0 means no limit
  * \param parameter Other parameters for prediction, e.g. early stopping for prediction
 * \param[out] out_len Length of output result
@@ -1031,6 +1048,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle,
                                                          int ncol,
                                                          int is_row_major,
                                                          int predict_type,
+                                                         int start_iteration,
                                                          int num_iteration,
                                                          const char* parameter,
                                                          int64_t* out_len,
@@ -1047,6 +1065,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle,
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iterations for prediction, <= 0 means no limit
  * \param data_type Type of ``data`` pointer, can be ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64``
  * \param ncol Number of columns
@@ -1056,6 +1075,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle,
 LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMatSingleRowFastInit(BoosterHandle handle,
                                                                  const int predict_type,
+                                                                 const int start_iteration,
                                                                  const int num_iteration,
                                                                  const int data_type,
                                                                  const int32_t ncol,
@@ -1104,6 +1124,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMatSingleRowFast(FastConfigHandle fa
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iteration for prediction, <= 0 means no limit
  * \param parameter Other parameters for prediction, e.g. early stopping for prediction
  * \param[out] out_len Length of output result
@@ -1116,6 +1137,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMats(BoosterHandle handle,
                                                  int32_t nrow,
                                                  int32_t ncol,
                                                  int predict_type,
+                                                 int start_iteration,
                                                  int num_iteration,
                                                  const char* parameter,
                                                  int64_t* out_len,
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index ef2d68c76cfd..bfcb09a40049 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -684,6 +684,12 @@ struct Config {

   #pragma region Predict Parameters

+  // [no-save]
+  // desc = used only in ``prediction`` task
+  // desc = used to specify from which iteration to start the prediction
+  // desc = ``<= 0`` means from the first iteration
+  int start_iteration_predict = 0;
+
   // [no-save]
   // desc = used only in ``prediction`` task
   // desc = used to specify how many trained iterations will be used in prediction
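Since `start_iteration_predict` is a CLI parameter for the prediction task, a hypothetical config file illustrating it might look like the following (file names are invented; the parameter names are the ones documented in Parameters.rst):

```
task = predict
data = test.txt
input_model = LightGBM_model.txt
start_iteration_predict = 50
num_iteration_predict = 25
output_result = predict_result.txt
```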
diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
index ea0d9eec12d4..a09841519be2 100644
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -519,7 +519,7 @@ def __getstate__(self):
         this.pop('handle', None)
         return this

-    def predict(self, data, num_iteration=-1,
+    def predict(self, data, start_iteration=0, num_iteration=-1,
                 raw_score=False, pred_leaf=False, pred_contrib=False,
                 data_has_header=False, is_reshape=True):
         """Predict logic.

@@ -529,6 +529,8 @@ def predict(self, data, num_iteration=-1,
         data : string, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
             Data source for prediction.
             When data type is string, it represents the path of txt file.
+        start_iteration : int, optional (default=0)
+            Start index of the iteration to predict.
         num_iteration : int, optional (default=-1)
             Iteration used for prediction.
         raw_score : bool, optional (default=False)
@@ -560,8 +562,6 @@ def predict(self, data, num_iteration=-1,
         if pred_contrib:
             predict_type = C_API_PREDICT_CONTRIB
         int_data_has_header = 1 if data_has_header else 0
-        if num_iteration > self.num_total_iteration:
-            num_iteration = self.num_total_iteration

         if isinstance(data, string_type):
             with _TempFile() as f:
@@ -570,6 +570,7 @@ def predict(self, data, num_iteration=-1,
                     c_str(data),
                     ctypes.c_int(int_data_has_header),
                     ctypes.c_int(predict_type),
+                    ctypes.c_int(start_iteration),
                     ctypes.c_int(num_iteration),
                     c_str(self.pred_parameter),
                     c_str(f.name)))
@@ -578,26 +579,26 @@ def predict(self, data, num_iteration=-1,
             preds = [float(token) for line in lines for token in line.split('\t')]
             preds = np.array(preds, dtype=np.float64, copy=False)
         elif isinstance(data, scipy.sparse.csr_matrix):
-            preds, nrow = self.__pred_for_csr(data, num_iteration, predict_type)
+            preds, nrow = self.__pred_for_csr(data, start_iteration, num_iteration, predict_type)
         elif isinstance(data, scipy.sparse.csc_matrix):
-            preds, nrow = self.__pred_for_csc(data, num_iteration, predict_type)
+            preds, nrow = self.__pred_for_csc(data, start_iteration, num_iteration, predict_type)
         elif isinstance(data, np.ndarray):
-            preds, nrow = self.__pred_for_np2d(data, num_iteration, predict_type)
+            preds, nrow = self.__pred_for_np2d(data, start_iteration, num_iteration, predict_type)
         elif isinstance(data, list):
             try:
                 data = np.array(data)
             except BaseException:
                 raise ValueError('Cannot convert data list to numpy array.')
-            preds, nrow = self.__pred_for_np2d(data, num_iteration, predict_type)
+            preds, nrow = self.__pred_for_np2d(data, start_iteration, num_iteration, predict_type)
         elif isinstance(data, DataTable):
-            preds, nrow = self.__pred_for_np2d(data.to_numpy(), num_iteration, predict_type)
+            preds, nrow = self.__pred_for_np2d(data.to_numpy(), start_iteration, num_iteration, predict_type)
         else:
             try:
                 warnings.warn('Converting data to scipy sparse matrix.')
                 csr = scipy.sparse.csr_matrix(data)
             except BaseException:
                 raise TypeError('Cannot predict data for type {}'.format(type(data).__name__))
-            preds, nrow = self.__pred_for_csr(csr, num_iteration, predict_type)
+            preds, nrow = self.__pred_for_csr(csr, start_iteration, num_iteration, predict_type)
         if pred_leaf:
             preds = preds.astype(np.int32)
         is_sparse = scipy.sparse.issparse(preds) or isinstance(preds, list)
@@ -609,7 +610,7 @@ def predict(self, data, num_iteration=-1,
                              % (preds.size, nrow))
         return preds

-    def __get_num_preds(self, num_iteration, nrow, predict_type):
+    def __get_num_preds(self, start_iteration, num_iteration, nrow, predict_type):
         """Get size of prediction result."""
         if nrow > MAX_INT32:
             raise LightGBMError('LightGBM cannot perform prediction for data'
@@ -621,22 +622,23 @@ def __get_num_preds(self, num_iteration, nrow, predict_type):
             self.handle,
             ctypes.c_int(nrow),
             ctypes.c_int(predict_type),
+            ctypes.c_int(start_iteration),
             ctypes.c_int(num_iteration),
             ctypes.byref(n_preds)))
         return n_preds.value

-    def __pred_for_np2d(self, mat, num_iteration, predict_type):
+    def __pred_for_np2d(self, mat, start_iteration, num_iteration, predict_type):
         """Predict for a 2-D numpy matrix."""
         if len(mat.shape) != 2:
             raise ValueError('Input numpy.ndarray or list must be 2 dimensional')

-        def inner_predict(mat, num_iteration, predict_type, preds=None):
+        def inner_predict(mat, start_iteration, num_iteration, predict_type, preds=None):
             if mat.dtype == np.float32 or mat.dtype == np.float64:
                 data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
             else:  # change non-float data to float data, need to copy
                 data = np.array(mat.reshape(mat.size), dtype=np.float32)
             ptr_data, type_ptr_data, _ = c_float_array(data)
-            n_preds = self.__get_num_preds(num_iteration, mat.shape[0], predict_type)
+            n_preds = self.__get_num_preds(start_iteration, num_iteration, mat.shape[0], predict_type)
             if preds is None:
                 preds = np.zeros(n_preds, dtype=np.float64)
             elif len(preds.shape) != 1 or len(preds) != n_preds:
@@ -650,6 +652,7 @@ def inner_predict(mat, num_iteration, predict_type, preds=None):
                 ctypes.c_int(mat.shape[1]),
                 ctypes.c_int(C_API_IS_ROW_MAJOR),
                 ctypes.c_int(predict_type),
+                ctypes.c_int(start_iteration),
                 ctypes.c_int(num_iteration),
                 c_str(self.pred_parameter),
                 ctypes.byref(out_num_preds),
@@ -662,16 +665,16 @@ def inner_predict(mat, num_iteration, predict_type, preds=None):
         if nrow > MAX_INT32:
             sections = np.arange(start=MAX_INT32, stop=nrow, step=MAX_INT32)
             # __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal
-            n_preds = [self.__get_num_preds(num_iteration, i, predict_type) for i in np.diff([0] + list(sections) + [nrow])]
+            n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff([0] + list(sections) + [nrow])]
             n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum()
             preds = np.zeros(sum(n_preds), dtype=np.float64)
             for chunk, (start_idx_pred, end_idx_pred) in zip_(np.array_split(mat, sections),
                                                               zip_(n_preds_sections, n_preds_sections[1:])):
                 # avoid memory consumption by arrays concatenation operations
-                inner_predict(chunk, num_iteration, predict_type, preds[start_idx_pred:end_idx_pred])
+                inner_predict(chunk, start_iteration, num_iteration, predict_type, preds[start_idx_pred:end_idx_pred])
             return preds, nrow
         else:
-            return inner_predict(mat, num_iteration, predict_type)
+            return inner_predict(mat, start_iteration, num_iteration, predict_type)

     def __create_sparse_native(self, cs, out_shape, out_ptr_indptr, out_ptr_indices, out_ptr_data,
                                indptr_type, data_type, is_csr=True):
@@ -719,11 +722,11 @@ def __create_sparse_native(self, cs, out_shape, out_ptr_indptr, out_ptr_indices,
             return cs_output_matrices[0]
         return cs_output_matrices

-    def __pred_for_csr(self, csr, num_iteration, predict_type):
+    def __pred_for_csr(self, csr, start_iteration, num_iteration, predict_type):
         """Predict for a CSR data."""
-        def inner_predict(csr, num_iteration, predict_type, preds=None):
+        def inner_predict(csr, start_iteration, num_iteration, predict_type, preds=None):
             nrow = len(csr.indptr) - 1
-            n_preds = self.__get_num_preds(num_iteration, nrow, predict_type)
+            n_preds = self.__get_num_preds(start_iteration, num_iteration, nrow, predict_type)
             if preds is None:
                 preds = np.zeros(n_preds, dtype=np.float64)
             elif len(preds.shape) != 1 or len(preds) != n_preds:
@@ -747,6 +750,7 @@ def inner_predict(csr, num_iteration, predict_type, preds=None):
                 ctypes.c_int64(len(csr.data)),
                 ctypes.c_int64(csr.shape[1]),
                 ctypes.c_int(predict_type),
+                ctypes.c_int(start_iteration),
                 ctypes.c_int(num_iteration),
                 c_str(self.pred_parameter),
                 ctypes.byref(out_num_preds),
@@ -755,7 +759,7 @@ def inner_predict(csr, num_iteration, predict_type, preds=None):
                 raise ValueError("Wrong length for predict results")
             return preds, nrow

-        def inner_predict_sparse(csr, num_iteration, predict_type):
+        def inner_predict_sparse(csr, start_iteration, num_iteration, predict_type):
            ptr_indptr, type_ptr_indptr, __ = c_int_array(csr.indptr)
            ptr_data, type_ptr_data, _ = c_float_array(csr.data)
            csr_indices = csr.indices.astype(np.int32, copy=False)
@@ -781,6 +785,7 @@ def inner_predict_sparse(csr, num_iteration, predict_type):
                 ctypes.c_int64(len(csr.data)),
                 ctypes.c_int64(csr.shape[1]),
                 ctypes.c_int(predict_type),
+                ctypes.c_int(start_iteration),
                 ctypes.c_int(num_iteration),
                 c_str(self.pred_parameter),
                 ctypes.c_int(matrix_type),
@@ -794,25 +799,25 @@ def inner_predict_sparse(csr, num_iteration, predict_type):
             return matrices, nrow

         if predict_type == C_API_PREDICT_CONTRIB:
-            return inner_predict_sparse(csr, num_iteration, predict_type)
+            return inner_predict_sparse(csr, start_iteration, num_iteration, predict_type)
         nrow = len(csr.indptr) - 1
         if nrow > MAX_INT32:
             sections = [0] + list(np.arange(start=MAX_INT32, stop=nrow, step=MAX_INT32)) + [nrow]
             # __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal
-            n_preds = [self.__get_num_preds(num_iteration, i, predict_type) for i in np.diff(sections)]
+            n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff(sections)]
             n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum()
             preds = np.zeros(sum(n_preds), dtype=np.float64)
             for (start_idx, end_idx), (start_idx_pred, end_idx_pred) in zip_(zip_(sections, sections[1:]),
                                                                              zip_(n_preds_sections, n_preds_sections[1:])):
                 # avoid memory consumption by arrays concatenation operations
-                inner_predict(csr[start_idx:end_idx], num_iteration, predict_type, preds[start_idx_pred:end_idx_pred])
+                inner_predict(csr[start_idx:end_idx], start_iteration, num_iteration, predict_type, preds[start_idx_pred:end_idx_pred])
             return preds, nrow
         else:
-            return inner_predict(csr, num_iteration, predict_type)
+            return inner_predict(csr, start_iteration, num_iteration, predict_type)

-    def __pred_for_csc(self, csc, num_iteration, predict_type):
+    def __pred_for_csc(self, csc, start_iteration, num_iteration, predict_type):
         """Predict for a CSC data."""
-        def inner_predict_sparse(csc, num_iteration, predict_type):
+        def inner_predict_sparse(csc, start_iteration, num_iteration, predict_type):
             ptr_indptr, type_ptr_indptr, __ = c_int_array(csc.indptr)
             ptr_data, type_ptr_data, _ = c_float_array(csc.data)
             csc_indices = csc.indices.astype(np.int32, copy=False)
@@ -838,6 +843,7 @@ def inner_predict_sparse(csc, num_iteration, predict_type):
                 ctypes.c_int64(len(csc.data)),
                 ctypes.c_int64(csc.shape[0]),
                 ctypes.c_int(predict_type),
+                ctypes.c_int(start_iteration),
                 ctypes.c_int(num_iteration),
                 c_str(self.pred_parameter),
                 ctypes.c_int(matrix_type),
@@ -852,10 +858,10 @@ def inner_predict_sparse(csc, num_iteration, predict_type):

         nrow = csc.shape[0]
         if nrow > MAX_INT32:
-            return self.__pred_for_csr(csc.tocsr(), num_iteration, predict_type)
+            return self.__pred_for_csr(csc.tocsr(), start_iteration, num_iteration, predict_type)
         if predict_type == C_API_PREDICT_CONTRIB:
-            return inner_predict_sparse(csc, num_iteration, predict_type)
-        n_preds = self.__get_num_preds(num_iteration, nrow, predict_type)
+            return inner_predict_sparse(csc, start_iteration, num_iteration, predict_type)
+        n_preds = self.__get_num_preds(start_iteration, num_iteration, nrow, predict_type)
         preds = np.zeros(n_preds, dtype=np.float64)
         out_num_preds = ctypes.c_int64(0)
@@ -876,6 +882,7 @@ def inner_predict_sparse(csc, num_iteration, predict_type):
             ctypes.c_int64(len(csc.data)),
             ctypes.c_int64(csc.shape[0]),
             ctypes.c_int(predict_type),
+            ctypes.c_int(start_iteration),
             ctypes.c_int(num_iteration),
             c_str(self.pred_parameter),
             ctypes.byref(out_num_preds),
@@ -2806,7 +2813,7 @@ def dump_model(self, num_iteration=None, start_iteration=0, importance_type='spl
                                            default=json_default_with_numpy))
         return ret

-    def predict(self, data, num_iteration=None,
+    def predict(self, data, start_iteration=None, num_iteration=None,
                 raw_score=False, pred_leaf=False, pred_contrib=False,
                 data_has_header=False, is_reshape=True, **kwargs):
         """Make a prediction.

@@ -2816,10 +2823,14 @@ def predict(self, data, num_iteration=None,
         data : string, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
             Data source for prediction.
             If string, it represents the path to txt file.
+        start_iteration : int or None, optional (default=None)
+            Start index of the iteration to predict.
+            If None or <= 0, starts from the first iteration.
         num_iteration : int or None, optional (default=None)
             Limit number of iterations in the prediction.
-            If None, if the best iteration exists, it is used; otherwise, all iterations are used.
-            If <= 0, all iterations are used (no limits).
+            If None, if the best iteration exists and start_iteration is None or <= 0, the best iteration is used;
+            otherwise, all iterations from start_iteration are used.
+            If <= 0, all iterations from start_iteration are used (no limits).
         raw_score : bool, optional (default=False)
             Whether to predict raw scores.
         pred_leaf : bool, optional (default=False)
@@ -2850,9 +2861,14 @@ def predict(self, data, num_iteration=None,
             Can be sparse or a list of sparse objects (each element represents predictions for one class) for feature contributions (when ``pred_contrib=True``).
         """
         predictor = self._to_predictor(copy.deepcopy(kwargs))
+        if start_iteration is None or start_iteration < 0:
+            start_iteration = 0
         if num_iteration is None:
-            num_iteration = self.best_iteration
-        return predictor.predict(data, num_iteration,
+            if start_iteration == 0:
+                num_iteration = self.best_iteration
+            else:
+                num_iteration = -1
+        return predictor.predict(data, start_iteration, num_iteration,
                                  raw_score, pred_leaf, pred_contrib,
                                  data_has_header, is_reshape)
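A short sketch of how the resolution logic above behaves from the caller's side (toy data, illustrative only; assumes a build with this change):

```python
import numpy as np
import lightgbm as lgb

X = np.random.rand(200, 5)
y = (X[:, 0] > 0.5).astype(int)
bst = lgb.train({"objective": "binary", "verbose": -1},
                lgb.Dataset(X, label=y), num_boost_round=30)

bst.predict(X)                                       # best_iteration if set, else all 30 iterations
bst.predict(X, start_iteration=10)                   # iterations 10..29; best_iteration is ignored
bst.predict(X, start_iteration=10, num_iteration=5)  # iterations 10..14
```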
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index d7250c83eba4..1dd1c0b7c566 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -612,7 +612,7 @@ def _get_meta_data(collection, name, i):
         del train_set, valid_sets
         return self

-    def predict(self, X, raw_score=False, num_iteration=None,
+    def predict(self, X, raw_score=False, start_iteration=None, num_iteration=None,
                 pred_leaf=False, pred_contrib=False, **kwargs):
         """Return the predicted value for each sample.

@@ -622,6 +622,9 @@ def predict(self, X, raw_score=False, num_iteration=None,
             Input features matrix.
         raw_score : bool, optional (default=False)
             Whether to predict raw scores.
+        start_iteration : int or None, optional (default=None)
+            Start index of the iteration to predict.
+            If None or <= 0, starts from the first iteration.
         num_iteration : int or None, optional (default=None)
             Limit number of iterations in the prediction.
             If None, if the best iteration exists, it is used; otherwise, all trees are used.
@@ -661,7 +664,7 @@ def predict(self, X, raw_score=False, num_iteration=None,
                              "match the input. Model n_features_ is %s and "
                              "input n_features is %s " % (self._n_features, n_features))
-        return self._Booster.predict(X, raw_score=raw_score, num_iteration=num_iteration,
+        return self._Booster.predict(X, raw_score=raw_score, start_iteration=start_iteration, num_iteration=num_iteration,
                                      pred_leaf=pred_leaf, pred_contrib=pred_contrib, **kwargs)

     @property
@@ -832,10 +835,10 @@ def fit(self, X, y,

     fit.__doc__ = LGBMModel.fit.__doc__

-    def predict(self, X, raw_score=False, num_iteration=None,
+    def predict(self, X, raw_score=False, start_iteration=None, num_iteration=None,
                 pred_leaf=False, pred_contrib=False, **kwargs):
         """Docstring is inherited from the LGBMModel."""
-        result = self.predict_proba(X, raw_score, num_iteration,
+        result = self.predict_proba(X, raw_score, start_iteration, num_iteration,
                                     pred_leaf, pred_contrib, **kwargs)
         if callable(self._objective) or raw_score or pred_leaf or pred_contrib:
             return result
@@ -845,7 +848,7 @@ def predict(self, X, raw_score=False, num_iteration=None,

     predict.__doc__ = LGBMModel.predict.__doc__

-    def predict_proba(self, X, raw_score=False, num_iteration=None,
+    def predict_proba(self, X, raw_score=False, start_iteration=None, num_iteration=None,
                       pred_leaf=False, pred_contrib=False, **kwargs):
         """Return the predicted probability for each class for each sample.

@@ -855,6 +858,9 @@ def predict_proba(self, X, raw_score=False, num_iteration=None,
             Input features matrix.
         raw_score : bool, optional (default=False)
             Whether to predict raw scores.
+        start_iteration : int or None, optional (default=None)
+            Start index of the iteration to predict.
+            If None or <= 0, starts from the first iteration.
         num_iteration : int or None, optional (default=None)
             Limit number of iterations in the prediction.
             If None, if the best iteration exists, it is used; otherwise, all trees are used.
@@ -884,7 +890,7 @@ def predict_proba(self, X, raw_score=False, num_iteration=None,
         X_SHAP_values : array-like of shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects
             If ``pred_contrib=True``, the feature contributions for each sample.
""" - result = super(LGBMClassifier, self).predict(X, raw_score, num_iteration, + result = super(LGBMClassifier, self).predict(X, raw_score, start_iteration, num_iteration, pred_leaf, pred_contrib, **kwargs) if callable(self._objective) and not (raw_score or pred_leaf or pred_contrib): warnings.warn("Cannot compute class probabilities or labels " diff --git a/src/application/application.cpp b/src/application/application.cpp index eca5c5e97ae9..21163a5a30ea 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -88,7 +88,7 @@ void Application::LoadData() { PredictFunction predict_fun = nullptr; // need to continue training if (boosting_->NumberOfTotalModel() > 0 && config_.task != TaskType::KRefitTree) { - predictor.reset(new Predictor(boosting_.get(), -1, true, false, false, false, -1, -1)); + predictor.reset(new Predictor(boosting_.get(), 0, -1, true, false, false, false, -1, -1)); predict_fun = predictor->GetPredictFunction(); } @@ -213,7 +213,7 @@ void Application::Train() { void Application::Predict() { if (config_.task == TaskType::KRefitTree) { // create predictor - Predictor predictor(boosting_.get(), -1, false, true, false, false, 1, 1); + Predictor predictor(boosting_.get(), 0, -1, false, true, false, false, 1, 1); predictor.Predict(config_.data.c_str(), config_.output_result.c_str(), config_.header, config_.predict_disable_shape_check); TextReader result_reader(config_.output_result.c_str(), false); result_reader.ReadAllLines(); @@ -239,7 +239,7 @@ void Application::Predict() { Log::Info("Finished RefitTree"); } else { // create predictor - Predictor predictor(boosting_.get(), config_.num_iteration_predict, config_.predict_raw_score, + Predictor predictor(boosting_.get(), config_.start_iteration_predict, config_.num_iteration_predict, config_.predict_raw_score, config_.predict_leaf_index, config_.predict_contrib, config_.pred_early_stop, config_.pred_early_stop_freq, config_.pred_early_stop_margin); diff --git a/src/application/predictor.hpp b/src/application/predictor.hpp index 48ef227de2c6..a8454490fcf1 100644 --- a/src/application/predictor.hpp +++ b/src/application/predictor.hpp @@ -31,12 +31,13 @@ class Predictor { /*! 
diff --git a/src/application/application.cpp b/src/application/application.cpp
index eca5c5e97ae9..21163a5a30ea 100644
--- a/src/application/application.cpp
+++ b/src/application/application.cpp
@@ -88,7 +88,7 @@ void Application::LoadData() {
   PredictFunction predict_fun = nullptr;
   // need to continue training
   if (boosting_->NumberOfTotalModel() > 0 && config_.task != TaskType::KRefitTree) {
-    predictor.reset(new Predictor(boosting_.get(), -1, true, false, false, false, -1, -1));
+    predictor.reset(new Predictor(boosting_.get(), 0, -1, true, false, false, false, -1, -1));
     predict_fun = predictor->GetPredictFunction();
   }
@@ -213,7 +213,7 @@ void Application::Train() {

 void Application::Predict() {
   if (config_.task == TaskType::KRefitTree) {
     // create predictor
-    Predictor predictor(boosting_.get(), -1, false, true, false, false, 1, 1);
+    Predictor predictor(boosting_.get(), 0, -1, false, true, false, false, 1, 1);
     predictor.Predict(config_.data.c_str(), config_.output_result.c_str(), config_.header, config_.predict_disable_shape_check);
     TextReader<int> result_reader(config_.output_result.c_str(), false);
     result_reader.ReadAllLines();
@@ -239,7 +239,7 @@ void Application::Predict() {
     Log::Info("Finished RefitTree");
   } else {
     // create predictor
-    Predictor predictor(boosting_.get(), config_.num_iteration_predict, config_.predict_raw_score,
+    Predictor predictor(boosting_.get(), config_.start_iteration_predict, config_.num_iteration_predict, config_.predict_raw_score,
                         config_.predict_leaf_index, config_.predict_contrib,
                         config_.pred_early_stop, config_.pred_early_stop_freq,
                         config_.pred_early_stop_margin);
diff --git a/src/application/predictor.hpp b/src/application/predictor.hpp
index 48ef227de2c6..a8454490fcf1 100644
--- a/src/application/predictor.hpp
+++ b/src/application/predictor.hpp
@@ -31,12 +31,13 @@ class Predictor {
   /*!
   * \brief Constructor
   * \param boosting Input boosting model
+  * \param start_iteration Start index of the iteration to predict
   * \param num_iteration Number of boosting round
   * \param is_raw_score True if need to predict result with raw score
   * \param predict_leaf_index True to output leaf index instead of prediction score
   * \param predict_contrib True to output feature contributions instead of prediction score
   */
-  Predictor(Boosting* boosting, int num_iteration, bool is_raw_score,
+  Predictor(Boosting* boosting, int start_iteration, int num_iteration, bool is_raw_score,
             bool predict_leaf_index, bool predict_contrib,
             bool early_stop, int early_stop_freq, double early_stop_margin) {
     early_stop_ = CreatePredictionEarlyStopInstance(
@@ -56,9 +57,9 @@ class Predictor {
       }
     }

-    boosting->InitPredict(num_iteration, predict_contrib);
+    boosting->InitPredict(start_iteration, num_iteration, predict_contrib);
     boosting_ = boosting;
-    num_pred_one_row_ = boosting_->NumPredictOneRow(
+    num_pred_one_row_ = boosting_->NumPredictOneRow(start_iteration,
         num_iteration, predict_leaf_index, predict_contrib);
     num_feature_ = boosting_->MaxFeatureIdx() + 1;
     predict_buf_.resize(
diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp
index 064bebfec2f5..03f5fe25d554 100644
--- a/src/boosting/gbdt.cpp
+++ b/src/boosting/gbdt.cpp
@@ -574,7 +574,8 @@ void GBDT::PredictContrib(const double* features, double* output) const {
   // set zero
   const int num_features = max_feature_idx_ + 1;
   std::memset(output, 0, sizeof(double) * num_tree_per_iteration_ * (num_features + 1));
-  for (int i = 0; i < num_iteration_for_pred_; ++i) {
+  const int end_iteration_for_pred = start_iteration_for_pred_ + num_iteration_for_pred_;
+  for (int i = start_iteration_for_pred_; i < end_iteration_for_pred; ++i) {
     // predict all the trees for one iteration
     for (int k = 0; k < num_tree_per_iteration_; ++k) {
       models_[i * num_tree_per_iteration_ + k]->PredictContrib(features, num_features, output + k * (num_features + 1));
@@ -585,7 +586,8 @@ void GBDT::PredictContribByMap(const std::unordered_map<int, double>& features,
                                std::vector<std::unordered_map<int, double>>* output) const {
   const int num_features = max_feature_idx_ + 1;
-  for (int i = 0; i < num_iteration_for_pred_; ++i) {
+  const int end_iteration_for_pred = start_iteration_for_pred_ + num_iteration_for_pred_;
+  for (int i = start_iteration_for_pred_; i < end_iteration_for_pred; ++i) {
     // predict all the trees for one iteration
     for (int k = 0; k < num_tree_per_iteration_; ++k) {
       models_[i * num_tree_per_iteration_ + k]->PredictContribByMap(features, num_features, &((*output)[k]));
diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h
index dfd9a15d08e8..a84b321531f1 100644
--- a/src/boosting/gbdt.h
+++ b/src/boosting/gbdt.h
@@ -204,19 +204,22 @@ class GBDT : public GBDTBase {
   /*!
   * \brief Get number of prediction for one data
+  * \param start_iteration Start index of the iteration to predict
   * \param num_iteration number of used iterations
   * \param is_pred_leaf True if predicting leaf index
   * \param is_pred_contrib True if predicting feature contribution
   * \return number of prediction
   */
-  inline int NumPredictOneRow(int num_iteration, bool is_pred_leaf, bool is_pred_contrib) const override {
+  inline int NumPredictOneRow(int start_iteration, int num_iteration, bool is_pred_leaf, bool is_pred_contrib) const override {
     int num_pred_in_one_row = num_class_;
     if (is_pred_leaf) {
       int max_iteration = GetCurrentIteration();
+      start_iteration = std::max(start_iteration, 0);
+      start_iteration = std::min(start_iteration, max_iteration);
       if (num_iteration > 0) {
-        num_pred_in_one_row *= static_cast<int>(std::min(max_iteration, num_iteration));
+        num_pred_in_one_row *= static_cast<int>(std::min(max_iteration - start_iteration, num_iteration));
       } else {
-        num_pred_in_one_row *= max_iteration;
+        num_pred_in_one_row *= (max_iteration - start_iteration);
       }
     } else if (is_pred_contrib) {
       num_pred_in_one_row = num_tree_per_iteration_ * (max_feature_idx_ + 2);  // +1 for 0-based indexing, +1 for baseline
@@ -352,11 +355,16 @@ class GBDT : public GBDTBase {
   */
   inline int NumberOfClasses() const override { return num_class_; }

-  inline void InitPredict(int num_iteration, bool is_pred_contrib) override {
+  inline void InitPredict(int start_iteration, int num_iteration, bool is_pred_contrib) override {
     num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_tree_per_iteration_;
+    start_iteration = std::max(start_iteration, 0);
+    start_iteration = std::min(start_iteration, num_iteration_for_pred_);
     if (num_iteration > 0) {
-      num_iteration_for_pred_ = std::min(num_iteration, num_iteration_for_pred_);
+      num_iteration_for_pred_ = std::min(num_iteration, num_iteration_for_pred_ - start_iteration);
+    } else {
+      num_iteration_for_pred_ = num_iteration_for_pred_ - start_iteration;
     }
+    start_iteration_for_pred_ = start_iteration;
     if (is_pred_contrib) {
       #pragma omp parallel for schedule(static)
       for (int i = 0; i < static_cast<int>(models_.size()); ++i) {
@@ -489,6 +497,8 @@ class GBDT : public GBDTBase {
   data_size_t label_idx_;
   /*! \brief number of used model */
   int num_iteration_for_pred_;
+  /*! \brief Start iteration of used model */
+  int start_iteration_for_pred_;
   /*! \brief Shrinkage rate for one iteration */
   double shrinkage_rate_;
   /*! \brief Number of loaded initial models */
diff --git a/src/boosting/gbdt_prediction.cpp b/src/boosting/gbdt_prediction.cpp
index b4711f7c01a6..97db70de4c32 100644
--- a/src/boosting/gbdt_prediction.cpp
+++ b/src/boosting/gbdt_prediction.cpp
@@ -14,7 +14,8 @@ void GBDT::PredictRaw(const double* features, double* output, const PredictionEa
   int early_stop_round_counter = 0;
   // set zero
   std::memset(output, 0, sizeof(double) * num_tree_per_iteration_);
-  for (int i = 0; i < num_iteration_for_pred_; ++i) {
+  const int end_iteration_for_pred = start_iteration_for_pred_ + num_iteration_for_pred_;
+  for (int i = start_iteration_for_pred_; i < end_iteration_for_pred; ++i) {
     // predict all the trees for one iteration
     for (int k = 0; k < num_tree_per_iteration_; ++k) {
       output[k] += models_[i * num_tree_per_iteration_ + k]->Predict(features);
@@ -34,7 +35,8 @@ void GBDT::PredictRawByMap(const std::unordered_map<int, double>& features, doub
   int early_stop_round_counter = 0;
   // set zero
   std::memset(output, 0, sizeof(double) * num_tree_per_iteration_);
-  for (int i = 0; i < num_iteration_for_pred_; ++i) {
+  const int end_iteration_for_pred = start_iteration_for_pred_ + num_iteration_for_pred_;
+  for (int i = start_iteration_for_pred_; i < end_iteration_for_pred; ++i) {
     // predict all the trees for one iteration
     for (int k = 0; k < num_tree_per_iteration_; ++k) {
       output[k] += models_[i * num_tree_per_iteration_ + k]->PredictByMap(features);
@@ -75,16 +77,20 @@ void GBDT::PredictByMap(const std::unordered_map<int, double>& features, double*
 }

 void GBDT::PredictLeafIndex(const double* features, double* output) const {
-  int total_tree = num_iteration_for_pred_ * num_tree_per_iteration_;
-  for (int i = 0; i < total_tree; ++i) {
-    output[i] = models_[i]->PredictLeafIndex(features);
+  int start_tree = start_iteration_for_pred_ * num_tree_per_iteration_;
+  int num_trees = num_iteration_for_pred_ * num_tree_per_iteration_;
+  const auto* models_ptr = models_.data() + start_tree;
+  for (int i = 0; i < num_trees; ++i) {
+    output[i] = models_ptr[i]->PredictLeafIndex(features);
   }
 }

 void GBDT::PredictLeafIndexByMap(const std::unordered_map<int, double>& features, double* output) const {
-  int total_tree = num_iteration_for_pred_ * num_tree_per_iteration_;
-  for (int i = 0; i < total_tree; ++i) {
-    output[i] = models_[i]->PredictLeafIndexByMap(features);
+  int start_tree = start_iteration_for_pred_ * num_tree_per_iteration_;
+  int num_trees = num_iteration_for_pred_ * num_tree_per_iteration_;
+  const auto* models_ptr = models_.data() + start_tree;
+  for (int i = 0; i < num_trees; ++i) {
+    output[i] = models_ptr[i]->PredictLeafIndexByMap(features);
   }
 }
diff --git a/src/c_api.cpp b/src/c_api.cpp
index cc82697ac35b..61b3038e660b 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -62,7 +62,7 @@ class SingleRowPredictor {
   PredictFunction predict_function;
   int64_t num_pred_in_one_row;

-  SingleRowPredictor(int predict_type, Boosting* boosting, const Config& config, int iter) {
+  SingleRowPredictor(int predict_type, Boosting* boosting, const Config& config, int start_iter, int num_iter) {
     bool is_predict_leaf = false;
     bool is_raw_score = false;
     bool predict_contrib = false;
@@ -78,10 +78,10 @@ class SingleRowPredictor {
     early_stop_ = config.pred_early_stop;
     early_stop_freq_ = config.pred_early_stop_freq;
     early_stop_margin_ = config.pred_early_stop_margin;
-    iter_ = iter;
-    predictor_.reset(new Predictor(boosting, iter_, is_raw_score, is_predict_leaf, predict_contrib,
+    iter_ = num_iter;
+    predictor_.reset(new Predictor(boosting, start_iter, iter_, is_raw_score, is_predict_leaf, predict_contrib,
                                    early_stop_, early_stop_freq_, early_stop_margin_));
-    num_pred_in_one_row = boosting->NumPredictOneRow(iter_, is_predict_leaf, predict_contrib);
+    num_pred_in_one_row = boosting->NumPredictOneRow(start_iter, iter_, is_predict_leaf, predict_contrib);
     predict_function = predictor_->GetPredictFunction();
     num_total_model_ = boosting->NumberOfTotalModel();
   }
@@ -369,12 +369,12 @@ class Booster {
     boosting_->RollbackOneIter();
   }

-  void SetSingleRowPredictor(int num_iteration, int predict_type, const Config& config) {
+  void SetSingleRowPredictor(int start_iteration, int num_iteration, int predict_type, const Config& config) {
     UNIQUE_LOCK(mutex_)
     if (single_row_predictor_[predict_type].get() == nullptr ||
         !single_row_predictor_[predict_type]->IsPredictorEqual(config, num_iteration, boosting_.get())) {
       single_row_predictor_[predict_type].reset(new SingleRowPredictor(predict_type, boosting_.get(),
-                                                                       config, num_iteration));
+                                                                       config, start_iteration, num_iteration));
     }
   }
@@ -395,7 +395,7 @@ class Booster {
     *out_len = single_row_predictor->num_pred_in_one_row;
   }

-  Predictor CreatePredictor(int num_iteration, int predict_type, int ncol, const Config& config) const {
+  Predictor CreatePredictor(int start_iteration, int num_iteration, int predict_type, int ncol, const Config& config) const {
     if (!config.predict_disable_shape_check && ncol != boosting_->MaxFeatureIdx() + 1) {
       Log::Fatal("The number of features in data (%d) is not the same as it was in training data (%d).\n" \
                  "You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.", ncol, boosting_->MaxFeatureIdx() + 1);
@@ -413,17 +413,17 @@ class Booster {
       is_raw_score = false;
     }

-    Predictor predictor(boosting_.get(), num_iteration, is_raw_score, is_predict_leaf, predict_contrib,
+    Predictor predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib,
                         config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin);
     return predictor;
   }

-  void Predict(int num_iteration, int predict_type, int nrow, int ncol,
+  void Predict(int start_iteration, int num_iteration, int predict_type, int nrow, int ncol,
                std::function<std::vector<std::pair<int, double>>(int row_idx)> get_row_fun,
                const Config& config,
                double* out_result, int64_t* out_len) const {
     SHARED_LOCK(mutex_);
-    auto predictor = CreatePredictor(num_iteration, predict_type, ncol, config);
+    auto predictor = CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config);
     bool is_predict_leaf = false;
     bool predict_contrib = false;
     if (predict_type == C_API_PREDICT_LEAF_INDEX) {
@@ -431,7 +431,7 @@ class Booster {
     } else if (predict_type == C_API_PREDICT_CONTRIB) {
       predict_contrib = true;
     }
-    int64_t num_pred_in_one_row = boosting_->NumPredictOneRow(num_iteration, is_predict_leaf, predict_contrib);
+    int64_t num_pred_in_one_row = boosting_->NumPredictOneRow(start_iteration, num_iteration, is_predict_leaf, predict_contrib);
     auto pred_fun = predictor.GetPredictFunction();
     OMP_INIT_EX();
     #pragma omp parallel for schedule(static)
@@ -446,13 +446,13 @@ class Booster {
     *out_len = num_pred_in_one_row * nrow;
   }

-  void PredictSparse(int num_iteration, int predict_type, int64_t nrow, int ncol,
+  void PredictSparse(int start_iteration, int num_iteration, int predict_type, int64_t nrow, int ncol,
                      std::function<std::vector<std::pair<int, double>>(int64_t row_idx)> get_row_fun,
                      const Config& config, int64_t* out_elements_size,
                      std::vector<std::vector<std::unordered_map<int, double>>>* agg_ptr,
                      int32_t** out_indices, void** out_data, int data_type,
                      bool* is_data_float32_ptr, int num_matrices) const {
-    auto predictor = CreatePredictor(num_iteration, predict_type, ncol, config);
+    auto predictor = CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config);
     auto pred_sparse_fun = predictor.GetPredictSparseFunction();
     std::vector<std::vector<std::unordered_map<int, double>>>& agg = *agg_ptr;
     OMP_INIT_EX();
@@ -488,7 +488,7 @@ class Booster {
     *out_indices = new int32_t[elements_size];
   }

-  void PredictSparseCSR(int num_iteration, int predict_type, int64_t nrow, int ncol,
+  void PredictSparseCSR(int start_iteration, int num_iteration, int predict_type, int64_t nrow, int ncol,
                         std::function<std::vector<std::pair<int, double>>(int64_t row_idx)> get_row_fun,
                         const Config& config,
                         int64_t* out_len, void** out_indptr, int indptr_type,
@@ -511,7 +511,7 @@ class Booster {
     // aggregated per row feature contribution results
     std::vector<std::vector<std::unordered_map<int, double>>> agg(nrow);
     int64_t elements_size = 0;
-    PredictSparse(num_iteration, predict_type, nrow, ncol, get_row_fun, config, &elements_size, &agg,
+    PredictSparse(start_iteration, num_iteration, predict_type, nrow, ncol, get_row_fun, config, &elements_size, &agg,
                   out_indices, out_data, data_type, &is_data_float32, num_matrices);
     std::vector<int> row_sizes(num_matrices * nrow);
    std::vector<int64_t> row_matrix_offsets(num_matrices * nrow);
@@ -572,7 +572,7 @@ class Booster {
     out_len[1] = indptr_size;
   }

-  void PredictSparseCSC(int num_iteration, int predict_type, int64_t nrow, int ncol,
+  void PredictSparseCSC(int start_iteration, int num_iteration, int predict_type, int64_t nrow, int ncol,
                         std::function<std::vector<std::pair<int, double>>(int64_t row_idx)> get_row_fun,
                         const Config& config,
                         int64_t* out_len, void** out_col_ptr, int col_ptr_type,
@@ -580,7 +580,7 @@ class Booster {
     SHARED_LOCK(mutex_);
     // Get the number of trees per iteration (for multiclass scenario we output multiple sparse matrices)
     int num_matrices = boosting_->NumModelPerIteration();
-    auto predictor = CreatePredictor(num_iteration, predict_type, ncol, config);
+    auto predictor = CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config);
     auto pred_sparse_fun = predictor.GetPredictSparseFunction();
     bool is_col_ptr_int32 = false;
     bool is_data_float32 = false;
@@ -598,7 +598,7 @@ class Booster {
     // aggregated per row feature contribution results
     std::vector<std::vector<std::unordered_map<int, double>>> agg(nrow);
     int64_t elements_size = 0;
-    PredictSparse(num_iteration, predict_type, nrow, ncol, get_row_fun, config, &elements_size, &agg,
+    PredictSparse(start_iteration, num_iteration, predict_type, nrow, ncol, get_row_fun, config, &elements_size, &agg,
                   out_indices, out_data, data_type, &is_data_float32, num_matrices);
     // calculate number of elements per column to construct
     // the CSC matrix with random access
@@ -676,7 +676,7 @@ class Booster {
     out_len[1] = col_ptr_size;
   }

-  void Predict(int num_iteration, int predict_type, const char* data_filename,
+  void Predict(int start_iteration, int num_iteration, int predict_type, const char* data_filename,
                int data_has_header, const Config& config,
                const char* result_filename) const {
     SHARED_LOCK(mutex_)
@@ -692,7 +692,7 @@ class Booster {
     } else {
       is_raw_score = false;
     }
-    Predictor predictor(boosting_.get(), num_iteration, is_raw_score, is_predict_leaf, predict_contrib,
+    Predictor predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib,
                         config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin);
     bool bool_data_has_header = data_has_header > 0 ? true : false;
     predictor.Predict(data_filename, result_filename, bool_data_has_header, config.predict_disable_shape_check);
@@ -1728,6 +1728,7 @@ int LGBM_BoosterPredictForFile(BoosterHandle handle,
                                const char* data_filename,
                                int data_has_header,
                                int predict_type,
+                               int start_iteration,
                                int num_iteration,
                                const char* parameter,
                                const char* result_filename) {
@@ -1739,7 +1740,7 @@ int LGBM_BoosterPredictForFile(BoosterHandle handle,
     omp_set_num_threads(config.num_threads);
   }
   Booster* ref_booster = reinterpret_cast<Booster*>(handle);
-  ref_booster->Predict(num_iteration, predict_type, data_filename, data_has_header,
+  ref_booster->Predict(start_iteration, num_iteration, predict_type, data_filename, data_has_header,
                        config, result_filename);
   API_END();
 }
 
@@ -1747,11 +1748,12 @@ int LGBM_BoosterCalcNumPredict(BoosterHandle handle,
                                int num_row,
                                int predict_type,
+                               int start_iteration,
                                int num_iteration,
                                int64_t* out_len) {
   API_BEGIN();
   Booster* ref_booster = reinterpret_cast<Booster*>(handle);
-  *out_len = static_cast<int64_t>(num_row) * ref_booster->GetBoosting()->NumPredictOneRow(
+  *out_len = static_cast<int64_t>(num_row) * ref_booster->GetBoosting()->NumPredictOneRow(start_iteration,
       num_iteration, predict_type == C_API_PREDICT_LEAF_INDEX, predict_type == C_API_PREDICT_CONTRIB);
   API_END();
 }
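For callers going through the raw C API, the pattern is uniform: the new `start_iteration` argument is inserted immediately before `num_iteration` in every prediction entry point. Below is a minimal ctypes sketch against the updated `LGBM_BoosterCalcNumPredict` signature shown above; the library path and the `handle` value are placeholders, not part of this patch:

```python
import ctypes

LIB = ctypes.cdll.LoadLibrary("./lib_lightgbm.so")  # placeholder path

C_API_PREDICT_NORMAL = 0  # prediction-type constant from include/LightGBM/c_api.h

def calc_num_predict(handle, num_row, start_iteration, num_iteration):
    """Size the output buffer for a prediction over the given iteration window."""
    out_len = ctypes.c_int64(0)
    ret = LIB.LGBM_BoosterCalcNumPredict(
        handle,
        ctypes.c_int(num_row),
        ctypes.c_int(C_API_PREDICT_NORMAL),
        ctypes.c_int(start_iteration),  # new argument added by this patch
        ctypes.c_int(num_iteration),
        ctypes.byref(out_len))
    assert ret == 0  # a non-zero return code means the call failed
    return out_len.value
```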
@@ -1798,6 +1800,7 @@ int LGBM_BoosterPredictForCSR(BoosterHandle handle,
                               int64_t nelem,
                               int64_t num_col,
                               int predict_type,
+                              int start_iteration,
                               int num_iteration,
                               const char* parameter,
                               int64_t* out_len,
@@ -1817,7 +1820,7 @@ int LGBM_BoosterPredictForCSR(BoosterHandle handle,
   Booster* ref_booster = reinterpret_cast<Booster*>(handle);
   auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem);
   int nrow = static_cast<int>(nindptr - 1);
-  ref_booster->Predict(num_iteration, predict_type, nrow, static_cast<int>(num_col), get_row_fun,
+  ref_booster->Predict(start_iteration, num_iteration, predict_type, nrow, static_cast<int>(num_col), get_row_fun,
                        config, out_result, out_len);
   API_END();
 }
@@ -1832,6 +1835,7 @@ int LGBM_BoosterPredictSparseOutput(BoosterHandle handle,
                                     int64_t nelem,
                                     int64_t num_col_or_row,
                                     int predict_type,
+                                    int start_iteration,
                                     int num_iteration,
                                     const char* parameter,
                                     int matrix_type,
@@ -1855,7 +1859,7 @@ int LGBM_BoosterPredictSparseOutput(BoosterHandle handle,
     }
     auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem);
     int64_t nrow = nindptr - 1;
-    ref_booster->PredictSparseCSR(num_iteration, predict_type, nrow, static_cast<int>(num_col_or_row), get_row_fun,
+    ref_booster->PredictSparseCSR(start_iteration, num_iteration, predict_type, nrow, static_cast<int>(num_col_or_row), get_row_fun,
                                   config, out_len, out_indptr, indptr_type, out_indices, out_data, data_type);
   } else if (matrix_type == C_API_MATRIX_TYPE_CSC) {
     int num_threads = OMP_NUM_THREADS();
@@ -1879,7 +1883,7 @@ int LGBM_BoosterPredictSparseOutput(BoosterHandle handle,
       }
       return one_row;
     };
-    ref_booster->PredictSparseCSC(num_iteration, predict_type, num_col_or_row, ncol, get_row_fun, config,
+    ref_booster->PredictSparseCSC(start_iteration, num_iteration, predict_type, num_col_or_row, ncol, get_row_fun, config,
                                   out_len, out_indptr, indptr_type, out_indices, out_data, data_type);
   } else {
     Log::Fatal("Unknown matrix type in LGBM_BoosterPredictSparseOutput");
@@ -1917,6 +1921,7 @@ int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle,
                                        int64_t nelem,
                                        int64_t num_col,
                                        int predict_type,
+                                       int start_iteration,
                                        int num_iteration,
                                        const char* parameter,
                                        int64_t* out_len,
@@ -1935,13 +1940,14 @@ int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle,
   }
   Booster* ref_booster = reinterpret_cast<Booster*>(handle);
   auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem);
-  ref_booster->SetSingleRowPredictor(num_iteration, predict_type, config);
+  ref_booster->SetSingleRowPredictor(start_iteration, num_iteration, predict_type, config);
   ref_booster->PredictSingleRow(predict_type, static_cast<int32_t>(num_col), get_row_fun, config, out_result, out_len);
   API_END();
 }
 
 int LGBM_BoosterPredictForCSRSingleRowFastInit(BoosterHandle handle,
                                                const int predict_type,
+                                               const int start_iteration,
                                                const int num_iteration,
                                                const int data_type,
                                                const int64_t num_col,
@@ -1965,7 +1971,7 @@ int LGBM_BoosterPredictForCSRSingleRowFastInit(BoosterHandle handle,
     omp_set_num_threads(fastConfig_ptr->config.num_threads);
   }
 
-  fastConfig_ptr->booster->SetSingleRowPredictor(num_iteration, predict_type, fastConfig_ptr->config);
+  fastConfig_ptr->booster->SetSingleRowPredictor(start_iteration, num_iteration, predict_type, fastConfig_ptr->config);
 
   *out_fastConfig = fastConfig_ptr.release();
   API_END();
@@ -1999,6 +2005,7 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle,
                               int64_t nelem,
                               int64_t num_row,
                               int predict_type,
+                              int start_iteration,
                               int num_iteration,
                               const char* parameter,
                               int64_t* out_len,
@@ -2032,7 +2039,7 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle,
     }
     return one_row;
   };
-  ref_booster->Predict(num_iteration, predict_type, static_cast<int32_t>(num_row), ncol, get_row_fun, config,
+  ref_booster->Predict(start_iteration, num_iteration, predict_type, static_cast<int32_t>(num_row), ncol, get_row_fun, config,
                        out_result, out_len);
   API_END();
 }
@@ -2044,6 +2051,7 @@ int LGBM_BoosterPredictForMat(BoosterHandle handle,
                               int32_t ncol,
                               int is_row_major,
                               int predict_type,
+                              int start_iteration,
                               int num_iteration,
                               const char* parameter,
                               int64_t* out_len,
@@ -2057,7 +2065,7 @@ int LGBM_BoosterPredictForMat(BoosterHandle handle,
   }
   Booster* ref_booster = reinterpret_cast<Booster*>(handle);
   auto get_row_fun = RowPairFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major);
-  ref_booster->Predict(num_iteration, predict_type, nrow, ncol, get_row_fun,
+  ref_booster->Predict(start_iteration, num_iteration, predict_type, nrow, ncol, get_row_fun,
                        config, out_result, out_len);
   API_END();
 }
@@ -2068,6 +2076,7 @@ int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle,
                                        int32_t ncol,
                                        int is_row_major,
                                        int predict_type,
+                                       int start_iteration,
                                        int num_iteration,
                                        const char* parameter,
                                        int64_t* out_len,
@@ -2081,13 +2090,14 @@ int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle,
   }
   Booster* ref_booster = reinterpret_cast<Booster*>(handle);
   auto get_row_fun = RowPairFunctionFromDenseMatric(data, 1, ncol, data_type, is_row_major);
-  ref_booster->SetSingleRowPredictor(num_iteration, predict_type, config);
+  ref_booster->SetSingleRowPredictor(start_iteration, num_iteration, predict_type, config);
   ref_booster->PredictSingleRow(predict_type, ncol, get_row_fun, config, out_result, out_len);
   API_END();
 }
 
 int LGBM_BoosterPredictForMatSingleRowFastInit(BoosterHandle handle,
                                                const int predict_type,
+                                               const int start_iteration,
                                                const int num_iteration,
                                                const int data_type,
                                                const int32_t ncol,
@@ -2105,7 +2115,7 @@ int LGBM_BoosterPredictForMatSingleRowFastInit(BoosterHandle handle,
     omp_set_num_threads(fastConfig_ptr->config.num_threads);
   }
 
-  fastConfig_ptr->booster->SetSingleRowPredictor(num_iteration, predict_type, fastConfig_ptr->config);
+  fastConfig_ptr->booster->SetSingleRowPredictor(start_iteration, num_iteration, predict_type, fastConfig_ptr->config);
 
   *out_fastConfig = fastConfig_ptr.release();
   API_END();
@@ -2132,6 +2142,7 @@ int LGBM_BoosterPredictForMats(BoosterHandle handle,
                                int32_t nrow,
                                int32_t ncol,
                                int predict_type,
+                               int start_iteration,
                                int num_iteration,
                                const char* parameter,
                                int64_t* out_len,
@@ -2145,7 +2156,7 @@ int LGBM_BoosterPredictForMats(BoosterHandle handle,
   }
   Booster* ref_booster = reinterpret_cast<Booster*>(handle);
   auto get_row_fun = RowPairFunctionFromDenseRows(data, ncol, data_type);
-  ref_booster->Predict(num_iteration, predict_type, nrow, ncol, get_row_fun, config, out_result, out_len);
+  ref_booster->Predict(start_iteration, num_iteration, predict_type, nrow, ncol, get_row_fun, config, out_result, out_len);
   API_END();
 }
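The net effect of threading `start_iteration` through `CreatePredictor` and `SetSingleRowPredictor` is that a prediction call scores only the trees in the window `[start_iteration, start_iteration + num_iteration)`. Raw scores over disjoint windows therefore sum to the full-model score, which is exactly what the new engine test further down asserts. A hedged sketch of that invariant, assuming a `booster` trained without early stopping and a feature matrix `X` (both placeholders):

```python
import numpy as np

# Full-model raw score, using all trained iterations.
full = booster.predict(X, raw_score=True)

# Each window scores trees [s, s + 10); disjoint windows covering all
# iterations must add up to the full raw score.
window_sum = sum(
    booster.predict(X, start_iteration=s, num_iteration=10, raw_score=True)
    for s in range(0, booster.current_iteration(), 10)
)
np.testing.assert_allclose(full, window_sum)
```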
diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp
index ed593a48e4aa..b14af67fd30e 100644
--- a/src/io/config_auto.cpp
+++ b/src/io/config_auto.cpp
@@ -256,6 +256,7 @@ const std::unordered_set<std::string>& Config::parameter_set() {
   "categorical_feature",
   "forcedbins_filename",
   "save_binary",
+  "start_iteration_predict",
   "num_iteration_predict",
   "predict_raw_score",
   "predict_leaf_index",
@@ -513,6 +514,8 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::string>& params) {
 
+  GetInt(params, "start_iteration_predict", &start_iteration_predict);
+
   GetInt(params, "num_iteration_predict", &num_iteration_predict);
diff --git a/swig/lightgbmlib.i b/swig/lightgbmlib.i
--- a/swig/lightgbmlib.i
+++ b/swig/lightgbmlib.i
@@ ... @@ int LGBM_BoosterPredictForMatSingle(JNIEnv *jenv,
                                     int predict_type,
+                                    int start_iteration,
                                     int num_iteration,
                                     const char* parameter,
                                     int64_t* out_len,
                                     double* out_result) {
     double* data0 = (double*)jenv->GetPrimitiveArrayCritical(data, 0);
 
-    int ret = LGBM_BoosterPredictForMatSingleRow(handle, data0, data_type, ncol, is_row_major, predict_type,
+    int ret = LGBM_BoosterPredictForMatSingleRow(handle, data0, data_type, ncol, is_row_major, predict_type, start_iteration,
                                                  num_iteration, parameter, out_len, out_result);
 
     jenv->ReleasePrimitiveArrayCritical(data, data0, JNI_ABORT);
@@ -130,6 +131,7 @@ int LGBM_BoosterPredictForCSRSingle(JNIEnv *jenv,
                                     int64_t nelem,
                                     int64_t num_col,
                                     int predict_type,
+                                    int start_iteration,
                                     int num_iteration,
                                     const char* parameter,
                                     int64_t* out_len,
@@ -147,7 +149,7 @@ int LGBM_BoosterPredictForCSRSingle(JNIEnv *jenv,
     int32_t ind[2] = { 0, numNonZeros };
 
     int ret = LGBM_BoosterPredictForCSRSingleRow(handle, ind, indptr_type, indices0, values0, data_type, 2,
-                                                 nelem, num_col, predict_type, num_iteration, parameter, out_len, out_result);
+                                                 nelem, num_col, predict_type, start_iteration, num_iteration, parameter, out_len, out_result);
 
     jenv->ReleasePrimitiveArrayCritical(values, values0, JNI_ABORT);
     jenv->ReleasePrimitiveArrayCritical(indices, indices0, JNI_ABORT);
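Because `start_iteration_predict` is now a registered config key parsed by `GetMembersFromString()`, the iteration window can also be chosen without touching any language wrapper, for example from a CLI prediction config file. A sketch under the usual CLI conventions; the model and data file names are placeholders:

```
task = predict
data = binary.test
input_model = LightGBM_model.txt
start_iteration_predict = 10
num_iteration_predict = 25
```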
diff --git a/tests/c_api_test/test_.py b/tests/c_api_test/test_.py
index 8efa8d0ba88c..e124b6594193 100644
--- a/tests/c_api_test/test_.py
+++ b/tests/c_api_test/test_.py
@@ -263,6 +263,7 @@ def test_booster():
         mat.shape[1],
         1,
         1,
+        0,
         25,
         c_str(''),
         ctypes.byref(num_preb),
@@ -273,6 +274,17 @@ def test_booster():
                               '../../examples/binary_classification/binary.test')),
         0,
         0,
+        0,
         25,
         c_str(''),
         c_str('preb.txt'))
+    LIB.LGBM_BoosterPredictForFile(
+        booster2,
+        c_str(os.path.join(os.path.dirname(os.path.realpath(__file__)),
+                           '../../examples/binary_classification/binary.test')),
+        0,
+        0,
+        10,
+        25,
+        c_str(''),
+        c_str('preb.txt'))
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 2cfdf67fe94c..3c24f39fbcce 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -2315,3 +2315,90 @@ def test_interaction_constraints(self):
         est = lgb.train(dict(params, interaction_constraints=[[0] + list(range(2, num_features)),
                                                               [1] + list(range(2, num_features))]),
                         train_data, num_boost_round=10)
+
+    def test_predict_with_start_iteration(self):
+        def inner_test(X, y, params, early_stopping_rounds):
+            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
+            train_data = lgb.Dataset(X_train, label=y_train)
+            valid_data = lgb.Dataset(X_test, label=y_test)
+            booster = lgb.train(params, train_data, num_boost_round=100,
+                                early_stopping_rounds=early_stopping_rounds, valid_sets=[valid_data])
+
+            # test that predicting once with all iterations equals the sum of predictions
+            # over consecutive (start_iteration, num_iteration) windows
+            all_pred = booster.predict(X, raw_score=True)
+            all_pred_contrib = booster.predict(X, pred_contrib=True)
+            steps = [10, 12]
+            for step in steps:
+                pred = np.zeros_like(all_pred)
+                pred_contrib = np.zeros_like(all_pred_contrib)
+                for start_iter in range(0, 100, step):
+                    pred += booster.predict(X, num_iteration=step, start_iteration=start_iter, raw_score=True)
+                    pred_contrib += booster.predict(X, num_iteration=step, start_iteration=start_iter, pred_contrib=True)
+                np.testing.assert_allclose(all_pred, pred)
+                np.testing.assert_allclose(all_pred_contrib, pred_contrib)
+
+            # test the case where start_iteration <= 0 and num_iteration is None
+            pred1 = booster.predict(X, start_iteration=-1)
+            pred2 = booster.predict(X, num_iteration=booster.best_iteration)
+            pred3 = booster.predict(X, num_iteration=booster.best_iteration, start_iteration=0)
+            np.testing.assert_allclose(pred1, pred2)
+            np.testing.assert_allclose(pred1, pred3)
+
+            # test the case where start_iteration > 0 and num_iteration <= 0
+            pred4 = booster.predict(X, start_iteration=10, num_iteration=-1)
+            pred5 = booster.predict(X, start_iteration=10, num_iteration=90)
+            pred6 = booster.predict(X, start_iteration=10, num_iteration=0)
+            np.testing.assert_allclose(pred4, pred5)
+            np.testing.assert_allclose(pred4, pred6)
+
+            # the same, with pred_leaf=True
+            pred4 = booster.predict(X, start_iteration=10, num_iteration=-1, pred_leaf=True)
+            pred5 = booster.predict(X, start_iteration=10, num_iteration=90, pred_leaf=True)
+            pred6 = booster.predict(X, start_iteration=10, num_iteration=0, pred_leaf=True)
+            np.testing.assert_allclose(pred4, pred5)
+            np.testing.assert_allclose(pred4, pred6)
+
+            # the same, with pred_contrib=True
+            pred4 = booster.predict(X, start_iteration=10, num_iteration=-1, pred_contrib=True)
+            pred5 = booster.predict(X, start_iteration=10, num_iteration=90, pred_contrib=True)
+            pred6 = booster.predict(X, start_iteration=10, num_iteration=0, pred_contrib=True)
+            np.testing.assert_allclose(pred4, pred5)
+            np.testing.assert_allclose(pred4, pred6)
+
+        # test for regression
+        X, y = load_boston(return_X_y=True)
+        params = {
+            'objective': 'regression',
+            'verbose': -1,
+            'metric': 'l2',
+            'learning_rate': 0.5
+        }
+        # test both with and without early stopping
+        inner_test(X, y, params, early_stopping_rounds=1)
+        inner_test(X, y, params, early_stopping_rounds=10)
+        inner_test(X, y, params, early_stopping_rounds=None)
+
+        # test for multi-class
+        X, y = load_iris(return_X_y=True)
+        params = {
+            'objective': 'multiclass',
+            'num_class': 3,
+            'verbose': -1,
+            'metric': 'multi_error'
+        }
+        # test both with and without early stopping
+        inner_test(X, y, params, early_stopping_rounds=1)
+        inner_test(X, y, params, early_stopping_rounds=10)
+        inner_test(X, y, params, early_stopping_rounds=None)
+
+        # test for binary
+        X, y = load_breast_cancer(return_X_y=True)
+        params = {
+            'objective': 'binary',
+            'verbose': -1,
+            'metric': 'auc'
+        }
+        # test both with and without early stopping
+        inner_test(X, y, params, early_stopping_rounds=1)
+        inner_test(X, y, params, early_stopping_rounds=10)
+        inner_test(X, y, params, early_stopping_rounds=None)
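The boundary cases exercised above follow a small set of resolution rules for the effective window. Restated as standalone Python for clarity, this is an illustration of the tested behavior, not code from this patch:

```python
def resolve_window(start_iteration, num_iteration, best_iteration, total_iteration):
    """Effective (start, count) window implied by the tests above."""
    # None or a non-positive start means: begin at the first iteration.
    start = 0 if start_iteration is None or start_iteration <= 0 else start_iteration
    if num_iteration is None:
        # The best iteration is only honored when predicting from the beginning.
        count = best_iteration if (start == 0 and best_iteration > 0) else total_iteration - start
    elif num_iteration <= 0:
        # No limit: use all iterations from start onward.
        count = total_iteration - start
    else:
        count = min(num_iteration, total_iteration - start)
    return start, count  # trees used: iterations [start, start + count)
```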
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index 51fdeedd09cc..47d0697b2e68 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -607,6 +607,41 @@ def test_predict(self):
                           np.testing.assert_allclose,
                           res_engine,
                           res_sklearn_params)
 
+        # Tests start_iteration
+        # Tests same probabilities, starting from iteration 10
+        res_engine = gbm.predict(X_test, start_iteration=10)
+        res_sklearn = clf.predict_proba(X_test, start_iteration=10)
+        np.testing.assert_allclose(res_engine, res_sklearn)
+
+        # Tests same class predictions, starting from iteration 10
+        res_engine = np.argmax(gbm.predict(X_test, start_iteration=10), axis=1)
+        res_sklearn = clf.predict(X_test, start_iteration=10)
+        np.testing.assert_equal(res_engine, res_sklearn)
+
+        # Tests same raw scores, starting from iteration 10
+        res_engine = gbm.predict(X_test, raw_score=True, start_iteration=10)
+        res_sklearn = clf.predict(X_test, raw_score=True, start_iteration=10)
+        np.testing.assert_allclose(res_engine, res_sklearn)
+
+        # Tests same leaf indices, starting from iteration 10
+        res_engine = gbm.predict(X_test, pred_leaf=True, start_iteration=10)
+        res_sklearn = clf.predict(X_test, pred_leaf=True, start_iteration=10)
+        np.testing.assert_equal(res_engine, res_sklearn)
+
+        # Tests same feature contributions, starting from iteration 10
+        res_engine = gbm.predict(X_test, pred_contrib=True, start_iteration=10)
+        res_sklearn = clf.predict(X_test, pred_contrib=True, start_iteration=10)
+        np.testing.assert_allclose(res_engine, res_sklearn)
+
+        # Tests that other prediction parameters still take effect, starting from iteration 10
+        res_engine = gbm.predict(X_test, start_iteration=10)
+        res_sklearn_params = clf.predict_proba(X_test,
+                                               pred_early_stop=True,
+                                               pred_early_stop_margin=1.0,
+                                               start_iteration=10)
+        self.assertRaises(AssertionError,
+                          np.testing.assert_allclose,
+                          res_engine,
+                          res_sklearn_params)
+
     def test_evaluate_train_set(self):
         X, y = load_boston(return_X_y=True)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
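Taken together, the sklearn-facing behavior this test locks in is simple: the wrapper forwards `start_iteration` unchanged to the underlying booster. A short usage sketch of that guarantee; the dataset and estimator settings here are illustrative placeholders:

```python
import numpy as np
import lightgbm as lgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

clf = lgb.LGBMClassifier(n_estimators=50).fit(X_train, y_train)

# The wrapper forwards start_iteration to Booster.predict, so both
# interfaces score with trees 10..49 here and must agree.
proba_wrapper = clf.predict_proba(X_test, start_iteration=10)
proba_booster = clf.booster_.predict(X_test, start_iteration=10)
np.testing.assert_allclose(proba_wrapper, proba_booster)
```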