diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R
index d64e82fd894e..70dc5c6491f6 100644
--- a/R-package/R/lgb.Booster.R
+++ b/R-package/R/lgb.Booster.R
@@ -483,6 +483,7 @@ Booster <- R6::R6Class(

     # Predict on new data
     predict = function(data,
+                       start_iteration = NULL,
                        num_iteration = NULL,
                        rawscore = FALSE,
                        predleaf = FALSE,
@@ -494,10 +495,14 @@ Booster <- R6::R6Class(
       if (is.null(num_iteration)) {
         num_iteration <- self$best_iter
       }
+      # If start_iteration is missing, predict from the first iteration
+      if (is.null(start_iteration)) {
+        start_iteration <- 0L
+      }

       # Predict on new data
       predictor <- Predictor$new(private$handle, ...)
-      predictor$predict(data, num_iteration, rawscore, predleaf, predcontrib, header, reshape)
+      predictor$predict(data, start_iteration, num_iteration, rawscore, predleaf, predcontrib, header, reshape)

     },
@@ -698,7 +703,14 @@ Booster <- R6::R6Class(
 #' @description Predicted values based on class \code{lgb.Booster}
 #' @param object Object of class \code{lgb.Booster}
 #' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
-#' @param num_iteration number of iteration want to predict with, NULL or <= 0 means use best iteration
+#' @param start_iteration int or None, optional (default=None)
+#'        Start index of the iteration to predict.
+#'        If None or <= 0, starts from the first iteration.
+#' @param num_iteration int or None, optional (default=None)
+#'        Limit number of iterations in the prediction.
+#'        If None, if the best iteration exists and start_iteration is None or <= 0, the
+#'        best iteration is used; otherwise, all iterations from start_iteration are used.
+#'        If <= 0, all iterations from start_iteration are used (no limits).
 #' @param rawscore whether the prediction should be returned in the form of original untransformed
 #'        sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE}
 #'        for logistic regression would result in predictions for log-odds instead of probabilities.
@@ -740,6 +752,7 @@ Booster <- R6::R6Class(
 #' @export
 predict.lgb.Booster <- function(object,
                                 data,
+                                start_iteration = NULL,
                                 num_iteration = NULL,
                                 rawscore = FALSE,
                                 predleaf = FALSE,
@@ -756,6 +769,7 @@ predict.lgb.Booster <- function(object,
   # Return booster predictions
   object$predict(
     data
+    , start_iteration
     , num_iteration
     , rawscore
     , predleaf
diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R
index fa2b14c94614..bdada64f576d 100644
--- a/R-package/R/lgb.Predictor.R
+++ b/R-package/R/lgb.Predictor.R
@@ -76,6 +76,7 @@ Predictor <- R6::R6Class(

     # Predict from data
     predict = function(data,
+                       start_iteration = NULL,
                        num_iteration = NULL,
                        rawscore = FALSE,
                        predleaf = FALSE,
@@ -87,6 +88,10 @@ Predictor <- R6::R6Class(
       if (is.null(num_iteration)) {
         num_iteration <- -1L
       }
+      # If start_iteration is missing, set it to 0 (start from the first iteration)
+      if (is.null(start_iteration)) {
+        start_iteration <- 0L
+      }

       # Set temporary variable
       num_row <- 0L
@@ -108,6 +113,7 @@ Predictor <- R6::R6Class(
           , as.integer(rawscore)
           , as.integer(predleaf)
           , as.integer(predcontrib)
+          , as.integer(start_iteration)
           , as.integer(num_iteration)
           , private$params
           , lgb.c_str(tmp_filename)
@@ -134,6 +140,7 @@ Predictor <- R6::R6Class(
           , as.integer(rawscore)
           , as.integer(predleaf)
           , as.integer(predcontrib)
+          , as.integer(start_iteration)
           , as.integer(num_iteration)
         )
@@ -156,6 +163,7 @@ Predictor <- R6::R6Class(
           , as.integer(rawscore)
           , as.integer(predleaf)
           , as.integer(predcontrib)
+          , as.integer(start_iteration)
           , as.integer(num_iteration)
           , private$params
         )
@@ -178,6 +186,7 @@ Predictor <- R6::R6Class(
           , as.integer(rawscore)
           , as.integer(predleaf)
           , as.integer(predcontrib)
+          , as.integer(start_iteration)
           , as.integer(num_iteration)
           , private$params
         )
diff --git a/R-package/man/predict.lgb.Booster.Rd b/R-package/man/predict.lgb.Booster.Rd
index 395c2d45ea37..3f56d0886648 100644
--- a/R-package/man/predict.lgb.Booster.Rd
+++ b/R-package/man/predict.lgb.Booster.Rd
@@ -7,6 +7,7 @@
 \method{predict}{lgb.Booster}(
   object,
   data,
+  start_iteration = NULL,
   num_iteration = NULL,
   rawscore = FALSE,
   predleaf = FALSE,
@@ -21,7 +22,15 @@
 \item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename}

-\item{num_iteration}{number of iteration want to predict with, NULL or <= 0 means use best iteration}
+\item{start_iteration}{int or None, optional (default=None)
+Start index of the iteration to predict.
+If None or <= 0, starts from the first iteration.}
+
+\item{num_iteration}{int or None, optional (default=None)
+Limit number of iterations in the prediction.
+If None, if the best iteration exists and start_iteration is None or <= 0, the
+best iteration is used; otherwise, all iterations from start_iteration are used.
+If <= 0, all iterations from start_iteration are used (no limits).}

 \item{rawscore}{whether the prediction should be returned in the form of original untransformed
 sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE}
for logistic regression would result in predictions for log-odds instead of probabilities.}
diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp
index 8aff687378dd..f6dc82e9bd04 100644
--- a/R-package/src/lightgbm_R.cpp
+++ b/R-package/src/lightgbm_R.cpp
@@ -541,6 +541,7 @@ LGBM_SE LGBM_BoosterPredictForFile_R(LGBM_SE handle,
     LGBM_SE is_rawscore,
     LGBM_SE is_leafidx,
     LGBM_SE is_predcontrib,
+    LGBM_SE start_iteration,
     LGBM_SE num_iteration,
     LGBM_SE parameter,
     LGBM_SE result_filename,
@@ -548,7 +549,7 @@ LGBM_SE LGBM_BoosterPredictForFile_R(LGBM_SE handle,
   R_API_BEGIN();
   int pred_type = GetPredictType(is_rawscore, is_leafidx, is_predcontrib);
   CHECK_CALL(LGBM_BoosterPredictForFile(R_GET_PTR(handle), R_CHAR_PTR(data_filename),
-    R_AS_INT(data_has_header), pred_type, R_AS_INT(num_iteration), R_CHAR_PTR(parameter),
+    R_AS_INT(data_has_header), pred_type, R_AS_INT(start_iteration), R_AS_INT(num_iteration), R_CHAR_PTR(parameter),
     R_CHAR_PTR(result_filename)));
   R_API_END();
 }
@@ -558,6 +559,7 @@ LGBM_SE LGBM_BoosterCalcNumPredict_R(LGBM_SE handle,
     LGBM_SE is_rawscore,
     LGBM_SE is_leafidx,
     LGBM_SE is_predcontrib,
+    LGBM_SE start_iteration,
     LGBM_SE num_iteration,
     LGBM_SE out_len,
     LGBM_SE call_state) {
@@ -565,7 +567,7 @@ LGBM_SE LGBM_BoosterCalcNumPredict_R(LGBM_SE handle,
   int pred_type = GetPredictType(is_rawscore, is_leafidx, is_predcontrib);
   int64_t len = 0;
   CHECK_CALL(LGBM_BoosterCalcNumPredict(R_GET_PTR(handle), R_AS_INT(num_row),
-    pred_type, R_AS_INT(num_iteration), &len));
+    pred_type, R_AS_INT(start_iteration), R_AS_INT(num_iteration), &len));
   R_INT_PTR(out_len)[0] = static_cast<int>(len);
   R_API_END();
 }
@@ -580,6 +582,7 @@ LGBM_SE LGBM_BoosterPredictForCSC_R(LGBM_SE handle,
     LGBM_SE is_rawscore,
     LGBM_SE is_leafidx,
     LGBM_SE is_predcontrib,
+    LGBM_SE start_iteration,
     LGBM_SE num_iteration,
     LGBM_SE parameter,
     LGBM_SE out_result,
@@ -599,7 +602,7 @@ LGBM_SE LGBM_BoosterPredictForCSC_R(LGBM_SE handle,
   CHECK_CALL(LGBM_BoosterPredictForCSC(R_GET_PTR(handle),
     p_indptr, C_API_DTYPE_INT32, p_indices,
     p_data, C_API_DTYPE_FLOAT64, nindptr, ndata,
-    nrow, pred_type, R_AS_INT(num_iteration), R_CHAR_PTR(parameter), &out_len, ptr_ret));
+    nrow, pred_type, R_AS_INT(start_iteration), R_AS_INT(num_iteration), R_CHAR_PTR(parameter), &out_len, ptr_ret));
   R_API_END();
 }
@@ -610,6 +613,7 @@ LGBM_SE LGBM_BoosterPredictForMat_R(LGBM_SE handle,
     LGBM_SE is_rawscore,
     LGBM_SE is_leafidx,
     LGBM_SE is_predcontrib,
+    LGBM_SE start_iteration,
     LGBM_SE num_iteration,
     LGBM_SE parameter,
     LGBM_SE out_result,
@@ -625,7 +629,7 @@ LGBM_SE LGBM_BoosterPredictForMat_R(LGBM_SE handle,
   int64_t out_len;
   CHECK_CALL(LGBM_BoosterPredictForMat(R_GET_PTR(handle),
     p_mat, C_API_DTYPE_FLOAT64, nrow, ncol, COL_MAJOR,
-    pred_type, R_AS_INT(num_iteration), R_CHAR_PTR(parameter), &out_len, ptr_ret));
+    pred_type, R_AS_INT(start_iteration), R_AS_INT(num_iteration), R_CHAR_PTR(parameter), &out_len, ptr_ret));
   R_API_END();
 }
@@ -706,10 +710,10 @@ static const R_CallMethodDef CallEntries[] = {
   {"LGBM_BoosterGetEval_R"          , (DL_FUNC) &LGBM_BoosterGetEval_R          , 4},
   {"LGBM_BoosterGetNumPredict_R"    , (DL_FUNC) &LGBM_BoosterGetNumPredict_R    , 4},
   {"LGBM_BoosterGetPredict_R"       , (DL_FUNC) &LGBM_BoosterGetPredict_R       , 4},
-  {"LGBM_BoosterPredictForFile_R"   , (DL_FUNC) &LGBM_BoosterPredictForFile_R   , 10},
-  {"LGBM_BoosterCalcNumPredict_R"   , (DL_FUNC) &LGBM_BoosterCalcNumPredict_R   , 8},
-  {"LGBM_BoosterPredictForCSC_R"    , (DL_FUNC) &LGBM_BoosterPredictForCSC_R    , 14},
-  {"LGBM_BoosterPredictForMat_R"    , (DL_FUNC) &LGBM_BoosterPredictForMat_R    , 11},
+  {"LGBM_BoosterPredictForFile_R"   , (DL_FUNC) &LGBM_BoosterPredictForFile_R   , 11},
+  {"LGBM_BoosterCalcNumPredict_R"   , (DL_FUNC) &LGBM_BoosterCalcNumPredict_R   , 9},
+  {"LGBM_BoosterPredictForCSC_R"    , (DL_FUNC) &LGBM_BoosterPredictForCSC_R    , 15},
+  {"LGBM_BoosterPredictForMat_R"    , (DL_FUNC) &LGBM_BoosterPredictForMat_R    , 12},
   {"LGBM_BoosterSaveModel_R"        , (DL_FUNC) &LGBM_BoosterSaveModel_R        , 5},
   {"LGBM_BoosterSaveModelToString_R", (DL_FUNC) &LGBM_BoosterSaveModelToString_R, 7},
   {"LGBM_BoosterDumpModel_R"        , (DL_FUNC) &LGBM_BoosterDumpModel_R        , 7},
diff --git a/R-package/src/lightgbm_R.h b/R-package/src/lightgbm_R.h
index ff4b2cf9fb0e..2b0ddf0cc9f1 100644
--- a/R-package/src/lightgbm_R.h
+++ b/R-package/src/lightgbm_R.h
@@ -489,6 +489,7 @@ LIGHTGBM_C_EXPORT LGBM_SE LGBM_BoosterPredictForFile_R(
   LGBM_SE is_rawscore,
   LGBM_SE is_leafidx,
   LGBM_SE is_predcontrib,
+  LGBM_SE start_iteration,
   LGBM_SE num_iteration,
   LGBM_SE parameter,
   LGBM_SE result_filename,
@@ -511,6 +512,7 @@ LIGHTGBM_C_EXPORT LGBM_SE LGBM_BoosterCalcNumPredict_R(
   LGBM_SE is_rawscore,
   LGBM_SE is_leafidx,
   LGBM_SE is_predcontrib,
+  LGBM_SE start_iteration,
   LGBM_SE num_iteration,
   LGBM_SE out_len,
   LGBM_SE call_state
@@ -545,6 +547,7 @@ LIGHTGBM_C_EXPORT LGBM_SE LGBM_BoosterPredictForCSC_R(
   LGBM_SE is_rawscore,
   LGBM_SE is_leafidx,
   LGBM_SE is_predcontrib,
+  LGBM_SE start_iteration,
   LGBM_SE num_iteration,
   LGBM_SE parameter,
   LGBM_SE out_result,
@@ -574,6 +577,7 @@ LIGHTGBM_C_EXPORT LGBM_SE LGBM_BoosterPredictForMat_R(
   LGBM_SE is_rawscore,
   LGBM_SE is_leafidx,
   LGBM_SE is_predcontrib,
+  LGBM_SE start_iteration,
   LGBM_SE num_iteration,
   LGBM_SE parameter,
   LGBM_SE out_result,
diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R
index c44f0497e4c3..d193cd87712e 100644
--- a/R-package/tests/testthat/test_Predictor.R
+++ b/R-package/tests/testthat/test_Predictor.R
@@ -17,3 +17,63 @@ test_that("predictions do not fail for integer input", {
   pred_double <- predict(fit, X_double)
   expect_equal(pred_integer, pred_double)
 })
+
+test_that("start_iteration works correctly", {
+  set.seed(708L)
+  data(agaricus.train, package = "lightgbm")
+  data(agaricus.test, package = "lightgbm")
+  train <- agaricus.train
+  test <- agaricus.test
+  dtrain <- lgb.Dataset(
+    agaricus.train$data
+    , label = agaricus.train$label
+  )
+  dtest <- lgb.Dataset.create.valid(
+    dtrain
+    , agaricus.test$data
+    , label = agaricus.test$label
+  )
+  bst <- lightgbm(
+    data = as.matrix(train$data)
+    , label = train$label
+    , num_leaves = 4L
+    , learning_rate = 0.6
+    , nrounds = 100L
+    , objective = "binary"
+    , save_name = tempfile(fileext = ".model")
+    , valids = list("test" = dtest)
+    , early_stopping_rounds = 2L
+  )
+  expect_true(lgb.is.Booster(bst))
+  pred1 <- predict(bst, data = test$data, rawscore = TRUE)
+  pred_contrib1 <- predict(bst, test$data, predcontrib = TRUE)
+  pred2 <- rep(0.0, length(pred1))
+  pred_contrib2 <- rep(0.0, length(pred_contrib1))
+  step <- 11L
+  end_iter <- 99L
+  if (bst$best_iter != -1L) {
+    end_iter <- bst$best_iter - 1L
+  }
+  start_iters <- seq(0L, end_iter, by = step)
+  for (start_iter in start_iters) {
+    n_iter <- min(c(end_iter - start_iter + 1L, step))
+    inc_pred <- predict(bst, test$data
+      , start_iteration = start_iter
+      , num_iteration = n_iter
+      , rawscore = TRUE
+    )
+    inc_pred_contrib <- bst$predict(test$data
+      , start_iteration = start_iter
+      , num_iteration = n_iter
+      , predcontrib = TRUE
+    )
+    pred2 <- pred2 + inc_pred
+    pred_contrib2 <- pred_contrib2 + inc_pred_contrib
+  }
+  expect_equal(pred2, pred1)
+  expect_equal(pred_contrib2, pred_contrib1)
+
+  pred_leaf1 <- predict(bst, test$data, predleaf = TRUE)
+  pred_leaf2 <- predict(bst, test$data, start_iteration = 0L, num_iteration = end_iter + 1L, predleaf = TRUE)
+  expect_equal(pred_leaf1, pred_leaf2)
+})
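The additivity property the R test above relies on can be sketched in Python as well (illustrative only; toy data, and it assumes a LightGBM build that includes this change so that `Booster.predict()` accepts `start_iteration`):

```python
import numpy as np
import lightgbm as lgb

# Toy data, purely illustrative
rng = np.random.RandomState(708)
X = rng.rand(500, 10)
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)

bst = lgb.train({"objective": "binary", "verbose": -1},
                lgb.Dataset(X, label=y), num_boost_round=20)

full = bst.predict(X, raw_score=True)  # all 20 iterations in one call
# Raw scores are additive over disjoint iteration ranges: [0, 5), [5, 10), ...
parts = sum(bst.predict(X, start_iteration=s, num_iteration=5, raw_score=True)
            for s in range(0, 20, 5))
np.testing.assert_allclose(parts, full)
```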
diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 483bbcd711e1..14d7a8098cf8 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -767,6 +767,14 @@ Dataset Parameters
 Predict Parameters
 ~~~~~~~~~~~~~~~~~~

+-  ``start_iteration_predict`` :raw-html:`🔗︎`, default = ``0``, type = int
+
+   -  used only in ``prediction`` task
+
+   -  used to specify from which iteration to start the prediction
+
+   -  ``<= 0`` means from the first iteration
+
 -  ``num_iteration_predict`` :raw-html:`🔗︎`, default = ``-1``, type = int

    -  used only in ``prediction`` task
diff --git a/include/LightGBM/boosting.h b/include/LightGBM/boosting.h
index 92f427655106..ca198d37671e 100644
--- a/include/LightGBM/boosting.h
+++ b/include/LightGBM/boosting.h
@@ -123,7 +123,7 @@ class LIGHTGBM_EXPORT Boosting {
   */
   virtual void GetPredictAt(int data_idx, double* result, int64_t* out_len) = 0;

-  virtual int NumPredictOneRow(int num_iteration, bool is_pred_leaf, bool is_pred_contrib) const = 0;
+  virtual int NumPredictOneRow(int start_iteration, int num_iteration, bool is_pred_leaf, bool is_pred_contrib) const = 0;

   /*!
   * \brief Prediction for one record, not sigmoid transform
@@ -284,10 +284,11 @@ class LIGHTGBM_EXPORT Boosting {
   /*!
   * \brief Initial work for the prediction
+  * \param start_iteration Start index of the iteration to predict
   * \param num_iteration number of used iteration
   * \param is_pred_contrib
   */
-  virtual void InitPredict(int num_iteration, bool is_pred_contrib) = 0;
+  virtual void InitPredict(int start_iteration, int num_iteration, bool is_pred_contrib) = 0;

   /*!
   * \brief Name of submodel
diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h
index 86e5a086b7ff..ea34343362b2 100644
--- a/include/LightGBM/c_api.h
+++ b/include/LightGBM/c_api.h
@@ -675,6 +675,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterGetPredict(BoosterHandle handle,
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iterations for prediction, <= 0 means no limit
  * \param parameter Other parameters for prediction, e.g. early stopping for prediction
 * \param result_filename Filename of result file in which predictions will be written
@@ -684,6 +685,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForFile(BoosterHandle handle,
                                                  const char* data_filename,
                                                  int data_has_header,
                                                  int predict_type,
+                                                 int start_iteration,
                                                  int num_iteration,
                                                  const char* parameter,
                                                  const char* result_filename);
@@ -697,6 +699,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForFile(BoosterHandle handle,
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iterations for prediction, <= 0 means no limit
  * \param[out] out_len Length of prediction
  * \return 0 when succeed, -1 when failure happens
@@ -704,6 +707,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForFile(BoosterHandle handle,
 LIGHTGBM_C_EXPORT int LGBM_BoosterCalcNumPredict(BoosterHandle handle,
                                                  int num_row,
                                                  int predict_type,
+                                                 int start_iteration,
                                                  int num_iteration,
                                                  int64_t* out_len);
@@ -736,6 +740,7 @@ LIGHTGBM_C_EXPORT int LGBM_FastConfigFree(FastConfigHandle fastConfig);
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iterations for prediction, <= 0 means no limit
  * \param parameter Other parameters for prediction, e.g. early stopping for prediction
  * \param[out] out_len Length of output result
@@ -752,6 +757,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSR(BoosterHandle handle,
                                                 int64_t nelem,
                                                 int64_t num_col,
                                                 int predict_type,
+                                                int start_iteration,
                                                 int num_iteration,
                                                 const char* parameter,
                                                 int64_t* out_len,
@@ -775,6 +781,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSR(BoosterHandle handle,
  * \param num_col_or_row Number of columns for CSR or number of rows for CSC
  * \param predict_type What should be predicted, only feature contributions supported currently
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iterations for prediction, <= 0 means no limit
  * \param parameter Other parameters for prediction, e.g. early stopping for prediction
  * \param matrix_type Type of matrix input and output, can be ``C_API_MATRIX_TYPE_CSR`` or ``C_API_MATRIX_TYPE_CSC``
@@ -794,6 +801,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictSparseOutput(BoosterHandle handle,
                                                       int64_t nelem,
                                                       int64_t num_col_or_row,
                                                       int predict_type,
+                                                      int start_iteration,
                                                       int num_iteration,
                                                       const char* parameter,
                                                       int matrix_type,
@@ -835,6 +843,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterFreePredictSparse(void* indptr, int32_t* indic
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iterations for prediction, <= 0 means no limit
  * \param parameter Other parameters for prediction, e.g. early stopping for prediction
 * \param[out] out_len Length of output result
@@ -851,6 +860,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle,
                                                          int64_t nelem,
                                                          int64_t num_col,
                                                          int predict_type,
+                                                         int start_iteration,
                                                          int num_iteration,
                                                          const char* parameter,
                                                          int64_t* out_len,
@@ -867,6 +877,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle,
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iterations for prediction, <= 0 means no limit
  * \param data_type Type of ``data`` pointer, can be ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64``
  * \param num_col Number of columns
@@ -876,6 +887,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle,
 LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRowFastInit(BoosterHandle handle,
                                                                  const int predict_type,
+                                                                 const int start_iteration,
                                                                  const int num_iteration,
                                                                  const int data_type,
                                                                  const int64_t num_col,
@@ -944,6 +956,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRowFast(FastConfigHandle fa
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iteration for prediction, <= 0 means no limit
  * \param parameter Other parameters for prediction, e.g. early stopping for prediction
  * \param[out] out_len Length of output result
@@ -960,6 +973,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSC(BoosterHandle handle,
                                                 int64_t nelem,
                                                 int64_t num_row,
                                                 int predict_type,
+                                                int start_iteration,
                                                 int num_iteration,
                                                 const char* parameter,
                                                 int64_t* out_len,
@@ -983,6 +997,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSC(BoosterHandle handle,
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iteration for prediction, <= 0 means no limit
  * \param parameter Other parameters for prediction, e.g. early stopping for prediction
  * \param[out] out_len Length of output result
@@ -996,6 +1011,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMat(BoosterHandle handle,
                                                 int32_t ncol,
                                                 int is_row_major,
                                                 int predict_type,
+                                                int start_iteration,
                                                 int num_iteration,
                                                 const char* parameter,
                                                 int64_t* out_len,
@@ -1019,6 +1035,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMat(BoosterHandle handle,
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iteration for prediction, <= 0 means no limit
  * \param parameter Other parameters for prediction, e.g. early stopping for prediction
 * \param[out] out_len Length of output result
@@ -1031,6 +1048,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle,
                                                          int ncol,
                                                          int is_row_major,
                                                          int predict_type,
+                                                         int start_iteration,
                                                          int num_iteration,
                                                          const char* parameter,
                                                          int64_t* out_len,
@@ -1047,6 +1065,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle,
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iterations for prediction, <= 0 means no limit
  * \param data_type Type of ``data`` pointer, can be ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64``
  * \param ncol Number of columns
@@ -1056,6 +1075,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle,
 LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMatSingleRowFastInit(BoosterHandle handle,
                                                                  const int predict_type,
+                                                                 const int start_iteration,
                                                                  const int num_iteration,
                                                                  const int data_type,
                                                                  const int32_t ncol,
@@ -1104,6 +1124,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMatSingleRowFast(FastConfigHandle fa
  *        - ``C_API_PREDICT_RAW_SCORE``: raw score;
  *        - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
  *        - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
  * \param num_iteration Number of iteration for prediction, <= 0 means no limit
  * \param parameter Other parameters for prediction, e.g. early stopping for prediction
  * \param[out] out_len Length of output result
@@ -1116,6 +1137,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMats(BoosterHandle handle,
                                                  int32_t nrow,
                                                  int32_t ncol,
                                                  int predict_type,
+                                                 int start_iteration,
                                                  int num_iteration,
                                                  const char* parameter,
                                                  int64_t* out_len,
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index ef2d68c76cfd..bfcb09a40049 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -684,6 +684,12 @@ struct Config {

   #pragma region Predict Parameters

+  // [no-save]
+  // desc = used only in ``prediction`` task
+  // desc = used to specify from which iteration to start the prediction
+  // desc = ``<= 0`` means from the first iteration
+  int start_iteration_predict = 0;
+
   // [no-save]
   // desc = used only in ``prediction`` task
   // desc = used to specify how many trained iterations will be used in prediction
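Since `start_iteration_predict` is a CLI parameter for the prediction task, a hypothetical config file illustrating it might look like the following (file names are invented; the parameter names are the ones documented in Parameters.rst):

```
task = predict
data = test.txt
input_model = LightGBM_model.txt
start_iteration_predict = 50
num_iteration_predict = 25
output_result = predict_result.txt
```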
diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
index ea0d9eec12d4..a09841519be2 100644
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -519,7 +519,7 @@ def __getstate__(self):
         this.pop('handle', None)
         return this

-    def predict(self, data, num_iteration=-1,
+    def predict(self, data, start_iteration=0, num_iteration=-1,
                 raw_score=False, pred_leaf=False, pred_contrib=False,
                 data_has_header=False, is_reshape=True):
         """Predict logic.

@@ -529,6 +529,8 @@ def predict(self, data, num_iteration=-1,
         data : string, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
             Data source for prediction.
             When data type is string, it represents the path of txt file.
+        start_iteration : int, optional (default=0)
+            Start index of the iteration to predict.
         num_iteration : int, optional (default=-1)
             Iteration used for prediction.
         raw_score : bool, optional (default=False)
@@ -560,8 +562,6 @@ def predict(self, data, num_iteration=-1,
         if pred_contrib:
             predict_type = C_API_PREDICT_CONTRIB
         int_data_has_header = 1 if data_has_header else 0
-        if num_iteration > self.num_total_iteration:
-            num_iteration = self.num_total_iteration

         if isinstance(data, string_type):
             with _TempFile() as f:
@@ -570,6 +570,7 @@ def predict(self, data, num_iteration=-1,
                     c_str(data),
                     ctypes.c_int(int_data_has_header),
                     ctypes.c_int(predict_type),
+                    ctypes.c_int(start_iteration),
                     ctypes.c_int(num_iteration),
                     c_str(self.pred_parameter),
                     c_str(f.name)))
@@ -578,26 +579,26 @@ def predict(self, data, num_iteration=-1,
             preds = [float(token) for line in lines for token in line.split('\t')]
             preds = np.array(preds, dtype=np.float64, copy=False)
         elif isinstance(data, scipy.sparse.csr_matrix):
-            preds, nrow = self.__pred_for_csr(data, num_iteration, predict_type)
+            preds, nrow = self.__pred_for_csr(data, start_iteration, num_iteration, predict_type)
         elif isinstance(data, scipy.sparse.csc_matrix):
-            preds, nrow = self.__pred_for_csc(data, num_iteration, predict_type)
+            preds, nrow = self.__pred_for_csc(data, start_iteration, num_iteration, predict_type)
         elif isinstance(data, np.ndarray):
-            preds, nrow = self.__pred_for_np2d(data, num_iteration, predict_type)
+            preds, nrow = self.__pred_for_np2d(data, start_iteration, num_iteration, predict_type)
         elif isinstance(data, list):
             try:
                 data = np.array(data)
             except BaseException:
                 raise ValueError('Cannot convert data list to numpy array.')
-            preds, nrow = self.__pred_for_np2d(data, num_iteration, predict_type)
+            preds, nrow = self.__pred_for_np2d(data, start_iteration, num_iteration, predict_type)
         elif isinstance(data, DataTable):
-            preds, nrow = self.__pred_for_np2d(data.to_numpy(), num_iteration, predict_type)
+            preds, nrow = self.__pred_for_np2d(data.to_numpy(), start_iteration, num_iteration, predict_type)
         else:
             try:
                 warnings.warn('Converting data to scipy sparse matrix.')
                 csr = scipy.sparse.csr_matrix(data)
             except BaseException:
                 raise TypeError('Cannot predict data for type {}'.format(type(data).__name__))
-            preds, nrow = self.__pred_for_csr(csr, num_iteration, predict_type)
+            preds, nrow = self.__pred_for_csr(csr, start_iteration, num_iteration, predict_type)
         if pred_leaf:
             preds = preds.astype(np.int32)
         is_sparse = scipy.sparse.issparse(preds) or isinstance(preds, list)
@@ -609,7 +610,7 @@ def predict(self, data, num_iteration=-1,
                              % (preds.size, nrow))
         return preds

-    def __get_num_preds(self, num_iteration, nrow, predict_type):
+    def __get_num_preds(self, start_iteration, num_iteration, nrow, predict_type):
         """Get size of prediction result."""
         if nrow > MAX_INT32:
             raise LightGBMError('LightGBM cannot perform prediction for data'
@@ -621,22 +622,23 @@ def __get_num_preds(self, num_iteration, nrow, predict_type):
             self.handle,
             ctypes.c_int(nrow),
             ctypes.c_int(predict_type),
+            ctypes.c_int(start_iteration),
             ctypes.c_int(num_iteration),
             ctypes.byref(n_preds)))
         return n_preds.value

-    def __pred_for_np2d(self, mat, num_iteration, predict_type):
+    def __pred_for_np2d(self, mat, start_iteration, num_iteration, predict_type):
         """Predict for a 2-D numpy matrix."""
         if len(mat.shape) != 2:
             raise ValueError('Input numpy.ndarray or list must be 2 dimensional')

-        def inner_predict(mat, num_iteration, predict_type, preds=None):
+        def inner_predict(mat, start_iteration, num_iteration, predict_type, preds=None):
             if mat.dtype == np.float32 or mat.dtype == np.float64:
                 data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
             else:  # change non-float data to float data, need to copy
                 data = np.array(mat.reshape(mat.size), dtype=np.float32)
             ptr_data, type_ptr_data, _ = c_float_array(data)
-            n_preds = self.__get_num_preds(num_iteration, mat.shape[0], predict_type)
+            n_preds = self.__get_num_preds(start_iteration, num_iteration, mat.shape[0], predict_type)
             if preds is None:
                 preds = np.zeros(n_preds, dtype=np.float64)
             elif len(preds.shape) != 1 or len(preds) != n_preds:
@@ -650,6 +652,7 @@ def inner_predict(mat, num_iteration, predict_type, preds=None):
                 ctypes.c_int(mat.shape[1]),
                 ctypes.c_int(C_API_IS_ROW_MAJOR),
                 ctypes.c_int(predict_type),
+                ctypes.c_int(start_iteration),
                 ctypes.c_int(num_iteration),
                 c_str(self.pred_parameter),
                 ctypes.byref(out_num_preds),
@@ -662,16 +665,16 @@ def inner_predict(mat, num_iteration, predict_type, preds=None):
         if nrow > MAX_INT32:
             sections = np.arange(start=MAX_INT32, stop=nrow, step=MAX_INT32)
             # __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal
-            n_preds = [self.__get_num_preds(num_iteration, i, predict_type) for i in np.diff([0] + list(sections) + [nrow])]
+            n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff([0] + list(sections) + [nrow])]
             n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum()
             preds = np.zeros(sum(n_preds), dtype=np.float64)
             for chunk, (start_idx_pred, end_idx_pred) in zip_(np.array_split(mat, sections),
                                                               zip_(n_preds_sections, n_preds_sections[1:])):
                 # avoid memory consumption by arrays concatenation operations
-                inner_predict(chunk, num_iteration, predict_type, preds[start_idx_pred:end_idx_pred])
+                inner_predict(chunk, start_iteration, num_iteration, predict_type, preds[start_idx_pred:end_idx_pred])
             return preds, nrow
         else:
-            return inner_predict(mat, num_iteration, predict_type)
+            return inner_predict(mat, start_iteration, num_iteration, predict_type)

     def __create_sparse_native(self, cs, out_shape, out_ptr_indptr, out_ptr_indices, out_ptr_data,
                                indptr_type, data_type, is_csr=True):
@@ -719,11 +722,11 @@ def __create_sparse_native(self, cs, out_shape, out_ptr_indptr, out_ptr_indices,
             return cs_output_matrices[0]
         return cs_output_matrices

-    def __pred_for_csr(self, csr, num_iteration, predict_type):
+    def __pred_for_csr(self, csr, start_iteration, num_iteration, predict_type):
         """Predict for a CSR data."""
-        def inner_predict(csr, num_iteration, predict_type, preds=None):
+        def inner_predict(csr, start_iteration, num_iteration, predict_type, preds=None):
             nrow = len(csr.indptr) - 1
-            n_preds = self.__get_num_preds(num_iteration, nrow, predict_type)
+            n_preds = self.__get_num_preds(start_iteration, num_iteration, nrow, predict_type)
             if preds is None:
                 preds = np.zeros(n_preds, dtype=np.float64)
             elif len(preds.shape) != 1 or len(preds) != n_preds:
@@ -747,6 +750,7 @@ def inner_predict(csr, num_iteration, predict_type, preds=None):
                 ctypes.c_int64(len(csr.data)),
                 ctypes.c_int64(csr.shape[1]),
                 ctypes.c_int(predict_type),
+                ctypes.c_int(start_iteration),
                 ctypes.c_int(num_iteration),
                 c_str(self.pred_parameter),
                 ctypes.byref(out_num_preds),
@@ -755,7 +759,7 @@ def inner_predict(csr, num_iteration, predict_type, preds=None):
                 raise ValueError("Wrong length for predict results")
             return preds, nrow

-        def inner_predict_sparse(csr, num_iteration, predict_type):
+        def inner_predict_sparse(csr, start_iteration, num_iteration, predict_type):
            ptr_indptr, type_ptr_indptr, __ = c_int_array(csr.indptr)
            ptr_data, type_ptr_data, _ = c_float_array(csr.data)
            csr_indices = csr.indices.astype(np.int32, copy=False)
@@ -781,6 +785,7 @@ def inner_predict_sparse(csr, num_iteration, predict_type):
                 ctypes.c_int64(len(csr.data)),
                 ctypes.c_int64(csr.shape[1]),
                 ctypes.c_int(predict_type),
+                ctypes.c_int(start_iteration),
                 ctypes.c_int(num_iteration),
                 c_str(self.pred_parameter),
                 ctypes.c_int(matrix_type),
@@ -794,25 +799,25 @@ def inner_predict_sparse(csr, num_iteration, predict_type):
             return matrices, nrow

         if predict_type == C_API_PREDICT_CONTRIB:
-            return inner_predict_sparse(csr, num_iteration, predict_type)
+            return inner_predict_sparse(csr, start_iteration, num_iteration, predict_type)
         nrow = len(csr.indptr) - 1
         if nrow > MAX_INT32:
             sections = [0] + list(np.arange(start=MAX_INT32, stop=nrow, step=MAX_INT32)) + [nrow]
             # __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal
-            n_preds = [self.__get_num_preds(num_iteration, i, predict_type) for i in np.diff(sections)]
+            n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff(sections)]
             n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum()
             preds = np.zeros(sum(n_preds), dtype=np.float64)
             for (start_idx, end_idx), (start_idx_pred, end_idx_pred) in zip_(zip_(sections, sections[1:]),
                                                                              zip_(n_preds_sections, n_preds_sections[1:])):
                 # avoid memory consumption by arrays concatenation operations
-                inner_predict(csr[start_idx:end_idx], num_iteration, predict_type, preds[start_idx_pred:end_idx_pred])
+                inner_predict(csr[start_idx:end_idx], start_iteration, num_iteration, predict_type, preds[start_idx_pred:end_idx_pred])
             return preds, nrow
         else:
-            return inner_predict(csr, num_iteration, predict_type)
+            return inner_predict(csr, start_iteration, num_iteration, predict_type)

-    def __pred_for_csc(self, csc, num_iteration, predict_type):
+    def __pred_for_csc(self, csc, start_iteration, num_iteration, predict_type):
         """Predict for a CSC data."""
-        def inner_predict_sparse(csc, num_iteration, predict_type):
+        def inner_predict_sparse(csc, start_iteration, num_iteration, predict_type):
             ptr_indptr, type_ptr_indptr, __ = c_int_array(csc.indptr)
             ptr_data, type_ptr_data, _ = c_float_array(csc.data)
             csc_indices = csc.indices.astype(np.int32, copy=False)
@@ -838,6 +843,7 @@ def inner_predict_sparse(csc, num_iteration, predict_type):
                 ctypes.c_int64(len(csc.data)),
                 ctypes.c_int64(csc.shape[0]),
                 ctypes.c_int(predict_type),
+                ctypes.c_int(start_iteration),
                 ctypes.c_int(num_iteration),
                 c_str(self.pred_parameter),
                 ctypes.c_int(matrix_type),
@@ -852,10 +858,10 @@ def inner_predict_sparse(csc, num_iteration, predict_type):

         nrow = csc.shape[0]
         if nrow > MAX_INT32:
-            return self.__pred_for_csr(csc.tocsr(), num_iteration, predict_type)
+            return self.__pred_for_csr(csc.tocsr(), start_iteration, num_iteration, predict_type)
         if predict_type == C_API_PREDICT_CONTRIB:
-            return inner_predict_sparse(csc, num_iteration, predict_type)
-        n_preds = self.__get_num_preds(num_iteration, nrow, predict_type)
+            return inner_predict_sparse(csc, start_iteration, num_iteration, predict_type)
+        n_preds = self.__get_num_preds(start_iteration, num_iteration, nrow, predict_type)
         preds = np.zeros(n_preds, dtype=np.float64)
         out_num_preds = ctypes.c_int64(0)
@@ -876,6 +882,7 @@ def inner_predict_sparse(csc, num_iteration, predict_type):
             ctypes.c_int64(len(csc.data)),
             ctypes.c_int64(csc.shape[0]),
             ctypes.c_int(predict_type),
+            ctypes.c_int(start_iteration),
             ctypes.c_int(num_iteration),
             c_str(self.pred_parameter),
             ctypes.byref(out_num_preds),
@@ -2806,7 +2813,7 @@ def dump_model(self, num_iteration=None, start_iteration=0, importance_type='spl
                                            default=json_default_with_numpy))
         return ret

-    def predict(self, data, num_iteration=None,
+    def predict(self, data, start_iteration=None, num_iteration=None,
                 raw_score=False, pred_leaf=False, pred_contrib=False,
                 data_has_header=False, is_reshape=True, **kwargs):
         """Make a prediction.

@@ -2816,10 +2823,14 @@ def predict(self, data, num_iteration=None,
         data : string, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
             Data source for prediction.
             If string, it represents the path to txt file.
+        start_iteration : int or None, optional (default=None)
+            Start index of the iteration to predict.
+            If None or <= 0, starts from the first iteration.
         num_iteration : int or None, optional (default=None)
             Limit number of iterations in the prediction.
-            If None, if the best iteration exists, it is used; otherwise, all iterations are used.
-            If <= 0, all iterations are used (no limits).
+            If None, if the best iteration exists and start_iteration is None or <= 0, the best iteration is used;
+            otherwise, all iterations from start_iteration are used.
+            If <= 0, all iterations from start_iteration are used (no limits).
         raw_score : bool, optional (default=False)
             Whether to predict raw scores.
         pred_leaf : bool, optional (default=False)
@@ -2850,9 +2861,14 @@ def predict(self, data, num_iteration=None,
             Can be sparse or a list of sparse objects (each element represents predictions for one class) for feature contributions (when ``pred_contrib=True``).
         """
         predictor = self._to_predictor(copy.deepcopy(kwargs))
+        if start_iteration is None or start_iteration < 0:
+            start_iteration = 0
         if num_iteration is None:
-            num_iteration = self.best_iteration
-        return predictor.predict(data, num_iteration,
+            if start_iteration == 0:
+                num_iteration = self.best_iteration
+            else:
+                num_iteration = -1
+        return predictor.predict(data, start_iteration, num_iteration,
                                  raw_score, pred_leaf, pred_contrib,
                                  data_has_header, is_reshape)
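A short sketch of how the resolution logic above behaves from the caller's side (toy data, illustrative only; assumes a build with this change):

```python
import numpy as np
import lightgbm as lgb

X = np.random.rand(200, 5)
y = (X[:, 0] > 0.5).astype(int)
bst = lgb.train({"objective": "binary", "verbose": -1},
                lgb.Dataset(X, label=y), num_boost_round=30)

bst.predict(X)                                       # best_iteration if set, else all 30 iterations
bst.predict(X, start_iteration=10)                   # iterations 10..29; best_iteration is ignored
bst.predict(X, start_iteration=10, num_iteration=5)  # iterations 10..14
```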
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index d7250c83eba4..1dd1c0b7c566 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -612,7 +612,7 @@ def _get_meta_data(collection, name, i):
         del train_set, valid_sets
         return self

-    def predict(self, X, raw_score=False, num_iteration=None,
+    def predict(self, X, raw_score=False, start_iteration=None, num_iteration=None,
                 pred_leaf=False, pred_contrib=False, **kwargs):
         """Return the predicted value for each sample.

@@ -622,6 +622,9 @@ def predict(self, X, raw_score=False, num_iteration=None,
             Input features matrix.
         raw_score : bool, optional (default=False)
             Whether to predict raw scores.
+        start_iteration : int or None, optional (default=None)
+            Start index of the iteration to predict.
+            If None or <= 0, starts from the first iteration.
         num_iteration : int or None, optional (default=None)
             Limit number of iterations in the prediction.
             If None, if the best iteration exists, it is used; otherwise, all trees are used.
@@ -661,7 +664,7 @@ def predict(self, X, raw_score=False, num_iteration=None,
                              "match the input. Model n_features_ is %s and "
                              "input n_features is %s " % (self._n_features, n_features))
-        return self._Booster.predict(X, raw_score=raw_score, num_iteration=num_iteration,
+        return self._Booster.predict(X, raw_score=raw_score, start_iteration=start_iteration, num_iteration=num_iteration,
                                      pred_leaf=pred_leaf, pred_contrib=pred_contrib, **kwargs)

     @property
@@ -832,10 +835,10 @@ def fit(self, X, y,

     fit.__doc__ = LGBMModel.fit.__doc__

-    def predict(self, X, raw_score=False, num_iteration=None,
+    def predict(self, X, raw_score=False, start_iteration=None, num_iteration=None,
                 pred_leaf=False, pred_contrib=False, **kwargs):
         """Docstring is inherited from the LGBMModel."""
-        result = self.predict_proba(X, raw_score, num_iteration,
+        result = self.predict_proba(X, raw_score, start_iteration, num_iteration,
                                     pred_leaf, pred_contrib, **kwargs)
         if callable(self._objective) or raw_score or pred_leaf or pred_contrib:
             return result
@@ -845,7 +848,7 @@ def predict(self, X, raw_score=False, num_iteration=None,

     predict.__doc__ = LGBMModel.predict.__doc__

-    def predict_proba(self, X, raw_score=False, num_iteration=None,
+    def predict_proba(self, X, raw_score=False, start_iteration=None, num_iteration=None,
                       pred_leaf=False, pred_contrib=False, **kwargs):
         """Return the predicted probability for each class for each sample.

@@ -855,6 +858,9 @@ def predict_proba(self, X, raw_score=False, num_iteration=None,
             Input features matrix.
         raw_score : bool, optional (default=False)
             Whether to predict raw scores.
+        start_iteration : int or None, optional (default=None)
+            Start index of the iteration to predict.
+            If None or <= 0, starts from the first iteration.
         num_iteration : int or None, optional (default=None)
             Limit number of iterations in the prediction.
             If None, if the best iteration exists, it is used; otherwise, all trees are used.
@@ -884,7 +890,7 @@ def predict_proba(self, X, raw_score=False, num_iteration=None,
         X_SHAP_values : array-like of shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects
             If ``pred_contrib=True``, the feature contributions for each sample.
""" - result = super(LGBMClassifier, self).predict(X, raw_score, num_iteration, + result = super(LGBMClassifier, self).predict(X, raw_score, start_iteration, num_iteration, pred_leaf, pred_contrib, **kwargs) if callable(self._objective) and not (raw_score or pred_leaf or pred_contrib): warnings.warn("Cannot compute class probabilities or labels " diff --git a/src/application/application.cpp b/src/application/application.cpp index eca5c5e97ae9..21163a5a30ea 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -88,7 +88,7 @@ void Application::LoadData() { PredictFunction predict_fun = nullptr; // need to continue training if (boosting_->NumberOfTotalModel() > 0 && config_.task != TaskType::KRefitTree) { - predictor.reset(new Predictor(boosting_.get(), -1, true, false, false, false, -1, -1)); + predictor.reset(new Predictor(boosting_.get(), 0, -1, true, false, false, false, -1, -1)); predict_fun = predictor->GetPredictFunction(); } @@ -213,7 +213,7 @@ void Application::Train() { void Application::Predict() { if (config_.task == TaskType::KRefitTree) { // create predictor - Predictor predictor(boosting_.get(), -1, false, true, false, false, 1, 1); + Predictor predictor(boosting_.get(), 0, -1, false, true, false, false, 1, 1); predictor.Predict(config_.data.c_str(), config_.output_result.c_str(), config_.header, config_.predict_disable_shape_check); TextReader result_reader(config_.output_result.c_str(), false); result_reader.ReadAllLines(); @@ -239,7 +239,7 @@ void Application::Predict() { Log::Info("Finished RefitTree"); } else { // create predictor - Predictor predictor(boosting_.get(), config_.num_iteration_predict, config_.predict_raw_score, + Predictor predictor(boosting_.get(), config_.start_iteration_predict, config_.num_iteration_predict, config_.predict_raw_score, config_.predict_leaf_index, config_.predict_contrib, config_.pred_early_stop, config_.pred_early_stop_freq, config_.pred_early_stop_margin); diff --git a/src/application/predictor.hpp b/src/application/predictor.hpp index 48ef227de2c6..a8454490fcf1 100644 --- a/src/application/predictor.hpp +++ b/src/application/predictor.hpp @@ -31,12 +31,13 @@ class Predictor { /*! 
diff --git a/src/application/application.cpp b/src/application/application.cpp
index eca5c5e97ae9..21163a5a30ea 100644
--- a/src/application/application.cpp
+++ b/src/application/application.cpp
@@ -88,7 +88,7 @@ void Application::LoadData() {
   PredictFunction predict_fun = nullptr;
   // need to continue training
   if (boosting_->NumberOfTotalModel() > 0 && config_.task != TaskType::KRefitTree) {
-    predictor.reset(new Predictor(boosting_.get(), -1, true, false, false, false, -1, -1));
+    predictor.reset(new Predictor(boosting_.get(), 0, -1, true, false, false, false, -1, -1));
     predict_fun = predictor->GetPredictFunction();
   }
@@ -213,7 +213,7 @@ void Application::Train() {

 void Application::Predict() {
   if (config_.task == TaskType::KRefitTree) {
     // create predictor
-    Predictor predictor(boosting_.get(), -1, false, true, false, false, 1, 1);
+    Predictor predictor(boosting_.get(), 0, -1, false, true, false, false, 1, 1);
     predictor.Predict(config_.data.c_str(), config_.output_result.c_str(), config_.header, config_.predict_disable_shape_check);
     TextReader<int> result_reader(config_.output_result.c_str(), false);
     result_reader.ReadAllLines();
@@ -239,7 +239,7 @@ void Application::Predict() {
     Log::Info("Finished RefitTree");
   } else {
     // create predictor
-    Predictor predictor(boosting_.get(), config_.num_iteration_predict, config_.predict_raw_score,
+    Predictor predictor(boosting_.get(), config_.start_iteration_predict, config_.num_iteration_predict, config_.predict_raw_score,
                         config_.predict_leaf_index, config_.predict_contrib,
                         config_.pred_early_stop, config_.pred_early_stop_freq,
                         config_.pred_early_stop_margin);
diff --git a/src/application/predictor.hpp b/src/application/predictor.hpp
index 48ef227de2c6..a8454490fcf1 100644
--- a/src/application/predictor.hpp
+++ b/src/application/predictor.hpp
@@ -31,12 +31,13 @@ class Predictor {
   /*!
   * \brief Constructor
   * \param boosting Input boosting model
+  * \param start_iteration Start index of the iteration to predict
   * \param num_iteration Number of boosting round
   * \param is_raw_score True if need to predict result with raw score
   * \param predict_leaf_index True to output leaf index instead of prediction score
   * \param predict_contrib True to output feature contributions instead of prediction score
   */
-  Predictor(Boosting* boosting, int num_iteration, bool is_raw_score,
+  Predictor(Boosting* boosting, int start_iteration, int num_iteration, bool is_raw_score,
             bool predict_leaf_index, bool predict_contrib,
             bool early_stop, int early_stop_freq, double early_stop_margin) {
     early_stop_ = CreatePredictionEarlyStopInstance(
@@ -56,9 +57,9 @@ class Predictor {
       }
     }

-    boosting->InitPredict(num_iteration, predict_contrib);
+    boosting->InitPredict(start_iteration, num_iteration, predict_contrib);
     boosting_ = boosting;
-    num_pred_one_row_ = boosting_->NumPredictOneRow(
+    num_pred_one_row_ = boosting_->NumPredictOneRow(start_iteration,
         num_iteration, predict_leaf_index, predict_contrib);
     num_feature_ = boosting_->MaxFeatureIdx() + 1;
     predict_buf_.resize(
diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp
index 064bebfec2f5..03f5fe25d554 100644
--- a/src/boosting/gbdt.cpp
+++ b/src/boosting/gbdt.cpp
@@ -574,7 +574,8 @@ void GBDT::PredictContrib(const double* features, double* output) const {
   // set zero
   const int num_features = max_feature_idx_ + 1;
   std::memset(output, 0, sizeof(double) * num_tree_per_iteration_ * (num_features + 1));
-  for (int i = 0; i < num_iteration_for_pred_; ++i) {
+  const int end_iteration_for_pred = start_iteration_for_pred_ + num_iteration_for_pred_;
+  for (int i = start_iteration_for_pred_; i < end_iteration_for_pred; ++i) {
     // predict all the trees for one iteration
     for (int k = 0; k < num_tree_per_iteration_; ++k) {
       models_[i * num_tree_per_iteration_ + k]->PredictContrib(features, num_features, output + k * (num_features + 1));
@@ -585,7 +586,8 @@ void GBDT::PredictContribByMap(const std::unordered_map<int, double>& features,
                                std::vector<std::unordered_map<int, double>>* output) const {
   const int num_features = max_feature_idx_ + 1;
-  for (int i = 0; i < num_iteration_for_pred_; ++i) {
+  const int end_iteration_for_pred = start_iteration_for_pred_ + num_iteration_for_pred_;
+  for (int i = start_iteration_for_pred_; i < end_iteration_for_pred; ++i) {
     // predict all the trees for one iteration
     for (int k = 0; k < num_tree_per_iteration_; ++k) {
       models_[i * num_tree_per_iteration_ + k]->PredictContribByMap(features, num_features, &((*output)[k]));
diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h
index dfd9a15d08e8..a84b321531f1 100644
--- a/src/boosting/gbdt.h
+++ b/src/boosting/gbdt.h
@@ -204,19 +204,22 @@ class GBDT : public GBDTBase {
   /*!
   * \brief Get number of prediction for one data
+  * \param start_iteration Start index of the iteration to predict
   * \param num_iteration number of used iterations
   * \param is_pred_leaf True if predicting leaf index
   * \param is_pred_contrib True if predicting feature contribution
   * \return number of prediction
   */
-  inline int NumPredictOneRow(int num_iteration, bool is_pred_leaf, bool is_pred_contrib) const override {
+  inline int NumPredictOneRow(int start_iteration, int num_iteration, bool is_pred_leaf, bool is_pred_contrib) const override {
     int num_pred_in_one_row = num_class_;
     if (is_pred_leaf) {
       int max_iteration = GetCurrentIteration();
+      start_iteration = std::max(start_iteration, 0);
+      start_iteration = std::min(start_iteration, max_iteration);
       if (num_iteration > 0) {
-        num_pred_in_one_row *= static_cast<int>(std::min(max_iteration, num_iteration));
+        num_pred_in_one_row *= static_cast<int>(std::min(max_iteration - start_iteration, num_iteration));
       } else {
-        num_pred_in_one_row *= max_iteration;
+        num_pred_in_one_row *= (max_iteration - start_iteration);
       }
     } else if (is_pred_contrib) {
       num_pred_in_one_row = num_tree_per_iteration_ * (max_feature_idx_ + 2);  // +1 for 0-based indexing, +1 for baseline
@@ -352,11 +355,16 @@ class GBDT : public GBDTBase {
   */
   inline int NumberOfClasses() const override { return num_class_; }

-  inline void InitPredict(int num_iteration, bool is_pred_contrib) override {
+  inline void InitPredict(int start_iteration, int num_iteration, bool is_pred_contrib) override {
     num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_tree_per_iteration_;
+    start_iteration = std::max(start_iteration, 0);
+    start_iteration = std::min(start_iteration, num_iteration_for_pred_);
     if (num_iteration > 0) {
-      num_iteration_for_pred_ = std::min(num_iteration, num_iteration_for_pred_);
+      num_iteration_for_pred_ = std::min(num_iteration, num_iteration_for_pred_ - start_iteration);
+    } else {
+      num_iteration_for_pred_ = num_iteration_for_pred_ - start_iteration;
     }
+    start_iteration_for_pred_ = start_iteration;
     if (is_pred_contrib) {
       #pragma omp parallel for schedule(static)
       for (int i = 0; i < static_cast<int>(models_.size()); ++i) {
@@ -489,6 +497,8 @@ class GBDT : public GBDTBase {
   data_size_t label_idx_;
   /*! \brief number of used model */
   int num_iteration_for_pred_;
+  /*! \brief Start iteration of used model */
+  int start_iteration_for_pred_;
   /*! \brief Shrinkage rate for one iteration */
   double shrinkage_rate_;
   /*! \brief Number of loaded initial models */
diff --git a/src/boosting/gbdt_prediction.cpp b/src/boosting/gbdt_prediction.cpp
index b4711f7c01a6..97db70de4c32 100644
--- a/src/boosting/gbdt_prediction.cpp
+++ b/src/boosting/gbdt_prediction.cpp
@@ -14,7 +14,8 @@ void GBDT::PredictRaw(const double* features, double* output, const PredictionEa
   int early_stop_round_counter = 0;
   // set zero
   std::memset(output, 0, sizeof(double) * num_tree_per_iteration_);
-  for (int i = 0; i < num_iteration_for_pred_; ++i) {
+  const int end_iteration_for_pred = start_iteration_for_pred_ + num_iteration_for_pred_;
+  for (int i = start_iteration_for_pred_; i < end_iteration_for_pred; ++i) {
     // predict all the trees for one iteration
     for (int k = 0; k < num_tree_per_iteration_; ++k) {
       output[k] += models_[i * num_tree_per_iteration_ + k]->Predict(features);
@@ -34,7 +35,8 @@ void GBDT::PredictRawByMap(const std::unordered_map<int, double>& features, doub
   int early_stop_round_counter = 0;
   // set zero
   std::memset(output, 0, sizeof(double) * num_tree_per_iteration_);
-  for (int i = 0; i < num_iteration_for_pred_; ++i) {
+  const int end_iteration_for_pred = start_iteration_for_pred_ + num_iteration_for_pred_;
+  for (int i = start_iteration_for_pred_; i < end_iteration_for_pred; ++i) {
     // predict all the trees for one iteration
     for (int k = 0; k < num_tree_per_iteration_; ++k) {
       output[k] += models_[i * num_tree_per_iteration_ + k]->PredictByMap(features);
@@ -75,16 +77,20 @@ void GBDT::PredictByMap(const std::unordered_map<int, double>& features, double*
 }

 void GBDT::PredictLeafIndex(const double* features, double* output) const {
-  int total_tree = num_iteration_for_pred_ * num_tree_per_iteration_;
-  for (int i = 0; i < total_tree; ++i) {
-    output[i] = models_[i]->PredictLeafIndex(features);
+  int start_tree = start_iteration_for_pred_ * num_tree_per_iteration_;
+  int num_trees = num_iteration_for_pred_ * num_tree_per_iteration_;
+  const auto* models_ptr = models_.data() + start_tree;
+  for (int i = 0; i < num_trees; ++i) {
+    output[i] = models_ptr[i]->PredictLeafIndex(features);
   }
 }

 void GBDT::PredictLeafIndexByMap(const std::unordered_map<int, double>& features, double* output) const {
-  int total_tree = num_iteration_for_pred_ * num_tree_per_iteration_;
-  for (int i = 0; i < total_tree; ++i) {
-    output[i] = models_[i]->PredictLeafIndexByMap(features);
+  int start_tree = start_iteration_for_pred_ * num_tree_per_iteration_;
+  int num_trees = num_iteration_for_pred_ * num_tree_per_iteration_;
+  const auto* models_ptr = models_.data() + start_tree;
+  for (int i = 0; i < num_trees; ++i) {
+    output[i] = models_ptr[i]->PredictLeafIndexByMap(features);
   }
 }
diff --git a/src/c_api.cpp b/src/c_api.cpp
index cc82697ac35b..61b3038e660b 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -62,7 +62,7 @@ class SingleRowPredictor {
   PredictFunction predict_function;
   int64_t num_pred_in_one_row;

-  SingleRowPredictor(int predict_type, Boosting* boosting, const Config& config, int iter) {
+  SingleRowPredictor(int predict_type, Boosting* boosting, const Config& config, int start_iter, int num_iter) {
     bool is_predict_leaf = false;
     bool is_raw_score = false;
     bool predict_contrib = false;
@@ -78,10 +78,10 @@ class SingleRowPredictor {
     early_stop_ = config.pred_early_stop;
     early_stop_freq_ = config.pred_early_stop_freq;
     early_stop_margin_ = config.pred_early_stop_margin;
-    iter_ = iter;
-    predictor_.reset(new Predictor(boosting, iter_, is_raw_score, is_predict_leaf, predict_contrib,
+    iter_ = num_iter;
+    predictor_.reset(new Predictor(boosting, start_iter, iter_, is_raw_score, is_predict_leaf, predict_contrib,
                                    early_stop_, early_stop_freq_, early_stop_margin_));
-    num_pred_in_one_row = boosting->NumPredictOneRow(iter_, is_predict_leaf, predict_contrib);
+    num_pred_in_one_row = boosting->NumPredictOneRow(start_iter, iter_, is_predict_leaf, predict_contrib);
     predict_function = predictor_->GetPredictFunction();
     num_total_model_ = boosting->NumberOfTotalModel();
   }
@@ -369,12 +369,12 @@ class Booster {
     boosting_->RollbackOneIter();
   }

-  void SetSingleRowPredictor(int num_iteration, int predict_type, const Config& config) {
+  void SetSingleRowPredictor(int start_iteration, int num_iteration, int predict_type, const Config& config) {
     UNIQUE_LOCK(mutex_)
     if (single_row_predictor_[predict_type].get() == nullptr ||
         !single_row_predictor_[predict_type]->IsPredictorEqual(config, num_iteration, boosting_.get())) {
       single_row_predictor_[predict_type].reset(new SingleRowPredictor(predict_type, boosting_.get(),
-                                                                       config, num_iteration));
+                                                                       config, start_iteration, num_iteration));
     }
   }
@@ -395,7 +395,7 @@ class Booster {
     *out_len = single_row_predictor->num_pred_in_one_row;
   }

-  Predictor CreatePredictor(int num_iteration, int predict_type, int ncol, const Config& config) const {
+  Predictor CreatePredictor(int start_iteration, int num_iteration, int predict_type, int ncol, const Config& config) const {
     if (!config.predict_disable_shape_check && ncol != boosting_->MaxFeatureIdx() + 1) {
       Log::Fatal("The number of features in data (%d) is not the same as it was in training data (%d).\n" \
                  "You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.", ncol, boosting_->MaxFeatureIdx() + 1);
@@ -413,17 +413,17 @@ class Booster {
       is_raw_score = false;
     }

-    Predictor predictor(boosting_.get(), num_iteration, is_raw_score, is_predict_leaf, predict_contrib,
+    Predictor predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib,
                         config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin);
     return predictor;
   }

-  void Predict(int num_iteration, int predict_type, int nrow, int ncol,
+  void Predict(int start_iteration, int num_iteration, int predict_type, int nrow, int ncol,
                std::function<std::vector<std::pair<int, double>>(int row_idx)> get_row_fun,
                const Config& config,
                double* out_result, int64_t* out_len) const {
     SHARED_LOCK(mutex_);
-    auto predictor = CreatePredictor(num_iteration, predict_type, ncol, config);
+    auto predictor = CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config);
     bool is_predict_leaf = false;
     bool predict_contrib = false;
     if (predict_type == C_API_PREDICT_LEAF_INDEX) {
@@ -431,7 +431,7 @@ class Booster {
     } else if (predict_type == C_API_PREDICT_CONTRIB) {
       predict_contrib = true;
     }
-    int64_t num_pred_in_one_row = boosting_->NumPredictOneRow(num_iteration, is_predict_leaf, predict_contrib);
+    int64_t num_pred_in_one_row = boosting_->NumPredictOneRow(start_iteration, num_iteration, is_predict_leaf, predict_contrib);
     auto pred_fun = predictor.GetPredictFunction();
     OMP_INIT_EX();
     #pragma omp parallel for schedule(static)
@@ -446,13 +446,13 @@ class Booster {
     *out_len = num_pred_in_one_row * nrow;
   }

-  void PredictSparse(int num_iteration, int predict_type, int64_t nrow, int ncol,
+  void PredictSparse(int start_iteration, int num_iteration, int predict_type, int64_t nrow, int ncol,
                      std::function<std::vector<std::pair<int, double>>(int64_t row_idx)> get_row_fun,
                      const Config& config, int64_t* out_elements_size,
                      std::vector<std::vector<std::unordered_map<int, double>>>* agg_ptr,
                      int32_t** out_indices, void** out_data, int data_type,
                      bool* is_data_float32_ptr, int num_matrices) const {
-    auto predictor = CreatePredictor(num_iteration, predict_type, ncol, config);
+    auto predictor = CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config);
     auto pred_sparse_fun = predictor.GetPredictSparseFunction();
     std::vector<std::vector<std::unordered_map<int, double>>>& agg = *agg_ptr;
     OMP_INIT_EX();
@@ -488,7 +488,7 @@ class Booster {
     *out_indices = new int32_t[elements_size];
   }

-  void PredictSparseCSR(int num_iteration, int predict_type, int64_t nrow, int ncol,
+  void PredictSparseCSR(int start_iteration, int num_iteration, int predict_type, int64_t nrow, int ncol,
                         std::function<std::vector<std::pair<int, double>>(int64_t row_idx)> get_row_fun,
                         const Config& config,
                         int64_t* out_len, void** out_indptr, int indptr_type,
@@ -511,7 +511,7 @@ class Booster {
     // aggregated per row feature contribution results
     std::vector<std::vector<std::unordered_map<int, double>>> agg(nrow);
     int64_t elements_size = 0;
-    PredictSparse(num_iteration, predict_type, nrow, ncol, get_row_fun, config, &elements_size, &agg,
+    PredictSparse(start_iteration, num_iteration, predict_type, nrow, ncol, get_row_fun, config, &elements_size, &agg,
                   out_indices, out_data, data_type, &is_data_float32, num_matrices);
     std::vector<int> row_sizes(num_matrices * nrow);
    std::vector<int64_t> row_matrix_offsets(num_matrices * nrow);
@@ -572,7 +572,7 @@ class Booster {
     out_len[1] = indptr_size;
   }

-  void PredictSparseCSC(int num_iteration, int predict_type, int64_t nrow, int ncol,
+  void PredictSparseCSC(int start_iteration, int num_iteration, int predict_type, int64_t nrow, int ncol,
                         std::function<std::vector<std::pair<int, double>>(int64_t row_idx)> get_row_fun,
                         const Config& config,
                         int64_t* out_len, void** out_col_ptr, int col_ptr_type,
@@ -580,7 +580,7 @@ class Booster {
     SHARED_LOCK(mutex_);
     // Get the number of trees per iteration (for multiclass scenario we output multiple sparse matrices)
     int num_matrices = boosting_->NumModelPerIteration();
-    auto predictor = CreatePredictor(num_iteration, predict_type, ncol, config);
+    auto predictor = CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config);
     auto pred_sparse_fun = predictor.GetPredictSparseFunction();
     bool is_col_ptr_int32 = false;
     bool is_data_float32 = false;
@@ -598,7 +598,7 @@ class Booster {
     // aggregated per row feature contribution results
     std::vector<std::vector<std::unordered_map<int, double>>> agg(nrow);
     int64_t elements_size = 0;
-    PredictSparse(num_iteration, predict_type, nrow, ncol, get_row_fun, config, &elements_size, &agg,
+    PredictSparse(start_iteration, num_iteration, predict_type, nrow, ncol, get_row_fun, config, &elements_size, &agg,
                   out_indices, out_data, data_type, &is_data_float32, num_matrices);
     // calculate number of elements per column to construct
     // the CSC matrix with random access
@@ -676,7 +676,7 @@ class Booster {
     out_len[1] = col_ptr_size;
   }

-  void Predict(int num_iteration, int predict_type, const char* data_filename,
+  void Predict(int start_iteration, int num_iteration, int predict_type, const char* data_filename,
                int data_has_header, const Config& config,
                const char* result_filename) const {
     SHARED_LOCK(mutex_)
@@ -692,7 +692,7 @@ class Booster {
     } else {
       is_raw_score = false;
     }
-    Predictor predictor(boosting_.get(), num_iteration, is_raw_score, is_predict_leaf, predict_contrib,
+    Predictor predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib,
                         config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin);
     bool bool_data_has_header = data_has_header > 0 ? true : false;
     predictor.Predict(data_filename, result_filename, bool_data_has_header, config.predict_disable_shape_check);
@@ -1728,6 +1728,7 @@ int LGBM_BoosterPredictForFile(BoosterHandle handle,
                                const char* data_filename,
                                int data_has_header,
                                int predict_type,
+                               int start_iteration,
                                int num_iteration,
                                const char* parameter,
                                const char* result_filename) {
@@ -1739,7 +1740,7 @@ int LGBM_BoosterPredictForFile(BoosterHandle handle,
     omp_set_num_threads(config.num_threads);
   }
   Booster* ref_booster = reinterpret_cast<Booster*>(handle);
-  ref_booster->Predict(num_iteration, predict_type, data_filename, data_has_header,
+  ref_booster->Predict(start_iteration, num_iteration, predict_type, data_filename, data_has_header,
                        config, result_filename);
   API_END();
 }
 
@@ -1747,11 +1748,12 @@ int LGBM_BoosterCalcNumPredict(BoosterHandle handle,
                                int num_row,
                                int predict_type,
+                               int start_iteration,
                                int num_iteration,
                                int64_t* out_len) {
   API_BEGIN();
   Booster* ref_booster = reinterpret_cast<Booster*>(handle);
-  *out_len = static_cast<int64_t>(num_row) * ref_booster->GetBoosting()->NumPredictOneRow(
+  *out_len = static_cast<int64_t>(num_row) * ref_booster->GetBoosting()->NumPredictOneRow(start_iteration,
       num_iteration, predict_type == C_API_PREDICT_LEAF_INDEX, predict_type == C_API_PREDICT_CONTRIB);
   API_END();
 }
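For callers going through the raw C API, the pattern is uniform: the new `start_iteration` argument is inserted immediately before `num_iteration` in every prediction entry point. Below is a minimal ctypes sketch against the updated `LGBM_BoosterCalcNumPredict` signature shown above; the library path and the `handle` value are placeholders, not part of this patch:

```python
import ctypes

LIB = ctypes.cdll.LoadLibrary("./lib_lightgbm.so")  # placeholder path

C_API_PREDICT_NORMAL = 0  # prediction-type constant from include/LightGBM/c_api.h

def calc_num_predict(handle, num_row, start_iteration, num_iteration):
    """Size the output buffer for a prediction over the given iteration window."""
    out_len = ctypes.c_int64(0)
    ret = LIB.LGBM_BoosterCalcNumPredict(
        handle,
        ctypes.c_int(num_row),
        ctypes.c_int(C_API_PREDICT_NORMAL),
        ctypes.c_int(start_iteration),  # new argument added by this patch
        ctypes.c_int(num_iteration),
        ctypes.byref(out_len))
    assert ret == 0  # a non-zero return code means the call failed
    return out_len.value
```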
@@ -1798,6 +1800,7 @@ int LGBM_BoosterPredictForCSR(BoosterHandle handle,
                               int64_t nelem,
                               int64_t num_col,
                               int predict_type,
+                              int start_iteration,
                               int num_iteration,
                               const char* parameter,
                               int64_t* out_len,
@@ -1817,7 +1820,7 @@ int LGBM_BoosterPredictForCSR(BoosterHandle handle,
   Booster* ref_booster = reinterpret_cast<Booster*>(handle);
   auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem);
   int nrow = static_cast<int>(nindptr - 1);
-  ref_booster->Predict(num_iteration, predict_type, nrow, static_cast<int>(num_col), get_row_fun,
+  ref_booster->Predict(start_iteration, num_iteration, predict_type, nrow, static_cast<int>(num_col), get_row_fun,
                        config, out_result, out_len);
   API_END();
 }
@@ -1832,6 +1835,7 @@ int LGBM_BoosterPredictSparseOutput(BoosterHandle handle,
                                     int64_t nelem,
                                     int64_t num_col_or_row,
                                     int predict_type,
+                                    int start_iteration,
                                     int num_iteration,
                                     const char* parameter,
                                     int matrix_type,
@@ -1855,7 +1859,7 @@ int LGBM_BoosterPredictSparseOutput(BoosterHandle handle,
     }
     auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem);
     int64_t nrow = nindptr - 1;
-    ref_booster->PredictSparseCSR(num_iteration, predict_type, nrow, static_cast<int>(num_col_or_row), get_row_fun,
+    ref_booster->PredictSparseCSR(start_iteration, num_iteration, predict_type, nrow, static_cast<int>(num_col_or_row), get_row_fun,
                                   config, out_len, out_indptr, indptr_type, out_indices, out_data, data_type);
   } else if (matrix_type == C_API_MATRIX_TYPE_CSC) {
     int num_threads = OMP_NUM_THREADS();
@@ -1879,7 +1883,7 @@ int LGBM_BoosterPredictSparseOutput(BoosterHandle handle,
       }
       return one_row;
     };
-    ref_booster->PredictSparseCSC(num_iteration, predict_type, num_col_or_row, ncol, get_row_fun, config,
+    ref_booster->PredictSparseCSC(start_iteration, num_iteration, predict_type, num_col_or_row, ncol, get_row_fun, config,
                                   out_len, out_indptr, indptr_type, out_indices, out_data, data_type);
   } else {
     Log::Fatal("Unknown matrix type in LGBM_BoosterPredictSparseOutput");
@@ -1917,6 +1921,7 @@ int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle,
                                        int64_t nelem,
                                        int64_t num_col,
                                        int predict_type,
+                                       int start_iteration,
                                        int num_iteration,
                                        const char* parameter,
                                        int64_t* out_len,
@@ -1935,13 +1940,14 @@ int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle,
   }
   Booster* ref_booster = reinterpret_cast<Booster*>(handle);
   auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem);
-  ref_booster->SetSingleRowPredictor(num_iteration, predict_type, config);
+  ref_booster->SetSingleRowPredictor(start_iteration, num_iteration, predict_type, config);
   ref_booster->PredictSingleRow(predict_type, static_cast<int32_t>(num_col), get_row_fun, config, out_result, out_len);
   API_END();
 }
 
 int LGBM_BoosterPredictForCSRSingleRowFastInit(BoosterHandle handle,
                                                const int predict_type,
+                                               const int start_iteration,
                                                const int num_iteration,
                                                const int data_type,
                                                const int64_t num_col,
@@ -1965,7 +1971,7 @@ int LGBM_BoosterPredictForCSRSingleRowFastInit(BoosterHandle handle,
     omp_set_num_threads(fastConfig_ptr->config.num_threads);
   }
 
-  fastConfig_ptr->booster->SetSingleRowPredictor(num_iteration, predict_type, fastConfig_ptr->config);
+  fastConfig_ptr->booster->SetSingleRowPredictor(start_iteration, num_iteration, predict_type, fastConfig_ptr->config);
 
   *out_fastConfig = fastConfig_ptr.release();
   API_END();
@@ -1999,6 +2005,7 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle,
                               int64_t nelem,
                               int64_t num_row,
                               int predict_type,
+                              int start_iteration,
                               int num_iteration,
                               const char* parameter,
                               int64_t* out_len,
@@ -2032,7 +2039,7 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle,
     }
     return one_row;
   };
-  ref_booster->Predict(num_iteration, predict_type, static_cast<int32_t>(num_row), ncol, get_row_fun, config,
+  ref_booster->Predict(start_iteration, num_iteration, predict_type, static_cast<int32_t>(num_row), ncol, get_row_fun, config,
                        out_result, out_len);
   API_END();
 }
@@ -2044,6 +2051,7 @@ int LGBM_BoosterPredictForMat(BoosterHandle handle,
                               int32_t ncol,
                               int is_row_major,
                               int predict_type,
+                              int start_iteration,
                               int num_iteration,
                               const char* parameter,
                               int64_t* out_len,
@@ -2057,7 +2065,7 @@ int LGBM_BoosterPredictForMat(BoosterHandle handle,
   }
   Booster* ref_booster = reinterpret_cast<Booster*>(handle);
   auto get_row_fun = RowPairFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major);
-  ref_booster->Predict(num_iteration, predict_type, nrow, ncol, get_row_fun,
+  ref_booster->Predict(start_iteration, num_iteration, predict_type, nrow, ncol, get_row_fun,
                        config, out_result, out_len);
   API_END();
 }
@@ -2068,6 +2076,7 @@ int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle,
                                        int32_t ncol,
                                        int is_row_major,
                                        int predict_type,
+                                       int start_iteration,
                                        int num_iteration,
                                        const char* parameter,
                                        int64_t* out_len,
@@ -2081,13 +2090,14 @@ int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle,
   }
   Booster* ref_booster = reinterpret_cast<Booster*>(handle);
   auto get_row_fun = RowPairFunctionFromDenseMatric(data, 1, ncol, data_type, is_row_major);
-  ref_booster->SetSingleRowPredictor(num_iteration, predict_type, config);
+  ref_booster->SetSingleRowPredictor(start_iteration, num_iteration, predict_type, config);
   ref_booster->PredictSingleRow(predict_type, ncol, get_row_fun, config, out_result, out_len);
   API_END();
 }
 
 int LGBM_BoosterPredictForMatSingleRowFastInit(BoosterHandle handle,
                                                const int predict_type,
+                                               const int start_iteration,
                                                const int num_iteration,
                                                const int data_type,
                                                const int32_t ncol,
@@ -2105,7 +2115,7 @@ int LGBM_BoosterPredictForMatSingleRowFastInit(BoosterHandle handle,
     omp_set_num_threads(fastConfig_ptr->config.num_threads);
   }
 
-  fastConfig_ptr->booster->SetSingleRowPredictor(num_iteration, predict_type, fastConfig_ptr->config);
+  fastConfig_ptr->booster->SetSingleRowPredictor(start_iteration, num_iteration, predict_type, fastConfig_ptr->config);
 
   *out_fastConfig = fastConfig_ptr.release();
   API_END();
@@ -2132,6 +2142,7 @@ int LGBM_BoosterPredictForMats(BoosterHandle handle,
                                int32_t nrow,
                                int32_t ncol,
                                int predict_type,
+                               int start_iteration,
                                int num_iteration,
                                const char* parameter,
                                int64_t* out_len,
@@ -2145,7 +2156,7 @@ int LGBM_BoosterPredictForMats(BoosterHandle handle,
   }
   Booster* ref_booster = reinterpret_cast<Booster*>(handle);
   auto get_row_fun = RowPairFunctionFromDenseRows(data, ncol, data_type);
-  ref_booster->Predict(num_iteration, predict_type, nrow, ncol, get_row_fun, config, out_result, out_len);
+  ref_booster->Predict(start_iteration, num_iteration, predict_type, nrow, ncol, get_row_fun, config, out_result, out_len);
   API_END();
 }
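The net effect of threading `start_iteration` through `CreatePredictor` and `SetSingleRowPredictor` is that a prediction call scores only the trees in the window `[start_iteration, start_iteration + num_iteration)`. Raw scores over disjoint windows therefore sum to the full-model score, which is exactly what the new engine test further down asserts. A hedged sketch of that invariant, assuming a `booster` trained without early stopping and a feature matrix `X` (both placeholders):

```python
import numpy as np

# Full-model raw score, using all trained iterations.
full = booster.predict(X, raw_score=True)

# Each window scores trees [s, s + 10); disjoint windows covering all
# iterations must add up to the full raw score.
window_sum = sum(
    booster.predict(X, start_iteration=s, num_iteration=10, raw_score=True)
    for s in range(0, booster.current_iteration(), 10)
)
np.testing.assert_allclose(full, window_sum)
```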
diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp
index ed593a48e4aa..b14af67fd30e 100644
--- a/src/io/config_auto.cpp
+++ b/src/io/config_auto.cpp
@@ -256,6 +256,7 @@ const std::unordered_set<std::string>& Config::parameter_set() {
   "categorical_feature",
   "forcedbins_filename",
   "save_binary",
+  "start_iteration_predict",
   "num_iteration_predict",
   "predict_raw_score",
   "predict_leaf_index",
@@ -513,6 +514,8 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::string>& params) {
 
+  GetInt(params, "start_iteration_predict", &start_iteration_predict);
+
   GetInt(params, "num_iteration_predict", &num_iteration_predict);
diff --git a/swig/lightgbmlib.i b/swig/lightgbmlib.i
--- a/swig/lightgbmlib.i
+++ b/swig/lightgbmlib.i
@@ ... @@ int LGBM_BoosterPredictForMatSingle(JNIEnv *jenv,
                                     int predict_type,
+                                    int start_iteration,
                                     int num_iteration,
                                     const char* parameter,
                                     int64_t* out_len,
                                     double* out_result) {
     double* data0 = (double*)jenv->GetPrimitiveArrayCritical(data, 0);
 
-    int ret = LGBM_BoosterPredictForMatSingleRow(handle, data0, data_type, ncol, is_row_major, predict_type,
+    int ret = LGBM_BoosterPredictForMatSingleRow(handle, data0, data_type, ncol, is_row_major, predict_type, start_iteration,
                                                  num_iteration, parameter, out_len, out_result);
 
     jenv->ReleasePrimitiveArrayCritical(data, data0, JNI_ABORT);
@@ -130,6 +131,7 @@ int LGBM_BoosterPredictForCSRSingle(JNIEnv *jenv,
                                     int64_t nelem,
                                     int64_t num_col,
                                     int predict_type,
+                                    int start_iteration,
                                     int num_iteration,
                                     const char* parameter,
                                     int64_t* out_len,
@@ -147,7 +149,7 @@ int LGBM_BoosterPredictForCSRSingle(JNIEnv *jenv,
     int32_t ind[2] = { 0, numNonZeros };
 
     int ret = LGBM_BoosterPredictForCSRSingleRow(handle, ind, indptr_type, indices0, values0, data_type, 2,
-                                                 nelem, num_col, predict_type, num_iteration, parameter, out_len, out_result);
+                                                 nelem, num_col, predict_type, start_iteration, num_iteration, parameter, out_len, out_result);
 
     jenv->ReleasePrimitiveArrayCritical(values, values0, JNI_ABORT);
     jenv->ReleasePrimitiveArrayCritical(indices, indices0, JNI_ABORT);
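Because `start_iteration_predict` is now a registered config key parsed by `GetMembersFromString()`, the iteration window can also be chosen without touching any language wrapper, for example from a CLI prediction config file. A sketch under the usual CLI conventions; the model and data file names are placeholders:

```
task = predict
data = binary.test
input_model = LightGBM_model.txt
start_iteration_predict = 10
num_iteration_predict = 25
```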
diff --git a/tests/c_api_test/test_.py b/tests/c_api_test/test_.py
index 8efa8d0ba88c..e124b6594193 100644
--- a/tests/c_api_test/test_.py
+++ b/tests/c_api_test/test_.py
@@ -263,6 +263,7 @@ def test_booster():
         mat.shape[1],
         1,
         1,
+        0,
         25,
         c_str(''),
         ctypes.byref(num_preb),
@@ -273,6 +274,17 @@ def test_booster():
                               '../../examples/binary_classification/binary.test')),
         0,
         0,
+        0,
         25,
         c_str(''),
         c_str('preb.txt'))
+    LIB.LGBM_BoosterPredictForFile(
+        booster2,
+        c_str(os.path.join(os.path.dirname(os.path.realpath(__file__)),
+                           '../../examples/binary_classification/binary.test')),
+        0,
+        0,
+        10,
+        25,
+        c_str(''),
+        c_str('preb.txt'))
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 2cfdf67fe94c..3c24f39fbcce 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -2315,3 +2315,90 @@ def test_interaction_constraints(self):
         est = lgb.train(dict(params, interaction_constraints=[[0] + list(range(2, num_features)),
                                                               [1] + list(range(2, num_features))]),
                         train_data, num_boost_round=10)
+
+    def test_predict_with_start_iteration(self):
+        def inner_test(X, y, params, early_stopping_rounds):
+            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
+            train_data = lgb.Dataset(X_train, label=y_train)
+            valid_data = lgb.Dataset(X_test, label=y_test)
+            booster = lgb.train(params, train_data, num_boost_round=100,
+                                early_stopping_rounds=early_stopping_rounds, valid_sets=[valid_data])
+
+            # test that predicting once with all iterations equals the sum of predictions
+            # over consecutive (start_iteration, num_iteration) windows
+            all_pred = booster.predict(X, raw_score=True)
+            all_pred_contrib = booster.predict(X, pred_contrib=True)
+            steps = [10, 12]
+            for step in steps:
+                pred = np.zeros_like(all_pred)
+                pred_contrib = np.zeros_like(all_pred_contrib)
+                for start_iter in range(0, 100, step):
+                    pred += booster.predict(X, num_iteration=step, start_iteration=start_iter, raw_score=True)
+                    pred_contrib += booster.predict(X, num_iteration=step, start_iteration=start_iter, pred_contrib=True)
+                np.testing.assert_allclose(all_pred, pred)
+                np.testing.assert_allclose(all_pred_contrib, pred_contrib)
+
+            # test the case where start_iteration <= 0 and num_iteration is None
+            pred1 = booster.predict(X, start_iteration=-1)
+            pred2 = booster.predict(X, num_iteration=booster.best_iteration)
+            pred3 = booster.predict(X, num_iteration=booster.best_iteration, start_iteration=0)
+            np.testing.assert_allclose(pred1, pred2)
+            np.testing.assert_allclose(pred1, pred3)
+
+            # test the case where start_iteration > 0 and num_iteration <= 0
+            pred4 = booster.predict(X, start_iteration=10, num_iteration=-1)
+            pred5 = booster.predict(X, start_iteration=10, num_iteration=90)
+            pred6 = booster.predict(X, start_iteration=10, num_iteration=0)
+            np.testing.assert_allclose(pred4, pred5)
+            np.testing.assert_allclose(pred4, pred6)
+
+            # the same, with pred_leaf=True
+            pred4 = booster.predict(X, start_iteration=10, num_iteration=-1, pred_leaf=True)
+            pred5 = booster.predict(X, start_iteration=10, num_iteration=90, pred_leaf=True)
+            pred6 = booster.predict(X, start_iteration=10, num_iteration=0, pred_leaf=True)
+            np.testing.assert_allclose(pred4, pred5)
+            np.testing.assert_allclose(pred4, pred6)
+
+            # the same, with pred_contrib=True
+            pred4 = booster.predict(X, start_iteration=10, num_iteration=-1, pred_contrib=True)
+            pred5 = booster.predict(X, start_iteration=10, num_iteration=90, pred_contrib=True)
+            pred6 = booster.predict(X, start_iteration=10, num_iteration=0, pred_contrib=True)
+            np.testing.assert_allclose(pred4, pred5)
+            np.testing.assert_allclose(pred4, pred6)
+
+        # test for regression
+        X, y = load_boston(return_X_y=True)
+        params = {
+            'objective': 'regression',
+            'verbose': -1,
+            'metric': 'l2',
+            'learning_rate': 0.5
+        }
+        # test both with and without early stopping
+        inner_test(X, y, params, early_stopping_rounds=1)
+        inner_test(X, y, params, early_stopping_rounds=10)
+        inner_test(X, y, params, early_stopping_rounds=None)
+
+        # test for multi-class
+        X, y = load_iris(return_X_y=True)
+        params = {
+            'objective': 'multiclass',
+            'num_class': 3,
+            'verbose': -1,
+            'metric': 'multi_error'
+        }
+        # test both with and without early stopping
+        inner_test(X, y, params, early_stopping_rounds=1)
+        inner_test(X, y, params, early_stopping_rounds=10)
+        inner_test(X, y, params, early_stopping_rounds=None)
+
+        # test for binary
+        X, y = load_breast_cancer(return_X_y=True)
+        params = {
+            'objective': 'binary',
+            'verbose': -1,
+            'metric': 'auc'
+        }
+        # test both with and without early stopping
+        inner_test(X, y, params, early_stopping_rounds=1)
+        inner_test(X, y, params, early_stopping_rounds=10)
+        inner_test(X, y, params, early_stopping_rounds=None)
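The boundary cases exercised above follow a small set of resolution rules for the effective window. Restated as standalone Python for clarity, this is an illustration of the tested behavior, not code from this patch:

```python
def resolve_window(start_iteration, num_iteration, best_iteration, total_iteration):
    """Effective (start, count) window implied by the tests above."""
    # None or a non-positive start means: begin at the first iteration.
    start = 0 if start_iteration is None or start_iteration <= 0 else start_iteration
    if num_iteration is None:
        # The best iteration is only honored when predicting from the beginning.
        count = best_iteration if (start == 0 and best_iteration > 0) else total_iteration - start
    elif num_iteration <= 0:
        # No limit: use all iterations from start onward.
        count = total_iteration - start
    else:
        count = min(num_iteration, total_iteration - start)
    return start, count  # trees used: iterations [start, start + count)
```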
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index 51fdeedd09cc..47d0697b2e68 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -607,6 +607,41 @@ def test_predict(self):
                           np.testing.assert_allclose,
                           res_engine,
                           res_sklearn_params)
 
+        # Tests start_iteration
+        # Tests same probabilities, starting from iteration 10
+        res_engine = gbm.predict(X_test, start_iteration=10)
+        res_sklearn = clf.predict_proba(X_test, start_iteration=10)
+        np.testing.assert_allclose(res_engine, res_sklearn)
+
+        # Tests same class predictions, starting from iteration 10
+        res_engine = np.argmax(gbm.predict(X_test, start_iteration=10), axis=1)
+        res_sklearn = clf.predict(X_test, start_iteration=10)
+        np.testing.assert_equal(res_engine, res_sklearn)
+
+        # Tests same raw scores, starting from iteration 10
+        res_engine = gbm.predict(X_test, raw_score=True, start_iteration=10)
+        res_sklearn = clf.predict(X_test, raw_score=True, start_iteration=10)
+        np.testing.assert_allclose(res_engine, res_sklearn)
+
+        # Tests same leaf indices, starting from iteration 10
+        res_engine = gbm.predict(X_test, pred_leaf=True, start_iteration=10)
+        res_sklearn = clf.predict(X_test, pred_leaf=True, start_iteration=10)
+        np.testing.assert_equal(res_engine, res_sklearn)
+
+        # Tests same feature contributions, starting from iteration 10
+        res_engine = gbm.predict(X_test, pred_contrib=True, start_iteration=10)
+        res_sklearn = clf.predict(X_test, pred_contrib=True, start_iteration=10)
+        np.testing.assert_allclose(res_engine, res_sklearn)
+
+        # Tests that other prediction parameters still take effect, starting from iteration 10
+        res_engine = gbm.predict(X_test, start_iteration=10)
+        res_sklearn_params = clf.predict_proba(X_test,
+                                               pred_early_stop=True,
+                                               pred_early_stop_margin=1.0,
+                                               start_iteration=10)
+        self.assertRaises(AssertionError,
+                          np.testing.assert_allclose,
+                          res_engine,
+                          res_sklearn_params)
+
     def test_evaluate_train_set(self):
         X, y = load_boston(return_X_y=True)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
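Taken together, the sklearn-facing behavior this test locks in is simple: the wrapper forwards `start_iteration` unchanged to the underlying booster. A short usage sketch of that guarantee; the dataset and estimator settings here are illustrative placeholders:

```python
import numpy as np
import lightgbm as lgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

clf = lgb.LGBMClassifier(n_estimators=50).fit(X_train, y_train)

# The wrapper forwards start_iteration to Booster.predict, so both
# interfaces score with trees 10..49 here and must agree.
proba_wrapper = clf.predict_proba(X_test, start_iteration=10)
proba_booster = clf.booster_.predict(X_test, start_iteration=10)
np.testing.assert_allclose(proba_wrapper, proba_booster)
```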