Init estimation for regression. (#8272)
trivialfis authored Jan 10, 2023
1 parent 1b58d81 commit badeff1
Showing 29 changed files with 466 additions and 132 deletions.
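What this commit does, as exercised by the new tests further down (get_basescore, boost_from_average), is make XGBoost estimate the initial prediction (the base_score intercept) from the training labels for regression instead of leaving it at the fixed default; several existing tests therefore pin base_score = 0.5 explicitly to keep their expected output stable. A minimal sketch of how the estimated value can be inspected after fitting, assuming a build that includes this change and scikit-learn installed:

import json

import xgboost as xgb
from sklearn.datasets import make_regression

# Illustrative data; with init estimation the intercept (base_score) is
# derived from the training labels rather than a fixed constant.
X, y = make_regression(n_samples=1024, n_features=8, random_state=0)
reg = xgb.XGBRegressor(n_estimators=1, max_depth=1, tree_method="hist")
reg.fit(X, y)

# The estimated value is stored in the booster configuration.
config = json.loads(reg.get_booster().save_config())
print(config["learner"]["learner_model_param"]["base_score"])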
57 changes: 57 additions & 0 deletions .github/workflows/python_tests.yml
@@ -213,3 +213,60 @@ jobs:
        shell: bash -l {0}
        run: |
          pytest -s -v -rxXs --durations=0 ./tests/python

  python-tests-on-ubuntu:
    name: Test XGBoost Python package on ${{ matrix.config.os }}
    runs-on: ${{ matrix.config.os }}
    timeout-minutes: 90
    strategy:
      matrix:
        config:
          - {os: ubuntu-latest, python-version: "3.8"}

    steps:
      - uses: actions/checkout@v2
        with:
          submodules: 'true'

      - uses: mamba-org/provision-with-micromamba@f347426e5745fe3dfc13ec5baf20496990d0281f # v14
        with:
          cache-downloads: true
          cache-env: true
          environment-name: linux_cpu_test
          environment-file: tests/ci_build/conda_env/linux_cpu_test.yml

      - name: Display Conda env
        shell: bash -l {0}
        run: |
          conda info
          conda list
      - name: Build XGBoost on Ubuntu
        shell: bash -l {0}
        run: |
          mkdir build
          cd build
          cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
          ninja
      - name: Install Python package
        shell: bash -l {0}
        run: |
          cd python-package
          python --version
          python setup.py install
      - name: Test Python package
        shell: bash -l {0}
        run: |
          pytest -s -v -rxXs --durations=0 ./tests/python
      - name: Test Dask Interface
        shell: bash -l {0}
        run: |
          pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask
      - name: Test PySpark Interface
        shell: bash -l {0}
        run: |
          pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_spark
2 changes: 1 addition & 1 deletion R-package/tests/testthat/test_callbacks.R
@@ -320,7 +320,7 @@ test_that("prediction in early-stopping xgb.cv works", {
expect_output(
cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.1, nrounds = 20,
early_stopping_rounds = 5, maximize = FALSE, stratified = FALSE,
prediction = TRUE)
prediction = TRUE, base_score = 0.5)
, "Stopping. Best iteration")

expect_false(is.null(cv$best_iteration))
9 changes: 6 additions & 3 deletions R-package/tests/testthat/test_helpers.R
@@ -27,11 +27,13 @@ if (isTRUE(VCD_AVAILABLE)) {
# binary
bst.Tree <- xgboost(data = sparse_matrix, label = label, max_depth = 9,
eta = 1, nthread = 2, nrounds = nrounds, verbose = 0,
objective = "binary:logistic", booster = "gbtree")
objective = "binary:logistic", booster = "gbtree",
base_score = 0.5)

bst.GLM <- xgboost(data = sparse_matrix, label = label,
eta = 1, nthread = 1, nrounds = nrounds, verbose = 0,
objective = "binary:logistic", booster = "gblinear")
objective = "binary:logistic", booster = "gblinear",
base_score = 0.5)

feature.names <- colnames(sparse_matrix)
}
@@ -360,7 +362,8 @@ test_that("xgb.importance works with and without feature names", {
m <- xgboost::xgboost(
data = as.matrix(data.frame(x = c(0, 1))),
label = c(1, 2),
nrounds = 1
nrounds = 1,
base_score = 0.5
)
df <- xgb.model.dt.tree(model = m)
expect_equal(df$Feature, "Leaf")
32 changes: 19 additions & 13 deletions demo/guide-python/feature_weights.py
@@ -1,9 +1,9 @@
'''
"""
Demo for using feature weight to change column sampling
=======================================================
.. versionadded:: 1.3.0
'''
"""

import argparse

@@ -13,10 +13,10 @@
import xgboost


def main(args):
def main(args: argparse.Namespace) -> None:
rng = np.random.RandomState(1994)

kRows = 1000
kRows = 4196
kCols = 10

X = rng.randn(kRows, kCols)
@@ -28,26 +28,32 @@ def main(args):
dtrain = xgboost.DMatrix(X, y)
dtrain.set_info(feature_weights=fw)

bst = xgboost.train({'tree_method': 'hist',
'colsample_bynode': 0.2},
dtrain, num_boost_round=10,
evals=[(dtrain, 'd')])
# Perform column sampling for each node split evaluation, the sampling process is
# weighted by feature weights.
bst = xgboost.train(
{"tree_method": "hist", "colsample_bynode": 0.2},
dtrain,
num_boost_round=10,
evals=[(dtrain, "d")],
)
feature_map = bst.get_fscore()

# feature zero has 0 weight
assert feature_map.get('f0', None) is None
assert max(feature_map.values()) == feature_map.get('f9')
assert feature_map.get("f0", None) is None
assert max(feature_map.values()) == feature_map.get("f9")

if args.plot:
xgboost.plot_importance(bst)
plt.show()


if __name__ == '__main__':
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
'--plot',
"--plot",
type=int,
default=1,
help='Set to 0 to disable plotting the evaluation history.')
help="Set to 0 to disable plotting the evaluation history.",
)
args = parser.parse_args()
main(args)
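The demo above drives feature-weighted column sampling through the DMatrix API. Roughly the same idea can be expressed through the sklearn wrapper; the feature_weights argument of fit() is assumed to be available in this version, and the data and weights below are illustrative:

import numpy as np
import xgboost

rng = np.random.RandomState(1994)
X = rng.randn(1024, 10)
y = X.sum(axis=1)
# Weight feature i by i, so feature 0 is never sampled.
fw = np.arange(X.shape[1]).astype(np.float32)

reg = xgboost.XGBRegressor(
    tree_method="hist", colsample_bynode=0.2, n_estimators=10
)
# feature_weights in fit() is assumed to exist in the installed version.
reg.fit(X, y, feature_weights=fw)
assert reg.get_booster().get_fscore().get("f0") is None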
13 changes: 9 additions & 4 deletions demo/guide-python/sklearn_parallel.py
@@ -12,10 +12,15 @@
if __name__ == "__main__":
print("Parallel Parameter optimization")
X, y = fetch_california_housing(return_X_y=True)
xgb_model = xgb.XGBRegressor(n_jobs=multiprocessing.cpu_count() // 2)
clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
'n_estimators': [50, 100, 200]}, verbose=1,
n_jobs=2)
xgb_model = xgb.XGBRegressor(
n_jobs=multiprocessing.cpu_count() // 2, tree_method="hist"
)
clf = GridSearchCV(
xgb_model,
{"max_depth": [2, 4, 6], "n_estimators": [50, 100, 200]},
verbose=1,
n_jobs=2,
)
clf.fit(X, y)
print(clf.best_score_)
print(clf.best_params_)
@@ -261,10 +261,10 @@ class XGBoostClassifierSuite extends FunSuite with PerTest with TmpFolderPerSuit
"eta" -> "1",
"max_depth" -> "6",
"silent" -> "1",
"base_score" -> 0.5,
"objective" -> "binary:logistic",
"tree_method" -> treeMethod,
"max_bin" -> 16)

val model1 = ScalaXGBoost.train(trainingDM, paramMap, round)
val prediction1 = model1.predict(testDM)

@@ -453,5 +453,4 @@ class XGBoostClassifierSuite extends FunSuite with PerTest with TmpFolderPerSuit
assert(!compareTwoFiles(new File(modelJsonPath, "data/XGBoostClassificationModel").getPath,
nativeUbjModelPath))
}

}
4 changes: 2 additions & 2 deletions python-package/xgboost/sklearn.py
@@ -1078,7 +1078,7 @@ def predict(
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
) -> np.ndarray:
) -> ArrayLike:
"""Predict with `X`. If the model is trained with early stopping, then `best_iteration`
is used automatically. For tree models, when data is on GPU, like cupy array or
cuDF dataframe and `predictor` is not specified, the prediction is run on GPU
@@ -1528,7 +1528,7 @@ def predict(
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
) -> np.ndarray:
) -> ArrayLike:
with config_context(verbosity=self.verbosity):
class_probs = super().predict(
X=X,
54 changes: 54 additions & 0 deletions python-package/xgboost/testing/dask.py
@@ -0,0 +1,54 @@
"""Tests for dask shared by different test modules."""
import numpy as np
from dask import array as da
from distributed import Client
from xgboost.testing.updater import get_basescore

import xgboost as xgb


def check_init_estimation_clf(tree_method: str, client: Client) -> None:
"""Test init estimation for classsifier."""
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=4096 * 2, n_features=32, random_state=1994)
clf = xgb.XGBClassifier(n_estimators=1, max_depth=1, tree_method=tree_method)
clf.fit(X, y)
base_score = get_basescore(clf)

dx = da.from_array(X).rechunk(chunks=(32, None))
dy = da.from_array(y).rechunk(chunks=(32,))
dclf = xgb.dask.DaskXGBClassifier(
n_estimators=1, max_depth=1, tree_method=tree_method
)
dclf.client = client
dclf.fit(dx, dy)
dbase_score = get_basescore(dclf)
np.testing.assert_allclose(base_score, dbase_score)


def check_init_estimation_reg(tree_method: str, client: Client) -> None:
"""Test init estimation for regressor."""
from sklearn.datasets import make_regression

# pylint: disable=unbalanced-tuple-unpacking
X, y = make_regression(n_samples=4096 * 2, n_features=32, random_state=1994)
reg = xgb.XGBRegressor(n_estimators=1, max_depth=1, tree_method=tree_method)
reg.fit(X, y)
base_score = get_basescore(reg)

dx = da.from_array(X).rechunk(chunks=(32, None))
dy = da.from_array(y).rechunk(chunks=(32,))
dreg = xgb.dask.DaskXGBRegressor(
n_estimators=1, max_depth=1, tree_method=tree_method
)
dreg.client = client
dreg.fit(dx, dy)
dbase_score = get_basescore(dreg)
np.testing.assert_allclose(base_score, dbase_score)


def check_init_estimation(tree_method: str, client: Client) -> None:
"""Test init estimation."""
check_init_estimation_reg(tree_method, client)
check_init_estimation_clf(tree_method, client)
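A sketch of how these shared checks might be driven from a test, assuming dask.distributed is available; the LocalCluster settings here are illustrative rather than what the test suite actually uses:

from distributed import Client, LocalCluster

from xgboost.testing.dask import check_init_estimation

# Run the regressor and classifier checks against a small local cluster.
with LocalCluster(n_workers=2, threads_per_worker=1) as cluster:
    with Client(cluster) as client:
        check_init_estimation("hist", client)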
70 changes: 70 additions & 0 deletions python-package/xgboost/testing/updater.py
@@ -0,0 +1,70 @@
"""Tests for updaters."""
import json

import numpy as np

import xgboost as xgb


def get_basescore(model: xgb.XGBModel) -> float:
"""Get base score from an XGBoost sklearn estimator."""
base_score = float(
json.loads(model.get_booster().save_config())["learner"]["learner_model_param"][
"base_score"
]
)
return base_score


def check_init_estimation(tree_method: str) -> None:
"""Test for init estimation."""
from sklearn.datasets import (
make_classification,
make_multilabel_classification,
make_regression,
)

def run_reg(X: np.ndarray, y: np.ndarray) -> None: # pylint: disable=invalid-name
reg = xgb.XGBRegressor(tree_method=tree_method, max_depth=1, n_estimators=1)
reg.fit(X, y, eval_set=[(X, y)])
base_score_0 = get_basescore(reg)
score_0 = reg.evals_result()["validation_0"]["rmse"][0]

reg = xgb.XGBRegressor(
tree_method=tree_method, max_depth=1, n_estimators=1, boost_from_average=0
)
reg.fit(X, y, eval_set=[(X, y)])
base_score_1 = get_basescore(reg)
score_1 = reg.evals_result()["validation_0"]["rmse"][0]
assert not np.isclose(base_score_0, base_score_1)
assert score_0 < score_1 # should be better

# pylint: disable=unbalanced-tuple-unpacking
X, y = make_regression(n_samples=4096, random_state=17)
run_reg(X, y)
# pylint: disable=unbalanced-tuple-unpacking
X, y = make_regression(n_samples=4096, n_targets=3, random_state=17)
run_reg(X, y)

def run_clf(X: np.ndarray, y: np.ndarray) -> None: # pylint: disable=invalid-name
clf = xgb.XGBClassifier(tree_method=tree_method, max_depth=1, n_estimators=1)
clf.fit(X, y, eval_set=[(X, y)])
base_score_0 = get_basescore(clf)
score_0 = clf.evals_result()["validation_0"]["logloss"][0]

clf = xgb.XGBClassifier(
tree_method=tree_method, max_depth=1, n_estimators=1, boost_from_average=0
)
clf.fit(X, y, eval_set=[(X, y)])
base_score_1 = get_basescore(clf)
score_1 = clf.evals_result()["validation_0"]["logloss"][0]
assert not np.isclose(base_score_0, base_score_1)
assert score_0 < score_1 # should be better

# pylint: disable=unbalanced-tuple-unpacking
X, y = make_classification(n_samples=4096, random_state=17)
run_clf(X, y)
X, y = make_multilabel_classification(
n_samples=4096, n_labels=3, n_classes=5, random_state=17
)
run_clf(X, y)
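A short sketch of how these helpers might be used from a test module: check_init_estimation compares the first-round metric with and without boost_from_average, and get_basescore works on any fitted sklearn-style estimator. The example data below is illustrative.

import xgboost as xgb
from sklearn.datasets import make_regression

from xgboost.testing.updater import check_init_estimation, get_basescore

# Run the shared check for one tree method, as a pytest module might do.
check_init_estimation("hist")

# Inspect the estimated intercept of a single fitted estimator.
X, y = make_regression(n_samples=512, random_state=0)
reg = xgb.XGBRegressor(n_estimators=1, max_depth=1).fit(X, y)
print(get_basescore(reg))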
2 changes: 1 addition & 1 deletion src/collective/rabit_communicator.h
@@ -119,7 +119,7 @@ class RabitCommunicator : public Communicator {
}

template <typename DType, std::enable_if_t<std::is_floating_point<DType>::value> * = nullptr>
void DoBitwiseAllReduce(void *send_receive_buffer, std::size_t count, Operation op) {
void DoBitwiseAllReduce(void *, std::size_t, Operation) {
LOG(FATAL) << "Floating point types do not support bitwise operations.";
}

2 changes: 1 addition & 1 deletion src/data/data.cc
@@ -684,7 +684,7 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
}
}

void MetaInfo::Validate(int32_t device) const {
void MetaInfo::Validate(std::int32_t device) const {
if (group_ptr_.size() != 0 && weights_.Size() != 0) {
CHECK_EQ(group_ptr_.size(), weights_.Size() + 1)
<< "Size of weights must equal to number of groups when ranking "
2 changes: 1 addition & 1 deletion src/data/gradient_index.h
@@ -15,7 +15,7 @@
#include "../common/hist_util.h"
#include "../common/numeric.h"
#include "../common/threading_utils.h"
#include "../common/transform_iterator.h" // MakeIndexTransformIter
#include "../common/transform_iterator.h" // common::MakeIndexTransformIter
#include "adapter.h"
#include "proxy_dmatrix.h"
#include "xgboost/base.h"