Init estimation for regression. (#8272)
trivialfis authored Jan 10, 2023
1 parent 1b58d81 commit badeff1
Showing 29 changed files with 466 additions and 132 deletions.
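What this commit does, as exercised by the new tests further down (get_basescore, boost_from_average), is make XGBoost estimate the initial prediction (the base_score intercept) from the training labels for regression instead of leaving it at the fixed default; several existing tests therefore pin base_score = 0.5 explicitly to keep their expected output stable. A minimal sketch of how the estimated value can be inspected after fitting, assuming a build that includes this change and scikit-learn installed:

import json

import xgboost as xgb
from sklearn.datasets import make_regression

# Illustrative data; with init estimation the intercept (base_score) is
# derived from the training labels rather than a fixed constant.
X, y = make_regression(n_samples=1024, n_features=8, random_state=0)
reg = xgb.XGBRegressor(n_estimators=1, max_depth=1, tree_method="hist")
reg.fit(X, y)

# The estimated value is stored in the booster configuration.
config = json.loads(reg.get_booster().save_config())
print(config["learner"]["learner_model_param"]["base_score"])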
57 changes: 57 additions & 0 deletions .github/workflows/python_tests.yml
@@ -213,3 +213,60 @@ jobs:
        shell: bash -l {0}
        run: |
          pytest -s -v -rxXs --durations=0 ./tests/python

  python-tests-on-ubuntu:
    name: Test XGBoost Python package on ${{ matrix.config.os }}
    runs-on: ${{ matrix.config.os }}
    timeout-minutes: 90
    strategy:
      matrix:
        config:
          - {os: ubuntu-latest, python-version: "3.8"}

    steps:
      - uses: actions/checkout@v2
        with:
          submodules: 'true'

      - uses: mamba-org/provision-with-micromamba@f347426e5745fe3dfc13ec5baf20496990d0281f # v14
        with:
          cache-downloads: true
          cache-env: true
          environment-name: linux_cpu_test
          environment-file: tests/ci_build/conda_env/linux_cpu_test.yml

      - name: Display Conda env
        shell: bash -l {0}
        run: |
          conda info
          conda list
      - name: Build XGBoost on Ubuntu
        shell: bash -l {0}
        run: |
          mkdir build
          cd build
          cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
          ninja
      - name: Install Python package
        shell: bash -l {0}
        run: |
          cd python-package
          python --version
          python setup.py install
      - name: Test Python package
        shell: bash -l {0}
        run: |
          pytest -s -v -rxXs --durations=0 ./tests/python
      - name: Test Dask Interface
        shell: bash -l {0}
        run: |
          pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask
      - name: Test PySpark Interface
        shell: bash -l {0}
        run: |
          pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_spark
2 changes: 1 addition & 1 deletion R-package/tests/testthat/test_callbacks.R
@@ -320,7 +320,7 @@ test_that("prediction in early-stopping xgb.cv works", {
expect_output(
cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.1, nrounds = 20,
early_stopping_rounds = 5, maximize = FALSE, stratified = FALSE,
prediction = TRUE)
prediction = TRUE, base_score = 0.5)
, "Stopping. Best iteration")

expect_false(is.null(cv$best_iteration))
9 changes: 6 additions & 3 deletions R-package/tests/testthat/test_helpers.R
@@ -27,11 +27,13 @@ if (isTRUE(VCD_AVAILABLE)) {
# binary
bst.Tree <- xgboost(data = sparse_matrix, label = label, max_depth = 9,
eta = 1, nthread = 2, nrounds = nrounds, verbose = 0,
objective = "binary:logistic", booster = "gbtree")
objective = "binary:logistic", booster = "gbtree",
base_score = 0.5)

bst.GLM <- xgboost(data = sparse_matrix, label = label,
eta = 1, nthread = 1, nrounds = nrounds, verbose = 0,
objective = "binary:logistic", booster = "gblinear")
objective = "binary:logistic", booster = "gblinear",
base_score = 0.5)

feature.names <- colnames(sparse_matrix)
}
@@ -360,7 +362,8 @@ test_that("xgb.importance works with and without feature names", {
m <- xgboost::xgboost(
data = as.matrix(data.frame(x = c(0, 1))),
label = c(1, 2),
nrounds = 1
nrounds = 1,
base_score = 0.5
)
df <- xgb.model.dt.tree(model = m)
expect_equal(df$Feature, "Leaf")
32 changes: 19 additions & 13 deletions demo/guide-python/feature_weights.py
@@ -1,9 +1,9 @@
'''
"""
Demo for using feature weight to change column sampling
=======================================================
.. versionadded:: 1.3.0
'''
"""

import argparse

@@ -13,10 +13,10 @@
import xgboost


def main(args):
def main(args: argparse.Namespace) -> None:
rng = np.random.RandomState(1994)

kRows = 1000
kRows = 4196
kCols = 10

X = rng.randn(kRows, kCols)
@@ -28,26 +28,32 @@ def main(args):
dtrain = xgboost.DMatrix(X, y)
dtrain.set_info(feature_weights=fw)

bst = xgboost.train({'tree_method': 'hist',
'colsample_bynode': 0.2},
dtrain, num_boost_round=10,
evals=[(dtrain, 'd')])
# Perform column sampling for each node split evaluation, the sampling process is
# weighted by feature weights.
bst = xgboost.train(
{"tree_method": "hist", "colsample_bynode": 0.2},
dtrain,
num_boost_round=10,
evals=[(dtrain, "d")],
)
feature_map = bst.get_fscore()

# feature zero has 0 weight
assert feature_map.get('f0', None) is None
assert max(feature_map.values()) == feature_map.get('f9')
assert feature_map.get("f0", None) is None
assert max(feature_map.values()) == feature_map.get("f9")

if args.plot:
xgboost.plot_importance(bst)
plt.show()


if __name__ == '__main__':
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
'--plot',
"--plot",
type=int,
default=1,
help='Set to 0 to disable plotting the evaluation history.')
help="Set to 0 to disable plotting the evaluation history.",
)
args = parser.parse_args()
main(args)
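The demo above drives feature-weighted column sampling through the DMatrix API. Roughly the same idea can be expressed through the sklearn wrapper; the feature_weights argument of fit() is assumed to be available in this version, and the data and weights below are illustrative:

import numpy as np
import xgboost

rng = np.random.RandomState(1994)
X = rng.randn(1024, 10)
y = X.sum(axis=1)
# Weight feature i by i, so feature 0 is never sampled.
fw = np.arange(X.shape[1]).astype(np.float32)

reg = xgboost.XGBRegressor(
    tree_method="hist", colsample_bynode=0.2, n_estimators=10
)
# feature_weights in fit() is assumed to exist in the installed version.
reg.fit(X, y, feature_weights=fw)
assert reg.get_booster().get_fscore().get("f0") is None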
13 changes: 9 additions & 4 deletions demo/guide-python/sklearn_parallel.py
@@ -12,10 +12,15 @@
if __name__ == "__main__":
print("Parallel Parameter optimization")
X, y = fetch_california_housing(return_X_y=True)
xgb_model = xgb.XGBRegressor(n_jobs=multiprocessing.cpu_count() // 2)
clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
'n_estimators': [50, 100, 200]}, verbose=1,
n_jobs=2)
xgb_model = xgb.XGBRegressor(
n_jobs=multiprocessing.cpu_count() // 2, tree_method="hist"
)
clf = GridSearchCV(
xgb_model,
{"max_depth": [2, 4, 6], "n_estimators": [50, 100, 200]},
verbose=1,
n_jobs=2,
)
clf.fit(X, y)
print(clf.best_score_)
print(clf.best_params_)
@@ -261,10 +261,10 @@ class XGBoostClassifierSuite extends FunSuite with PerTest with TmpFolderPerSuit
"eta" -> "1",
"max_depth" -> "6",
"silent" -> "1",
"base_score" -> 0.5,
"objective" -> "binary:logistic",
"tree_method" -> treeMethod,
"max_bin" -> 16)

val model1 = ScalaXGBoost.train(trainingDM, paramMap, round)
val prediction1 = model1.predict(testDM)

@@ -453,5 +453,4 @@ class XGBoostClassifierSuite extends FunSuite with PerTest with TmpFolderPerSuit
assert(!compareTwoFiles(new File(modelJsonPath, "data/XGBoostClassificationModel").getPath,
nativeUbjModelPath))
}

}
4 changes: 2 additions & 2 deletions python-package/xgboost/sklearn.py
@@ -1078,7 +1078,7 @@ def predict(
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
) -> np.ndarray:
) -> ArrayLike:
"""Predict with `X`. If the model is trained with early stopping, then `best_iteration`
is used automatically. For tree models, when data is on GPU, like cupy array or
cuDF dataframe and `predictor` is not specified, the prediction is run on GPU
@@ -1528,7 +1528,7 @@ def predict(
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
) -> np.ndarray:
) -> ArrayLike:
with config_context(verbosity=self.verbosity):
class_probs = super().predict(
X=X,
54 changes: 54 additions & 0 deletions python-package/xgboost/testing/dask.py
@@ -0,0 +1,54 @@
"""Tests for dask shared by different test modules."""
import numpy as np
from dask import array as da
from distributed import Client
from xgboost.testing.updater import get_basescore

import xgboost as xgb


def check_init_estimation_clf(tree_method: str, client: Client) -> None:
"""Test init estimation for classsifier."""
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=4096 * 2, n_features=32, random_state=1994)
clf = xgb.XGBClassifier(n_estimators=1, max_depth=1, tree_method=tree_method)
clf.fit(X, y)
base_score = get_basescore(clf)

dx = da.from_array(X).rechunk(chunks=(32, None))
dy = da.from_array(y).rechunk(chunks=(32,))
dclf = xgb.dask.DaskXGBClassifier(
n_estimators=1, max_depth=1, tree_method=tree_method
)
dclf.client = client
dclf.fit(dx, dy)
dbase_score = get_basescore(dclf)
np.testing.assert_allclose(base_score, dbase_score)


def check_init_estimation_reg(tree_method: str, client: Client) -> None:
"""Test init estimation for regressor."""
from sklearn.datasets import make_regression

# pylint: disable=unbalanced-tuple-unpacking
X, y = make_regression(n_samples=4096 * 2, n_features=32, random_state=1994)
reg = xgb.XGBRegressor(n_estimators=1, max_depth=1, tree_method=tree_method)
reg.fit(X, y)
base_score = get_basescore(reg)

dx = da.from_array(X).rechunk(chunks=(32, None))
dy = da.from_array(y).rechunk(chunks=(32,))
dreg = xgb.dask.DaskXGBRegressor(
n_estimators=1, max_depth=1, tree_method=tree_method
)
dreg.client = client
dreg.fit(dx, dy)
dbase_score = get_basescore(dreg)
np.testing.assert_allclose(base_score, dbase_score)


def check_init_estimation(tree_method: str, client: Client) -> None:
"""Test init estimation."""
check_init_estimation_reg(tree_method, client)
check_init_estimation_clf(tree_method, client)
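A sketch of how these shared checks might be driven from a test, assuming dask.distributed is available; the LocalCluster settings here are illustrative rather than what the test suite actually uses:

from distributed import Client, LocalCluster

from xgboost.testing.dask import check_init_estimation

# Run the regressor and classifier checks against a small local cluster.
with LocalCluster(n_workers=2, threads_per_worker=1) as cluster:
    with Client(cluster) as client:
        check_init_estimation("hist", client)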
70 changes: 70 additions & 0 deletions python-package/xgboost/testing/updater.py
@@ -0,0 +1,70 @@
"""Tests for updaters."""
import json

import numpy as np

import xgboost as xgb


def get_basescore(model: xgb.XGBModel) -> float:
"""Get base score from an XGBoost sklearn estimator."""
base_score = float(
json.loads(model.get_booster().save_config())["learner"]["learner_model_param"][
"base_score"
]
)
return base_score


def check_init_estimation(tree_method: str) -> None:
"""Test for init estimation."""
from sklearn.datasets import (
make_classification,
make_multilabel_classification,
make_regression,
)

def run_reg(X: np.ndarray, y: np.ndarray) -> None: # pylint: disable=invalid-name
reg = xgb.XGBRegressor(tree_method=tree_method, max_depth=1, n_estimators=1)
reg.fit(X, y, eval_set=[(X, y)])
base_score_0 = get_basescore(reg)
score_0 = reg.evals_result()["validation_0"]["rmse"][0]

reg = xgb.XGBRegressor(
tree_method=tree_method, max_depth=1, n_estimators=1, boost_from_average=0
)
reg.fit(X, y, eval_set=[(X, y)])
base_score_1 = get_basescore(reg)
score_1 = reg.evals_result()["validation_0"]["rmse"][0]
assert not np.isclose(base_score_0, base_score_1)
assert score_0 < score_1 # should be better

# pylint: disable=unbalanced-tuple-unpacking
X, y = make_regression(n_samples=4096, random_state=17)
run_reg(X, y)
# pylint: disable=unbalanced-tuple-unpacking
X, y = make_regression(n_samples=4096, n_targets=3, random_state=17)
run_reg(X, y)

def run_clf(X: np.ndarray, y: np.ndarray) -> None: # pylint: disable=invalid-name
clf = xgb.XGBClassifier(tree_method=tree_method, max_depth=1, n_estimators=1)
clf.fit(X, y, eval_set=[(X, y)])
base_score_0 = get_basescore(clf)
score_0 = clf.evals_result()["validation_0"]["logloss"][0]

clf = xgb.XGBClassifier(
tree_method=tree_method, max_depth=1, n_estimators=1, boost_from_average=0
)
clf.fit(X, y, eval_set=[(X, y)])
base_score_1 = get_basescore(clf)
score_1 = clf.evals_result()["validation_0"]["logloss"][0]
assert not np.isclose(base_score_0, base_score_1)
assert score_0 < score_1 # should be better

# pylint: disable=unbalanced-tuple-unpacking
X, y = make_classification(n_samples=4096, random_state=17)
run_clf(X, y)
X, y = make_multilabel_classification(
n_samples=4096, n_labels=3, n_classes=5, random_state=17
)
run_clf(X, y)
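A short sketch of how these helpers might be used from a test module: check_init_estimation compares the first-round metric with and without boost_from_average, and get_basescore works on any fitted sklearn-style estimator. The example data below is illustrative.

import xgboost as xgb
from sklearn.datasets import make_regression

from xgboost.testing.updater import check_init_estimation, get_basescore

# Run the shared check for one tree method, as a pytest module might do.
check_init_estimation("hist")

# Inspect the estimated intercept of a single fitted estimator.
X, y = make_regression(n_samples=512, random_state=0)
reg = xgb.XGBRegressor(n_estimators=1, max_depth=1).fit(X, y)
print(get_basescore(reg))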
2 changes: 1 addition & 1 deletion src/collective/rabit_communicator.h
@@ -119,7 +119,7 @@ class RabitCommunicator : public Communicator {
}

template <typename DType, std::enable_if_t<std::is_floating_point<DType>::value> * = nullptr>
void DoBitwiseAllReduce(void *send_receive_buffer, std::size_t count, Operation op) {
void DoBitwiseAllReduce(void *, std::size_t, Operation) {
LOG(FATAL) << "Floating point types do not support bitwise operations.";
}

2 changes: 1 addition & 1 deletion src/data/data.cc
@@ -684,7 +684,7 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
}
}

void MetaInfo::Validate(int32_t device) const {
void MetaInfo::Validate(std::int32_t device) const {
if (group_ptr_.size() != 0 && weights_.Size() != 0) {
CHECK_EQ(group_ptr_.size(), weights_.Size() + 1)
<< "Size of weights must equal to number of groups when ranking "
2 changes: 1 addition & 1 deletion src/data/gradient_index.h
@@ -15,7 +15,7 @@
#include "../common/hist_util.h"
#include "../common/numeric.h"
#include "../common/threading_utils.h"
#include "../common/transform_iterator.h" // MakeIndexTransformIter
#include "../common/transform_iterator.h" // common::MakeIndexTransformIter
#include "adapter.h"
#include "proxy_dmatrix.h"
#include "xgboost/base.h"