
[tests][dask] test all boosting types (fixes #3896) #4119

Merged
merged 3 commits on Mar 30, 2021
Changes from all commits
49 changes: 39 additions & 10 deletions tests/python_package_test/test_dask.py
@@ -45,6 +45,7 @@

 tasks = ['binary-classification', 'multiclass-classification', 'regression', 'ranking']
 data_output = ['array', 'scipy_csr_matrix', 'dataframe', 'dataframe-with-categorical']
+boosting_types = ['gbdt', 'dart', 'goss', 'rf']
 group_sizes = [5, 5, 5, 10, 10, 10, 20, 20, 20, 50, 50]
 task_to_dask_factory = {
     'regression': lgb.DaskLGBMRegressor,
@@ -233,16 +234,25 @@ def _unpickle(filepath, serializer):

 @pytest.mark.parametrize('output', data_output)
 @pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification'])
-def test_classifier(output, task, client):
+@pytest.mark.parametrize('boosting_type', boosting_types)
+def test_classifier(output, task, boosting_type, client):
     X, y, w, _, dX, dy, dw, _ = _create_data(
         objective=task,
         output=output
     )

     params = {
+        "boosting_type": boosting_type,
         "n_estimators": 50,
         "num_leaves": 31
     }
+    if boosting_type == 'rf':
+        params.update({
+            'bagging_freq': 1,
+            'bagging_fraction': 0.9,
+        })
+    elif boosting_type == 'goss':
+        params['top_rate'] = 0.5

     dask_classifier = lgb.DaskLGBMClassifier(
         client=client,
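For context on what the new decorator buys (a minimal standalone sketch, not code from this PR): stacking @pytest.mark.parametrize decorators makes pytest run the test once per combination of values, so test_classifier now runs for every (output, task, boosting_type) triple. The names and trimmed-down lists below are illustrative.

import pytest

OUTPUTS = ['array', 'dataframe']                  # stand-in for data_output
BOOSTING_TYPES = ['gbdt', 'dart', 'goss', 'rf']   # same values as boosting_types

@pytest.mark.parametrize('output', OUTPUTS)
@pytest.mark.parametrize('boosting_type', BOOSTING_TYPES)
def test_combinations(output, boosting_type):
    # pytest generates one test case per (boosting_type, output) pair:
    # 4 boosting types * 2 outputs = 8 cases in this toy example.
    assert boosting_type in BOOSTING_TYPES
    assert output in OUTPUTS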
@@ -263,13 +273,18 @@ def test_classifier(output, task, client):
     p2_proba = local_classifier.predict_proba(X)
     s2 = local_classifier.score(X, y)

-    assert_eq(s1, s2)
-    assert_eq(p1, p2)
-    assert_eq(y, p1)
-    assert_eq(y, p2)
-    assert_eq(p1_proba, p2_proba, atol=0.01)
-    assert_eq(p1_local, p2)
-    assert_eq(y, p1_local)
+    if boosting_type == 'rf' and output == 'dataframe-with-categorical':
+        # https://github.com/microsoft/LightGBM/issues/4118
+        assert_eq(s1, s2, atol=0.01)
+        assert_eq(p1_proba, p2_proba, atol=0.8)
+    else:
+        assert_eq(s1, s2)
+        assert_eq(p1, p2)
Collaborator:
Why are we dropping assert_eq(s1, s2) from the else branch?

Collaborator (Author):
That was totally my bad, I lost it when splitting the statements. I've included it in 5d7001d.

Collaborator:
thanks for catching it @StrikerRUS! I missed it too

+        assert_eq(p1, y)
+        assert_eq(p2, y)
+        assert_eq(p1_proba, p2_proba, atol=0.03)
+        assert_eq(p1_local, p2)
+        assert_eq(p1_local, y)

     # pref_leaf values should have the right shape
     # and values that look like valid tree nodes
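On the tolerances above: atol is an absolute element-wise tolerance, so atol=0.8 on the probability comparison for the rf + dataframe-with-categorical case only rules out wildly different outputs while issue #4118 is open, whereas atol=0.03 in the else branch demands close agreement. A rough numpy analogue (assert_eq here comes from dask; the tolerance semantics are assumed to mirror numpy.testing.assert_allclose, and the arrays below are made up):

import numpy as np
from numpy.testing import assert_allclose

p1_proba = np.array([0.10, 0.55, 0.90])   # hypothetical distributed-model probabilities
p2_proba = np.array([0.12, 0.53, 0.91])   # hypothetical local-model probabilities

# Strict check, as in the else branch: every element must agree within 0.03.
assert_allclose(p1_proba, p2_proba, atol=0.03)

# Loose check, as in the rf + categorical case: elements only need to be within 0.8.
assert_allclose(p1_proba, p2_proba, atol=0.8)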
@@ -401,17 +416,24 @@ def test_training_does_not_fail_on_port_conflicts(client):


 @pytest.mark.parametrize('output', data_output)
-def test_regressor(output, client):
+@pytest.mark.parametrize('boosting_type', boosting_types)
+def test_regressor(output, boosting_type, client):
     X, y, w, _, dX, dy, dw, _ = _create_data(
         objective='regression',
         output=output
     )

     params = {
+        "boosting_type": boosting_type,
         "random_state": 42,
         "num_leaves": 31,
         "n_estimators": 20,
     }
+    if boosting_type == 'rf':
+        params.update({
+            'bagging_freq': 1,
+            'bagging_fraction': 0.9,
+        })

     dask_regressor = lgb.DaskLGBMRegressor(
         client=client,
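The same 'rf' branch appears in test_classifier, test_regressor, and test_ranker. The background (LightGBM behavior, not something spelled out in the diff): random-forest mode trains each tree on a bagged subsample, so LightGBM refuses to fit with boosting_type='rf' unless bagging is enabled via bagging_freq > 0 and a bagging_fraction below 1.0. A minimal non-Dask sketch with illustrative data:

import lightgbm as lgb
import numpy as np

rng = np.random.default_rng(42)
X = rng.random((200, 5))
y = rng.random(200)

# Without bagging_freq/bagging_fraction, boosting_type='rf' errors at fit time.
rf_model = lgb.LGBMRegressor(
    boosting_type='rf',
    bagging_freq=1,
    bagging_fraction=0.9,
    n_estimators=20,
    num_leaves=31,
)
rf_model.fit(X, y)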
@@ -569,7 +591,8 @@ def test_regressor_quantile(output, client, alpha):

 @pytest.mark.parametrize('output', ['array', 'dataframe', 'dataframe-with-categorical'])
 @pytest.mark.parametrize('group', [None, group_sizes])
-def test_ranker(output, client, group):
+@pytest.mark.parametrize('boosting_type', boosting_types)
+def test_ranker(output, group, boosting_type, client):
     if output == 'dataframe-with-categorical':
         X, y, w, g, dX, dy, dw, dg = _create_data(
             objective='ranking',
@@ -597,11 +620,17 @@ def test_ranker(output, client, group):
     # use many trees + leaves to overfit, help ensure that Dask data-parallel strategy matches that of
     # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
     params = {
+        "boosting_type": boosting_type,
         "random_state": 42,
         "n_estimators": 50,
         "num_leaves": 20,
         "min_child_samples": 1
     }
+    if boosting_type == 'rf':
+        params.update({
+            'bagging_freq': 1,
+            'bagging_fraction': 0.9,
+        })

     dask_ranker = lgb.DaskLGBMRanker(
         client=client,
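Putting it together, the pattern each of these tests exercises is: train a Dask estimator and its local scikit-learn counterpart with identical params, then compare predictions. A standalone sketch of that pattern (not code from the PR; the cluster size, data shapes, and printed metric are illustrative):

import dask.array as da
import lightgbm as lgb
import numpy as np
from dask.distributed import Client, LocalCluster

cluster = LocalCluster(n_workers=2, threads_per_worker=1)
client = Client(cluster)

X = np.random.rand(1000, 10)
y = np.random.rand(1000)
dX = da.from_array(X, chunks=(250, 10))
dy = da.from_array(y, chunks=(250,))

params = {
    "boosting_type": "rf",
    "bagging_freq": 1,
    "bagging_fraction": 0.9,
    "n_estimators": 20,
    "num_leaves": 31,
    "random_state": 42,
}

dask_regressor = lgb.DaskLGBMRegressor(client=client, **params).fit(dX, dy)
local_regressor = lgb.LGBMRegressor(**params).fit(X, y)

# Largest absolute disagreement between distributed and local predictions.
print(np.abs(dask_regressor.predict(dX).compute() - local_regressor.predict(X)).max())

client.close()
cluster.close()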