[tests][dask] Add voting_parallel algorithm in tests (fixes #3834) #4088

Merged: 10 commits, Apr 1, 2021
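
For orientation, here is a minimal sketch of what the new test parametrization exercises: fitting a Dask estimator with the voting-parallel tree learner. The LocalCluster setup, data shapes, and hyperparameter values below are illustrative assumptions, not taken from this PR.

```python
import dask.array as da
import lightgbm as lgb
from distributed import Client, LocalCluster

# Two-worker local cluster; enough for the distributed learners to kick in.
cluster = LocalCluster(n_workers=2)
client = Client(cluster)

# Random regression data in two chunks, one per worker.
X = da.random.random((1_000, 10), chunks=(500, 10))
y = da.random.random((1_000,), chunks=(500,))

# 'voting' selects the voting-parallel algorithm; 'voting_parallel' is the
# documented alias for the same learner.
dask_regressor = lgb.DaskLGBMRegressor(
    client=client,
    tree_learner='voting',
    n_estimators=10,
    num_leaves=15
)
dask_regressor = dask_regressor.fit(X, y)
print(dask_regressor.fitted_)
```

Passing `tree_learner='voting'` is the behavior the diff below pins down across the classifier, regressor, and ranker tests.
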
6 changes: 0 additions & 6 deletions python-package/lightgbm/dask.py
@@ -271,12 +271,6 @@ def _train(
        _log_warning('Parameter tree_learner set to %s, which is not allowed. Using "data" as default' % params['tree_learner'])
        params['tree_learner'] = 'data'

-    if params['tree_learner'] not in {'data', 'data_parallel'}:
-        _log_warning(
-            'Support for tree_learner %s in lightgbm.dask is experimental and may break in a future release. \n'
-            'Use "data" for a stable, well-tested interface.' % params['tree_learner']
-        )
-
    # Some passed-in parameters can be removed:
    # * 'num_machines': set automatically from Dask worker list
    # * 'num_threads': overridden to match nthreads on each Dask process
61 changes: 40 additions & 21 deletions tests/python_package_test/test_dask.py
@@ -44,6 +44,7 @@
CLIENT_CLOSE_TIMEOUT = 120

tasks = ['binary-classification', 'multiclass-classification', 'regression', 'ranking']
+distributed_training_algorithms = ['data', 'voting']
data_output = ['array', 'scipy_csr_matrix', 'dataframe', 'dataframe-with-categorical']
boosting_types = ['gbdt', 'dart', 'goss', 'rf']
group_sizes = [5, 5, 5, 10, 10, 10, 20, 20, 20, 50, 50]
@@ -235,14 +236,16 @@ def _unpickle(filepath, serializer):
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification'])
@pytest.mark.parametrize('boosting_type', boosting_types)
-def test_classifier(output, task, boosting_type, client):
+@pytest.mark.parametrize('tree_learner', distributed_training_algorithms)
+def test_classifier(output, task, boosting_type, tree_learner, client):
    X, y, w, _, dX, dy, dw, _ = _create_data(
        objective=task,
        output=output
    )

    params = {
        "boosting_type": boosting_type,
+        "tree_learner": tree_learner,
        "n_estimators": 50,
        "num_leaves": 31
    }
@@ -273,7 +276,7 @@ def test_classifier(output, task, boosting_type, client):
    p2_proba = local_classifier.predict_proba(X)
    s2 = local_classifier.score(X, y)

-    if boosting_type == 'rf' and output == 'dataframe-with-categorical':
+    if boosting_type == 'rf':
        # https://github.com/microsoft/LightGBM/issues/4118
        assert_eq(s1, s2, atol=0.01)
        assert_eq(p1_proba, p2_proba, atol=0.8)
@@ -417,7 +420,8 @@ def test_training_does_not_fail_on_port_conflicts(client):

@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize('boosting_type', boosting_types)
-def test_regressor(output, boosting_type, client):
+@pytest.mark.parametrize('tree_learner', distributed_training_algorithms)
+def test_regressor(output, boosting_type, tree_learner, client):
    X, y, w, _, dX, dy, dw, _ = _create_data(
        objective='regression',
        output=output
@@ -438,7 +442,7 @@ def test_regressor(output, boosting_type, client):
    dask_regressor = lgb.DaskLGBMRegressor(
        client=client,
        time_out=5,
-        tree='data',
+        tree=tree_learner,
        **params
    )
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
@@ -592,7 +596,8 @@ def test_regressor_quantile(output, client, alpha):
@pytest.mark.parametrize('output', ['array', 'dataframe', 'dataframe-with-categorical'])
@pytest.mark.parametrize('group', [None, group_sizes])
@pytest.mark.parametrize('boosting_type', boosting_types)
-def test_ranker(output, group, boosting_type, client):
+@pytest.mark.parametrize('tree_learner', distributed_training_algorithms)
+def test_ranker(output, group, boosting_type, tree_learner, client):
    if output == 'dataframe-with-categorical':
        X, y, w, g, dX, dy, dw, dg = _create_data(
            objective='ranking',
@@ -635,7 +640,7 @@ def test_ranker(output, group, boosting_type, client):
    dask_ranker = lgb.DaskLGBMRanker(
        client=client,
        time_out=5,
-        tree_learner_type='data_parallel',
+        tree_learner_type=tree_learner,
        **params
    )
    dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
@@ -930,22 +935,36 @@ def test_warns_and_continues_on_unrecognized_tree_learner(client):
    client.close(timeout=CLIENT_CLOSE_TIMEOUT)


-def test_warns_but_makes_no_changes_for_feature_or_voting_tree_learner(client):
-    X = da.random.random((1e3, 10))
-    y = da.random.random((1e3, 1))
-    for tree_learner in ['feature_parallel', 'voting']:
-        dask_regressor = lgb.DaskLGBMRegressor(
-            client=client,
-            time_out=5,
-            tree_learner=tree_learner,
-            n_estimators=1,
-            num_leaves=2
-        )
-        with pytest.warns(UserWarning, match='Support for tree_learner %s in lightgbm' % tree_learner):
-            dask_regressor = dask_regressor.fit(X, y)
+@pytest.mark.parametrize('task', tasks)
+@pytest.mark.parametrize('tree_learner', ['data_parallel', 'voting_parallel'])
+def test_training_respects_tree_learner_aliases(task, tree_learner, client):
+    _, _, _, _, dX, dy, dw, dg = _create_data(objective=task, output='array')
+    dask_factory = task_to_dask_factory[task]
+    dask_model = dask_factory(
+        client=client,
+        tree_learner=tree_learner,
+        time_out=5,
+        n_estimators=10,
+        num_leaves=15
+    )
+    dask_model.fit(dX, dy, sample_weight=dw, group=dg)

-    assert dask_regressor.fitted_
-    assert dask_regressor.get_params()['tree_learner'] == tree_learner
+    assert dask_model.fitted_
+    assert dask_model.get_params()['tree_learner'] == tree_learner


+def test_error_on_feature_parallel_tree_learner(client):
+    X = da.random.random((1_000, 10), chunks=(500, 10))
+    y = da.random.random((1_000, 1), chunks=500)
+    dask_regressor = lgb.DaskLGBMRegressor(
+        client=client,
+        time_out=5,
+        tree_learner='feature_parallel',
+        n_estimators=1,
+        num_leaves=2
+    )
+    with pytest.raises(lgb.basic.LightGBMError, match='Do not support feature parallel in c api'):
+        dask_regressor = dask_regressor.fit(X, y)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

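
To run only the new parametrizations locally, filtering on the test ids should work; the module path comes from this diff, while the `-k` expression is just an assumed convenient filter.

```python
# Programmatic equivalent of `pytest tests/python_package_test/test_dask.py -k voting`;
# '-k voting' matches the parametrized test ids that include the voting learner.
import pytest

pytest.main(['tests/python_package_test/test_dask.py', '-k', 'voting'])
```
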