diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py
index 103a5450291e..d4434d68d503 100644
--- a/tests/python_package_test/test_dask.py
+++ b/tests/python_package_test/test_dask.py
@@ -131,7 +131,7 @@ def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs)
     return X, y, w, g_rle, dX, dy, dw, dg


-def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwargs):
+def _create_data(objective, n_samples=1_000, output='array', chunk_size=500, **kwargs):
     if objective.endswith('classification'):
         if objective == 'binary-classification':
             centers = [[-4, -4], [4, 4]]
@@ -141,7 +141,7 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwar
             raise ValueError(f"Unknown classification task '{objective}'")
         X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=42)
     elif objective == 'regression':
-        X, y = make_regression(n_samples=n_samples, random_state=42)
+        X, y = make_regression(n_samples=n_samples, n_features=4, n_informative=2, random_state=42)
     elif objective == 'ranking':
         return _create_ranking_data(
             n_samples=n_samples,
@@ -161,7 +161,7 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwar
     elif output.startswith('dataframe'):
         X_df = pd.DataFrame(X, columns=['feature_%d' % i for i in range(X.shape[1])])
         if output == 'dataframe-with-categorical':
-            num_cat_cols = 5
+            num_cat_cols = 2
             for i in range(num_cat_cols):
                 col_name = "cat_col" + str(i)
                 cat_values = rnd.choice(['a', 'b'], X.shape[0])
@@ -172,13 +172,15 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwar
                 X_df[col_name] = cat_series
                 X = np.hstack((X, cat_series.cat.codes.values.reshape(-1, 1)))

-            # for the small data sizes used in tests, it's hard to get LGBMRegressor to choose
-            # categorical features for splits. So for regression tests with categorical features,
-            # _create_data() returns a DataFrame with ONLY categorical features
+            # make one categorical feature relevant to the target
+            cat_col_is_a = X_df['cat_col0'] == 'a'
             if objective == 'regression':
-                cat_cols = [col for col in X_df.columns if col.startswith('cat_col')]
-                X_df = X_df[cat_cols]
-                X = X[:, -num_cat_cols:]
+                y = np.where(cat_col_is_a, y, 2 * y)
+            elif objective == 'binary-classification':
+                y = np.where(cat_col_is_a, y, 1 - y)
+            elif objective == 'multiclass-classification':
+                n_classes = 3
+                y = np.where(cat_col_is_a, y, (1 + y) % n_classes)
         y_df = pd.Series(y, name='target')
         dX = dd.from_pandas(X_df, chunksize=chunk_size)
         dy = dd.from_pandas(y_df, chunksize=chunk_size)
@@ -238,8 +240,8 @@ def test_classifier(output, task, client):
     )

     params = {
-        "n_estimators": 10,
-        "num_leaves": 10
+        "n_estimators": 50,
+        "num_leaves": 31
     }

     dask_classifier = lgb.DaskLGBMClassifier(
@@ -265,7 +267,7 @@ def test_classifier(output, task, client):
     assert_eq(p1, p2)
     assert_eq(y, p1)
     assert_eq(y, p2)
-    assert_eq(p1_proba, p2_proba, atol=0.3)
+    assert_eq(p1_proba, p2_proba, atol=0.01)
     assert_eq(p1_local, p2)
     assert_eq(y, p1_local)

@@ -407,7 +409,8 @@ def test_regressor(output, client):

     params = {
         "random_state": 42,
-        "num_leaves": 10
+        "num_leaves": 31,
+        "n_estimators": 20,
     }

     dask_regressor = lgb.DaskLGBMRegressor(
@@ -420,8 +423,7 @@ def test_regressor(output, client):

     p1 = dask_regressor.predict(dX)
     p1_pred_leaf = dask_regressor.predict(dX, pred_leaf=True)
-    if not output.startswith('dataframe'):
-        s1 = _r2_score(dy, p1)
+    s1 = _r2_score(dy, p1)
     p1 = p1.compute()
     p1_local = dask_regressor.to_local().predict(X)
     s1_local = dask_regressor.to_local().score(X, y)
@@ -432,9 +434,8 @@ def test_regressor(output, client):
     p2 = local_regressor.predict(X)

     # Scores should be the same
-    if not output.startswith('dataframe'):
-        assert_eq(s1, s2, atol=.01)
-        assert_eq(s1, s1_local, atol=.003)
+    assert_eq(s1, s2, atol=0.01)
+    assert_eq(s1, s1_local)

     # Predictions should be roughly the same.
     assert_eq(p1, p1_local)
@@ -450,13 +451,8 @@ def test_regressor(output, client):
     assert np.min(pred_leaf_vals) >= 0
     assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']

-    # The checks below are skipped
-    # for the categorical data case because it's difficult to get
-    # a good fit from just categoricals for a regression problem
-    # with small data
-    if output != 'dataframe-with-categorical':
-        assert_eq(y, p1, rtol=1., atol=100.)
-        assert_eq(y, p2, rtol=1., atol=50.)
+    assert_eq(p1, y, rtol=0.5, atol=50.)
+    assert_eq(p2, y, rtol=0.5, atol=50.)

     # be sure LightGBM actually used at least one categorical column,
     # and that it was correctly treated as a categorical feature
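For context on the _create_data() change: rewriting y with np.where couples the target to cat_col0, which makes that categorical column genuinely predictive and lets the regressor assertions run for the dataframe-with-categorical case instead of skipping it. A minimal non-Dask sketch of the same idea is below; it is illustrative only, uses plain LGBMRegressor rather than DaskLGBMRegressor, and the split_counts name does not come from the diff.

import numpy as np
import pandas as pd
from sklearn.datasets import make_regression

import lightgbm as lgb

# regression data with one pandas categorical column that matters:
# rows where cat_col0 == 'b' get a doubled target, mirroring the
# y = np.where(cat_col_is_a, y, 2 * y) line in the diff above
rnd = np.random.RandomState(42)
X, y = make_regression(n_samples=1_000, n_features=4, n_informative=2, random_state=42)
X_df = pd.DataFrame(X, columns=['feature_%d' % i for i in range(X.shape[1])])
X_df['cat_col0'] = pd.Series(rnd.choice(['a', 'b'], X.shape[0]), dtype='category')
y = np.where(X_df['cat_col0'] == 'a', y, 2 * y)

reg = lgb.LGBMRegressor(n_estimators=20, num_leaves=31, random_state=42).fit(X_df, y)

# because the target depends on the category, the fitted model should
# spend at least one split on cat_col0
split_counts = dict(zip(reg.booster_.feature_name(), reg.booster_.feature_importance(importance_type='split')))
assert split_counts['cat_col0'] > 0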