[tests][dask] Create an informative categorical feature (#4113)
* Make one categorical variable informative; increase n_samples; reduce n_features for regression

* Adjust tolerances in checks
jmoralez authored Mar 26, 2021
1 parent a45ed16 commit 8cc6eef
Showing 1 changed file with 21 additions and 25 deletions.
46 changes: 21 additions & 25 deletions tests/python_package_test/test_dask.py
@@ -131,7 +131,7 @@ def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs)
     return X, y, w, g_rle, dX, dy, dw, dg
 
 
-def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwargs):
+def _create_data(objective, n_samples=1_000, output='array', chunk_size=500, **kwargs):
     if objective.endswith('classification'):
         if objective == 'binary-classification':
             centers = [[-4, -4], [4, 4]]
@@ -141,7 +141,7 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwar
             raise ValueError(f"Unknown classification task '{objective}'")
         X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=42)
     elif objective == 'regression':
-        X, y = make_regression(n_samples=n_samples, random_state=42)
+        X, y = make_regression(n_samples=n_samples, n_features=4, n_informative=2, random_state=42)
     elif objective == 'ranking':
         return _create_ranking_data(
             n_samples=n_samples,
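
Shrinking the regression data to 4 features, only 2 of them informative, makes the target much easier to recover with a small model. A quick sketch of what this generator call produces; the LinearRegression fit is only an illustration, not part of the test:

from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

# same generator settings the test now uses: 1,000 rows, 4 features, 2 of them informative
X, y = make_regression(n_samples=1_000, n_features=4, n_informative=2, random_state=42)

# with no noise term, even a plain linear fit recovers the target exactly,
# so a modest LightGBM model can also reach a high R^2 in the tests
print(X.shape)                                   # (1000, 4)
print(LinearRegression().fit(X, y).score(X, y))  # 1.0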
@@ -161,7 +161,7 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwar
     elif output.startswith('dataframe'):
         X_df = pd.DataFrame(X, columns=['feature_%d' % i for i in range(X.shape[1])])
         if output == 'dataframe-with-categorical':
-            num_cat_cols = 5
+            num_cat_cols = 2
             for i in range(num_cat_cols):
                 col_name = "cat_col" + str(i)
                 cat_values = rnd.choice(['a', 'b'], X.shape[0])
@@ -172,13 +172,15 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwar
                 X_df[col_name] = cat_series
                 X = np.hstack((X, cat_series.cat.codes.values.reshape(-1, 1)))
 
-            # for the small data sizes used in tests, it's hard to get LGBMRegressor to choose
-            # categorical features for splits. So for regression tests with categorical features,
-            # _create_data() returns a DataFrame with ONLY categorical features
+            # make one categorical feature relevant to the target
+            cat_col_is_a = X_df['cat_col0'] == 'a'
             if objective == 'regression':
-                cat_cols = [col for col in X_df.columns if col.startswith('cat_col')]
-                X_df = X_df[cat_cols]
-                X = X[:, -num_cat_cols:]
+                y = np.where(cat_col_is_a, y, 2 * y)
+            elif objective == 'binary-classification':
+                y = np.where(cat_col_is_a, y, 1 - y)
+            elif objective == 'multiclass-classification':
+                n_classes = 3
+                y = np.where(cat_col_is_a, y, (1 + y) % n_classes)
         y_df = pd.Series(y, name='target')
         dX = dd.from_pandas(X_df, chunksize=chunk_size)
         dy = dd.from_pandas(y_df, chunksize=chunk_size)
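
The new block ties the target to cat_col0, so the categorical column carries real signal instead of noise. A standalone sketch of the regression branch; variable names mirror the diff, but the DataFrame here is simplified to one numeric and one categorical column:

import numpy as np
import pandas as pd

rnd = np.random.RandomState(42)
n_samples = 1_000

# one numeric feature that drives the target, plus one categorical column
feature_0 = rnd.normal(size=n_samples)
X_df = pd.DataFrame({
    'feature_0': feature_0,
    'cat_col0': pd.Series(rnd.choice(['a', 'b'], n_samples), dtype='category'),
})
y = 10.0 * feature_0

# same trick as in the diff: double the target wherever cat_col0 != 'a',
# so the category changes how y relates to the other features
cat_col_is_a = X_df['cat_col0'] == 'a'
y = np.where(cat_col_is_a, y, 2 * y)

With this relationship in place, a regressor trained on X_df has something to gain from splitting on cat_col0, which is what the categorical-usage check at the end of the diff relies on.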
@@ -238,8 +240,8 @@ def test_classifier(output, task, client):
     )
 
     params = {
-        "n_estimators": 10,
-        "num_leaves": 10
+        "n_estimators": 50,
+        "num_leaves": 31
     }
 
     dask_classifier = lgb.DaskLGBMClassifier(
@@ -265,7 +267,7 @@ def test_classifier(output, task, client):
     assert_eq(p1, p2)
     assert_eq(y, p1)
     assert_eq(y, p2)
-    assert_eq(p1_proba, p2_proba, atol=0.3)
+    assert_eq(p1_proba, p2_proba, atol=0.01)
     assert_eq(p1_local, p2)
     assert_eq(y, p1_local)
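
With 1,000 samples and 50 trees, the distributed and local classifiers produce nearly identical probabilities, so the tolerance can drop from 0.3 to 0.01. assert_eq here comes from dask's test utilities; numpy's assert_allclose, used in this sketch with illustrative values, behaves similarly for the absolute-tolerance part:

import numpy as np

# illustrative probability matrices from a distributed and a local model
p1_proba = np.array([[0.905, 0.095], [0.184, 0.816]])
p2_proba = np.array([[0.901, 0.099], [0.188, 0.812]])

# atol=0.3 would accept almost any pair of probability vectors;
# atol=0.01 requires the two models to agree to within one percentage point
np.testing.assert_allclose(p1_proba, p2_proba, atol=0.01)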

@@ -407,7 +409,8 @@ def test_regressor(output, client):
 
     params = {
         "random_state": 42,
-        "num_leaves": 10
+        "num_leaves": 31,
+        "n_estimators": 20,
     }
 
     dask_regressor = lgb.DaskLGBMRegressor(
@@ -420,8 +423,7 @@ def test_regressor(output, client):
     p1 = dask_regressor.predict(dX)
     p1_pred_leaf = dask_regressor.predict(dX, pred_leaf=True)
 
-    if not output.startswith('dataframe'):
-        s1 = _r2_score(dy, p1)
+    s1 = _r2_score(dy, p1)
     p1 = p1.compute()
     p1_local = dask_regressor.to_local().predict(X)
     s1_local = dask_regressor.to_local().score(X, y)
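
s1 is now computed for every output type, not just arrays. _r2_score is a small helper defined earlier in this test module; a rough sketch of how such a score can be built from lazy dask reductions (an approximation of the helper, not its exact code):

import dask.array as da
import numpy as np

def r2_score_dask(dy_true, dy_pred):
    # classic R^2 = 1 - SS_res / SS_tot, evaluated only at the final compute()
    ss_res = ((dy_true - dy_pred) ** 2).sum()
    ss_tot = ((dy_true - dy_true.mean()) ** 2).sum()
    return (1.0 - ss_res / ss_tot).compute()

# tiny usage example on chunked arrays
y_true = da.from_array(np.array([1.0, 2.0, 3.0, 4.0]), chunks=2)
y_pred = da.from_array(np.array([1.1, 1.9, 3.2, 3.8]), chunks=2)
print(r2_score_dask(y_true, y_pred))  # about 0.98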
@@ -432,9 +434,8 @@ def test_regressor(output, client):
     p2 = local_regressor.predict(X)
 
     # Scores should be the same
-    if not output.startswith('dataframe'):
-        assert_eq(s1, s2, atol=.01)
-        assert_eq(s1, s1_local, atol=.003)
+    assert_eq(s1, s2, atol=0.01)
+    assert_eq(s1, s1_local)
 
     # Predictions should be roughly the same.
     assert_eq(p1, p1_local)
@@ -450,13 +451,8 @@ def test_regressor(output, client):
     assert np.min(pred_leaf_vals) >= 0
     assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']
 
-    # The checks below are skipped
-    # for the categorical data case because it's difficult to get
-    # a good fit from just categoricals for a regression problem
-    # with small data
-    if output != 'dataframe-with-categorical':
-        assert_eq(y, p1, rtol=1., atol=100.)
-        assert_eq(y, p2, rtol=1., atol=50.)
+    assert_eq(p1, y, rtol=0.5, atol=50.)
+    assert_eq(p2, y, rtol=0.5, atol=50.)
 
     # be sure LightGBM actually used at least one categorical column,
     # and that it was correctly treated as a categorical feature
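
The check introduced by that comment continues past the end of this excerpt. One way to confirm a categorical split using LightGBM's own introspection is to inspect the trained booster's tree table; this sketch assumes cat_col0 is a pandas 'category' column as in the _create_data sketch above, and is not the test's actual assertion:

import lightgbm as lgb

# X_df / y as in the earlier sketch: 'cat_col0' has dtype 'category', and the
# target is scaled differently for the two category levels
model = lgb.LGBMRegressor(n_estimators=20, num_leaves=31).fit(X_df, y)

tree_df = model.booster_.trees_to_dataframe()
cat_splits = tree_df[tree_df['split_feature'] == 'cat_col0']

# at least one node should split on the categorical column, and categorical
# splits are reported with the '==' decision type rather than a numeric threshold
assert len(cat_splits) > 0
assert (cat_splits['decision_type'] == '==').all()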
