[tests][dask] Create an informative categorical feature (#4113)
* Make one categorical variable informative; increase n_samples; reduce n_features for regression

* Adjust tolerances in checks
jmoralez authored Mar 26, 2021
1 parent a45ed16 commit 8cc6eef
Showing 1 changed file with 21 additions and 25 deletions.
46 changes: 21 additions & 25 deletions tests/python_package_test/test_dask.py
@@ -131,7 +131,7 @@ def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs)
     return X, y, w, g_rle, dX, dy, dw, dg
 
 
-def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwargs):
+def _create_data(objective, n_samples=1_000, output='array', chunk_size=500, **kwargs):
     if objective.endswith('classification'):
         if objective == 'binary-classification':
             centers = [[-4, -4], [4, 4]]
@@ -141,7 +141,7 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwar
             raise ValueError(f"Unknown classification task '{objective}'")
         X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=42)
     elif objective == 'regression':
-        X, y = make_regression(n_samples=n_samples, random_state=42)
+        X, y = make_regression(n_samples=n_samples, n_features=4, n_informative=2, random_state=42)
     elif objective == 'ranking':
         return _create_ranking_data(
             n_samples=n_samples,
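
Shrinking the regression data to 4 features, only 2 of them informative, makes the target much easier to recover with a small model. A quick sketch of what this generator call produces; the LinearRegression fit is only an illustration, not part of the test:

from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

# same generator settings the test now uses: 1,000 rows, 4 features, 2 of them informative
X, y = make_regression(n_samples=1_000, n_features=4, n_informative=2, random_state=42)

# with no noise term, even a plain linear fit recovers the target exactly,
# so a modest LightGBM model can also reach a high R^2 in the tests
print(X.shape)                                   # (1000, 4)
print(LinearRegression().fit(X, y).score(X, y))  # 1.0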
@@ -161,7 +161,7 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwar
     elif output.startswith('dataframe'):
         X_df = pd.DataFrame(X, columns=['feature_%d' % i for i in range(X.shape[1])])
         if output == 'dataframe-with-categorical':
-            num_cat_cols = 5
+            num_cat_cols = 2
             for i in range(num_cat_cols):
                 col_name = "cat_col" + str(i)
                 cat_values = rnd.choice(['a', 'b'], X.shape[0])
@@ -172,13 +172,15 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwar
                 X_df[col_name] = cat_series
                 X = np.hstack((X, cat_series.cat.codes.values.reshape(-1, 1)))
 
-            # for the small data sizes used in tests, it's hard to get LGBMRegressor to choose
-            # categorical features for splits. So for regression tests with categorical features,
-            # _create_data() returns a DataFrame with ONLY categorical features
+            # make one categorical feature relevant to the target
+            cat_col_is_a = X_df['cat_col0'] == 'a'
             if objective == 'regression':
-                cat_cols = [col for col in X_df.columns if col.startswith('cat_col')]
-                X_df = X_df[cat_cols]
-                X = X[:, -num_cat_cols:]
+                y = np.where(cat_col_is_a, y, 2 * y)
+            elif objective == 'binary-classification':
+                y = np.where(cat_col_is_a, y, 1 - y)
+            elif objective == 'multiclass-classification':
+                n_classes = 3
+                y = np.where(cat_col_is_a, y, (1 + y) % n_classes)
         y_df = pd.Series(y, name='target')
         dX = dd.from_pandas(X_df, chunksize=chunk_size)
         dy = dd.from_pandas(y_df, chunksize=chunk_size)
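
The new block ties the target to cat_col0, so the categorical column carries real signal instead of noise. A standalone sketch of the regression branch; variable names mirror the diff, but the DataFrame here is simplified to one numeric and one categorical column:

import numpy as np
import pandas as pd

rnd = np.random.RandomState(42)
n_samples = 1_000

# one numeric feature that drives the target, plus one categorical column
feature_0 = rnd.normal(size=n_samples)
X_df = pd.DataFrame({
    'feature_0': feature_0,
    'cat_col0': pd.Series(rnd.choice(['a', 'b'], n_samples), dtype='category'),
})
y = 10.0 * feature_0

# same trick as in the diff: double the target wherever cat_col0 != 'a',
# so the category changes how y relates to the other features
cat_col_is_a = X_df['cat_col0'] == 'a'
y = np.where(cat_col_is_a, y, 2 * y)

With this relationship in place, a regressor trained on X_df has something to gain from splitting on cat_col0, which is what the categorical-usage check at the end of the diff relies on.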
@@ -238,8 +240,8 @@ def test_classifier(output, task, client):
     )
 
     params = {
-        "n_estimators": 10,
-        "num_leaves": 10
+        "n_estimators": 50,
+        "num_leaves": 31
     }
 
     dask_classifier = lgb.DaskLGBMClassifier(
@@ -265,7 +267,7 @@ def test_classifier(output, task, client):
     assert_eq(p1, p2)
     assert_eq(y, p1)
     assert_eq(y, p2)
-    assert_eq(p1_proba, p2_proba, atol=0.3)
+    assert_eq(p1_proba, p2_proba, atol=0.01)
     assert_eq(p1_local, p2)
     assert_eq(y, p1_local)
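
With 1,000 samples and 50 trees, the distributed and local classifiers produce nearly identical probabilities, so the tolerance can drop from 0.3 to 0.01. assert_eq here comes from dask's test utilities; numpy's assert_allclose, used in this sketch with illustrative values, behaves similarly for the absolute-tolerance part:

import numpy as np

# illustrative probability matrices from a distributed and a local model
p1_proba = np.array([[0.905, 0.095], [0.184, 0.816]])
p2_proba = np.array([[0.901, 0.099], [0.188, 0.812]])

# atol=0.3 would accept almost any pair of probability vectors;
# atol=0.01 requires the two models to agree to within one percentage point
np.testing.assert_allclose(p1_proba, p2_proba, atol=0.01)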

@@ -407,7 +409,8 @@ def test_regressor(output, client):
 
     params = {
         "random_state": 42,
-        "num_leaves": 10
+        "num_leaves": 31,
+        "n_estimators": 20,
     }
 
     dask_regressor = lgb.DaskLGBMRegressor(
@@ -420,8 +423,7 @@ def test_regressor(output, client):
     p1 = dask_regressor.predict(dX)
     p1_pred_leaf = dask_regressor.predict(dX, pred_leaf=True)
 
-    if not output.startswith('dataframe'):
-        s1 = _r2_score(dy, p1)
+    s1 = _r2_score(dy, p1)
     p1 = p1.compute()
     p1_local = dask_regressor.to_local().predict(X)
     s1_local = dask_regressor.to_local().score(X, y)
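
s1 is now computed for every output type, not just arrays. _r2_score is a small helper defined earlier in this test module; a rough sketch of how such a score can be built from lazy dask reductions (an approximation of the helper, not its exact code):

import dask.array as da
import numpy as np

def r2_score_dask(dy_true, dy_pred):
    # classic R^2 = 1 - SS_res / SS_tot, evaluated only at the final compute()
    ss_res = ((dy_true - dy_pred) ** 2).sum()
    ss_tot = ((dy_true - dy_true.mean()) ** 2).sum()
    return (1.0 - ss_res / ss_tot).compute()

# tiny usage example on chunked arrays
y_true = da.from_array(np.array([1.0, 2.0, 3.0, 4.0]), chunks=2)
y_pred = da.from_array(np.array([1.1, 1.9, 3.2, 3.8]), chunks=2)
print(r2_score_dask(y_true, y_pred))  # about 0.98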
@@ -432,9 +434,8 @@ def test_regressor(output, client):
     p2 = local_regressor.predict(X)
 
     # Scores should be the same
-    if not output.startswith('dataframe'):
-        assert_eq(s1, s2, atol=.01)
-        assert_eq(s1, s1_local, atol=.003)
+    assert_eq(s1, s2, atol=0.01)
+    assert_eq(s1, s1_local)
 
     # Predictions should be roughly the same.
     assert_eq(p1, p1_local)
@@ -450,13 +451,8 @@ def test_regressor(output, client):
     assert np.min(pred_leaf_vals) >= 0
     assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']
 
-    # The checks below are skipped
-    # for the categorical data case because it's difficult to get
-    # a good fit from just categoricals for a regression problem
-    # with small data
-    if output != 'dataframe-with-categorical':
-        assert_eq(y, p1, rtol=1., atol=100.)
-        assert_eq(y, p2, rtol=1., atol=50.)
+    assert_eq(p1, y, rtol=0.5, atol=50.)
+    assert_eq(p2, y, rtol=0.5, atol=50.)
 
     # be sure LightGBM actually used at least one categorical column,
     # and that it was correctly treated as a categorical feature
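
The check introduced by that comment continues past the end of this excerpt. One way to confirm a categorical split using LightGBM's own introspection is to inspect the trained booster's tree table; this sketch assumes cat_col0 is a pandas 'category' column as in the _create_data sketch above, and is not the test's actual assertion:

import lightgbm as lgb

# X_df / y as in the earlier sketch: 'cat_col0' has dtype 'category', and the
# target is scaled differently for the two category levels
model = lgb.LGBMRegressor(n_estimators=20, num_leaves=31).fit(X_df, y)

tree_df = model.booster_.trees_to_dataframe()
cat_splits = tree_df[tree_df['split_feature'] == 'cat_col0']

# at least one node should split on the categorical column, and categorical
# splits are reported with the '==' decision type rather than a numeric threshold
assert len(cat_splits) > 0
assert (cat_splits['decision_type'] == '==').all()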
