From a08dda9388bd195b2bc34534c02e7c39f2a6176f Mon Sep 17 00:00:00 2001
From: Wenjun Si <wenjun.swj@alibaba-inc.com>
Date: Wed, 13 Oct 2021 18:06:29 +0800
Subject: [PATCH] Add `make_regression` support for learn module (#2515)

---
 mars/_version.py                              |   2 +-
 mars/learn/datasets/__init__.py               |   3 +-
 mars/learn/datasets/samples_generator.py      | 134 ++++++++++++++++++
 .../datasets/tests/test_samples_generator.py  |  49 ++++++-
 mars/learn/utils/multiclass.py                |   2 +-
 5 files changed, 186 insertions(+), 4 deletions(-)

diff --git a/mars/_version.py b/mars/_version.py
index c46da36ce0..a16b0f6f04 100644
--- a/mars/_version.py
+++ b/mars/_version.py
@@ -16,7 +16,7 @@
 import os
 from typing import NamedTuple, Optional
 
-version_info = (0, 8, 0, 'b2')
+version_info = (0, 8, 0, 'rc1')
 _num_index = max(idx if isinstance(v, int) else 0
                  for idx, v in enumerate(version_info))
 __version__ = '.'.join(map(str, version_info[:_num_index + 1])) + \
diff --git a/mars/learn/datasets/__init__.py b/mars/learn/datasets/__init__.py
index 7a1586fbc2..f8d1f46d86 100644
--- a/mars/learn/datasets/__init__.py
+++ b/mars/learn/datasets/__init__.py
@@ -12,4 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .samples_generator import make_classification, make_blobs, make_low_rank_matrix
+from .samples_generator import make_classification, make_regression, \
+    make_blobs, make_low_rank_matrix
diff --git a/mars/learn/datasets/samples_generator.py b/mars/learn/datasets/samples_generator.py
index 7ea6b07d0f..9ccb0719e3 100644
--- a/mars/learn/datasets/samples_generator.py
+++ b/mars/learn/datasets/samples_generator.py
@@ -247,6 +247,140 @@ def make_classification(n_samples=100, n_features=20, n_informative=2,
     return X, y
 
 
+def make_regression(
+    n_samples=100,
+    n_features=100,
+    *,
+    n_informative=10,
+    n_targets=1,
+    bias=0.0,
+    effective_rank=None,
+    tail_strength=0.5,
+    noise=0.0,
+    shuffle=True,
+    coef=False,
+    random_state=None,
+):
+    """Generate a random regression problem.
+
+    The input set can either be well conditioned (by default) or have a low
+    rank-fat tail singular profile. See :func:`make_low_rank_matrix` for
+    more details.
+
+    The output is generated by applying a (potentially biased) random linear
+    regression model with `n_informative` nonzero regressors to the previously
+    generated input and some gaussian centered noise with some adjustable
+    scale.
+
+    Read more in the :ref:`User Guide <sample_generators>`.
+
+    Parameters
+    ----------
+    n_samples : int, default=100
+        The number of samples.
+
+    n_features : int, default=100
+        The number of features.
+
+    n_informative : int, default=10
+        The number of informative features, i.e., the features given nonzero
+        coefficients in the linear model. Capped at `n_features`.
+
+    n_targets : int, default=1
+        The number of regression targets, i.e., the dimension of the y output
+        vector associated with a sample. By default, the output is a scalar.
+
+    bias : float, default=0.0
+        The bias term in the underlying linear model.
+
+    effective_rank : int or None, default=None
+        If not None:
+            The approximate number of singular vectors required to explain most
+            of the input data by linear combinations. Using this kind of
+            singular spectrum in the input allows the generator to reproduce
+            the correlations often observed in practice.
+        If None:
+            The input set is well conditioned, centered and gaussian with
+            unit variance.
+
+    tail_strength : float, default=0.5
+        The relative importance of the fat noisy tail of the singular values
+        profile if `effective_rank` is not None. When a float, it should be
+        between 0 and 1.
+
+    noise : float, default=0.0
+        The standard deviation of the gaussian noise applied to the output.
+
+    shuffle : bool, default=True
+        Shuffle the samples and the features.
+
+    coef : bool, default=False
+        If True, the coefficients of the underlying linear model are returned.
+
+    random_state : int, RandomState instance or None, default=None
+        Determines random number generation for dataset creation. Pass an int
+        for reproducible output across multiple function calls.
+        See :term:`Glossary <random_state>`.
+
+    Returns
+    -------
+    X : tensor of shape (n_samples, n_features)
+        The input samples.
+
+    y : tensor of shape (n_samples,) or (n_samples, n_targets)
+        The output values.
+
+    coef : tensor of shape (n_features,) or (n_features, n_targets)
+        The coefficient of the underlying linear model. It is returned only if
+        coef is True.
+    """
+    n_informative = min(n_features, n_informative)  # cannot exceed n_features
+    generator = check_random_state(random_state)
+
+    if effective_rank is None:
+        # Randomly generate a well conditioned input set
+        X = generator.randn(n_samples, n_features)
+
+    else:
+        # Randomly generate a low rank, fat tail input set
+        X = make_low_rank_matrix(
+            n_samples=n_samples,
+            n_features=n_features,
+            effective_rank=effective_rank,
+            tail_strength=tail_strength,
+            random_state=generator,
+        )
+
+    # Generate a ground truth model with only the first n_informative rows
+    # nonzero (the other features are not correlated to y and should be
+    # ignored by sparsifying regularizers such as L1 or elastic net)
+    ground_truth = mt.zeros((n_features, n_targets))
+    ground_truth[:n_informative, :] = 100 * generator.rand(n_informative, n_targets)
+
+    y = mt.dot(X, ground_truth) + bias
+
+    # Add noise
+    if noise > 0.0:
+        y += generator.normal(scale=noise, size=y.shape)
+
+    # Randomly permute samples and features
+    if shuffle:
+        X, y = util_shuffle(X, y, random_state=generator)
+
+        indices = mt.arange(n_features)
+        generator.shuffle(indices)
+        X[:, :] = X[:, indices]  # reorder the feature columns of X in place
+        ground_truth = ground_truth[indices]  # same order, so coef matches X
+
+    y = mt.squeeze(y)  # the sibling test expects shape (n_samples,) for one target
+
+    if coef:
+        return X, y, mt.squeeze(ground_truth)
+
+    else:
+        return X, y
+
+
 def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0,
                center_box=(-10.0, 10.0), shuffle=True, random_state=None):
     """Generate isotropic Gaussian blobs for clustering.
diff --git a/mars/learn/datasets/tests/test_samples_generator.py b/mars/learn/datasets/tests/test_samples_generator.py
index 3205c94577..06d8cd2389 100644
--- a/mars/learn/datasets/tests/test_samples_generator.py
+++ b/mars/learn/datasets/tests/test_samples_generator.py
@@ -22,7 +22,7 @@
 from .... import tensor as mt
 from ....tensor.linalg import svd
 from ..samples_generator import make_low_rank_matrix, \
-    make_classification, make_blobs
+    make_classification, make_regression, make_blobs
 
 
 def test_make_classification(setup):
@@ -134,6 +134,53 @@ def test_make_classification_informative_features(setup):
                   n_clusters_per_class=2)
 
 
+def test_make_regression(setup):
+    X, y, c = make_regression(
+        n_samples=100,
+        n_features=10,
+        n_informative=3,
+        effective_rank=5,
+        coef=True,
+        bias=0.0,
+        noise=1.0,
+        random_state=0,
+    )
+    X, y, c = mt.ExecutableTuple((X, y, c)).execute().fetch()
+
+    assert X.shape == (100, 10), "X shape mismatch"
+    assert y.shape == (100,), "y shape mismatch"
+    assert c.shape == (10,), "coef shape mismatch"
+    assert sum(c != 0.0) == 3, "Unexpected number of informative features"
+
+    # With bias=0.0 and noise=1.0, the residual y - np.dot(X, c) should be ~N(0, 1.0).
+    assert_almost_equal(np.std(y - np.dot(X, c)), 1.0, decimal=1)
+
+    # Test with fewer features than n_informative; only the shape attribute is checked.
+    X, y = make_regression(n_samples=100, n_features=1)  # n_informative defaults to 10
+    assert X.shape == (100, 1)
+
+
+def test_make_regression_multitarget(setup):  # same fixture as the sibling tests
+    X, y, c = make_regression(
+        n_samples=100,
+        n_features=10,
+        n_informative=3,
+        n_targets=3,
+        coef=True,
+        noise=1.0,
+        random_state=0,
+    )
+    X, y, c = mt.ExecutableTuple((X, y, c)).execute().fetch()
+
+    assert X.shape == (100, 10), "X shape mismatch"
+    assert y.shape == (100, 3), "y shape mismatch"
+    assert c.shape == (10, 3), "coef shape mismatch"
+    np.testing.assert_array_equal(sum(c != 0.0), 3, "Unexpected number of informative features")
+
+    # With the default bias and noise=1.0, y - np.dot(X, c) should have std ~= 1.0.
+    assert_almost_equal(np.std(y - np.dot(X, c)), 1.0, decimal=1)
+
+
 def test_make_blobs(setup):
     cluster_stds = np.array([0.05, 0.2, 0.4])
     cluster_centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
diff --git a/mars/learn/utils/multiclass.py b/mars/learn/utils/multiclass.py
index 98dcbfc9b0..3e10b0d11f 100644
--- a/mars/learn/utils/multiclass.py
+++ b/mars/learn/utils/multiclass.py
@@ -433,7 +433,7 @@ def check_classification_targets(y):
     def check(t):
         if t not in ['binary', 'multiclass', 'multiclass-multioutput',
                      'multilabel-indicator', 'multilabel-sequences']:
-            raise ValueError("Unknown label type: %r" % y_type)
+            raise ValueError("Unknown label type: %r" % t)
         return t
 
     y_type = y_type.map_chunk(check, dtype=y_type.dtype)