From 3a0cc17fb6086a44a750404b8d89ee98300f1c91 Mon Sep 17 00:00:00 2001
From: Wenjun Si
Date: Wed, 13 Oct 2021 18:06:29 +0800
Subject: [PATCH] Add `make_regression` support for learn module (#2515)

Add `make_regression` to `mars.learn.datasets`. It generates a random
linear regression problem (well conditioned or low-rank inputs, optional
Gaussian noise, optional shuffling of samples and features) and can also
return the coefficients of the underlying linear model.

Also:
- list the new function in the learn reference docs;
- add single-target and multi-target tests;
- format the offending label `t` instead of the tensor `y_type` in the
  error raised by `check_classification_targets`;
- bump the version to 0.7.5.

---
 docs/source/reference/learn/reference.rst     |   1 +
 mars/_version.py                              |   2 +-
 mars/learn/datasets/__init__.py               |   3 +-
 mars/learn/datasets/samples_generator.py      | 134 ++++++++++++++++++
 .../datasets/tests/test_samples_generator.py  |  49 ++++++-
 mars/learn/utils/multiclass.py                |   2 +-
 6 files changed, 187 insertions(+), 4 deletions(-)

diff --git a/docs/source/reference/learn/reference.rst b/docs/source/reference/learn/reference.rst
index 705a52bba2..3c43a37e83 100644
--- a/docs/source/reference/learn/reference.rst
+++ b/docs/source/reference/learn/reference.rst
@@ -58,6 +58,7 @@ Samples generator
    datasets.make_blobs
    datasets.make_classification
    datasets.make_low_rank_matrix
+   datasets.make_regression
 
 .. _decomposition_ref:
 
diff --git a/mars/_version.py b/mars/_version.py
index 9091fd3b60..125b8c91ca 100644
--- a/mars/_version.py
+++ b/mars/_version.py
@@ -16,7 +16,7 @@
 import os
 from typing import NamedTuple, Optional
 
-version_info = (0, 7, 4)
+version_info = (0, 7, 5)
 _num_index = max(idx if isinstance(v, int) else 0
                  for idx, v in enumerate(version_info))
 __version__ = '.'.join(map(str, version_info[:_num_index + 1])) + \
diff --git a/mars/learn/datasets/__init__.py b/mars/learn/datasets/__init__.py
index 7a1586fbc2..f8d1f46d86 100644
--- a/mars/learn/datasets/__init__.py
+++ b/mars/learn/datasets/__init__.py
@@ -12,4 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .samples_generator import make_classification, make_blobs, make_low_rank_matrix
+from .samples_generator import make_classification, make_regression, \
+    make_blobs, make_low_rank_matrix
diff --git a/mars/learn/datasets/samples_generator.py b/mars/learn/datasets/samples_generator.py
index 7ea6b07d0f..9ccb0719e3 100644
--- a/mars/learn/datasets/samples_generator.py
+++ b/mars/learn/datasets/samples_generator.py
@@ -247,6 +247,140 @@ def make_classification(n_samples=100, n_features=20, n_informative=2,
     return X, y
 
 
+def make_regression(
+    n_samples=100,
+    n_features=100,
+    *,
+    n_informative=10,
+    n_targets=1,
+    bias=0.0,
+    effective_rank=None,
+    tail_strength=0.5,
+    noise=0.0,
+    shuffle=True,
+    coef=False,
+    random_state=None,
+):
+    """Generate a random regression problem.
+
+    The input set can either be well conditioned (by default) or have a low
+    rank-fat tail singular profile. See :func:`make_low_rank_matrix` for
+    more details.
+
+    The output is generated by applying a (potentially biased) random linear
+    regression model with `n_informative` nonzero regressors to the previously
+    generated input and some gaussian centered noise with some adjustable
+    scale.
+
+    Read more in the :ref:`User Guide <sample_generators>`.
+
+    Parameters
+    ----------
+    n_samples : int, default=100
+        The number of samples.
+
+    n_features : int, default=100
+        The number of features.
+
+    n_informative : int, default=10
+        The number of informative features, i.e., the number of features used
+        to build the linear model used to generate the output.
+
+    n_targets : int, default=1
+        The number of regression targets, i.e., the dimension of the y output
+        vector associated with a sample. By default, the output is a scalar.
+
+    bias : float, default=0.0
+        The bias term in the underlying linear model.
+
+    effective_rank : int, default=None
+        if not None:
+            The approximate number of singular vectors required to explain most
+            of the input data by linear combinations. Using this kind of
+            singular spectrum in the input allows the generator to reproduce
+            the correlations often observed in practice.
+        if None:
+            The input set is well conditioned, centered and gaussian with
+            unit variance.
+
+    tail_strength : float, default=0.5
+        The relative importance of the fat noisy tail of the singular values
+        profile if `effective_rank` is not None. When a float, it should be
+        between 0 and 1.
+
+    noise : float, default=0.0
+        The standard deviation of the gaussian noise applied to the output.
+
+    shuffle : bool, default=True
+        Shuffle the samples and the features.
+
+    coef : bool, default=False
+        If True, the coefficients of the underlying linear model are returned.
+
+    random_state : int, RandomState instance or None, default=None
+        Determines random number generation for dataset creation. Pass an int
+        for reproducible output across multiple function calls.
+        See :term:`Glossary <random_state>`.
+
+    Returns
+    -------
+    X : tensor of shape (n_samples, n_features)
+        The input samples.
+
+    y : tensor of shape (n_samples,) or (n_samples, n_targets)
+        The output values.
+
+    coef : tensor of shape (n_features,) or (n_features, n_targets)
+        The coefficient of the underlying linear model. It is returned only if
+        coef is True.
+    """
+    n_informative = min(n_features, n_informative)
+    generator = check_random_state(random_state)
+
+    if effective_rank is None:
+        # Randomly generate a well conditioned input set
+        X = generator.randn(n_samples, n_features)
+
+    else:
+        # Randomly generate a low rank, fat tail input set
+        X = make_low_rank_matrix(
+            n_samples=n_samples,
+            n_features=n_features,
+            effective_rank=effective_rank,
+            tail_strength=tail_strength,
+            random_state=generator,
+        )
+
+    # Generate a ground truth model with only n_informative features being non
+    # zeros (the other features are not correlated to y and should be ignored
+    # by a sparsifying regularizers such as L1 or elastic net)
+    ground_truth = mt.zeros((n_features, n_targets))
+    ground_truth[:n_informative, :] = 100 * generator.rand(n_informative, n_targets)
+
+    y = mt.dot(X, ground_truth) + bias
+
+    # Add noise
+    if noise > 0.0:
+        y += generator.normal(scale=noise, size=y.shape)
+
+    # Randomly permute samples and features
+    if shuffle:
+        X, y = util_shuffle(X, y, random_state=generator)
+
+        indices = mt.arange(n_features)
+        generator.shuffle(indices)
+        X[:, :] = X[:, indices]
+        ground_truth = ground_truth[indices]
+
+    y = mt.squeeze(y)
+
+    if coef:
+        return X, y, mt.squeeze(ground_truth)
+
+    else:
+        return X, y
+
+
 def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0,
                center_box=(-10.0, 10.0), shuffle=True, random_state=None):
     """Generate isotropic Gaussian blobs for clustering.
diff --git a/mars/learn/datasets/tests/test_samples_generator.py b/mars/learn/datasets/tests/test_samples_generator.py
index 3205c94577..06d8cd2389 100644
--- a/mars/learn/datasets/tests/test_samples_generator.py
+++ b/mars/learn/datasets/tests/test_samples_generator.py
@@ -22,7 +22,7 @@
 from .... import tensor as mt
 from ....tensor.linalg import svd
 from ..samples_generator import make_low_rank_matrix, \
-    make_classification, make_blobs
+    make_classification, make_regression, make_blobs
 
 
 def test_make_classification(setup):
@@ -134,6 +134,53 @@ def test_make_classification_informative_features(setup):
              n_clusters_per_class=2)
 
 
+def test_make_regression(setup):
+    X, y, c = make_regression(
+        n_samples=100,
+        n_features=10,
+        n_informative=3,
+        effective_rank=5,
+        coef=True,
+        bias=0.0,
+        noise=1.0,
+        random_state=0,
+    )
+    X, y, c = mt.ExecutableTuple((X, y, c)).execute().fetch()
+
+    assert X.shape == (100, 10), "X shape mismatch"
+    assert y.shape == (100,), "y shape mismatch"
+    assert c.shape == (10,), "coef shape mismatch"
+    assert sum(c != 0.0) == 3, "Unexpected number of informative features"
+
+    # Test that y ~= np.dot(X, c) + bias + N(0, 1.0).
+    assert_almost_equal(np.std(y - np.dot(X, c)), 1.0, decimal=1)
+
+    # Test with small number of features.
+    X, y = make_regression(n_samples=100, n_features=1)  # n_informative=3
+    assert X.shape == (100, 1)
+
+
+def test_make_regression_multitarget(setup):
+    X, y, c = make_regression(
+        n_samples=100,
+        n_features=10,
+        n_informative=3,
+        n_targets=3,
+        coef=True,
+        noise=1.0,
+        random_state=0,
+    )
+    X, y, c = mt.ExecutableTuple((X, y, c)).execute().fetch()
+
+    assert X.shape == (100, 10), "X shape mismatch"
+    assert y.shape == (100, 3), "y shape mismatch"
+    assert c.shape == (10, 3), "coef shape mismatch"
+    np.testing.assert_array_equal(sum(c != 0.0), 3, "Unexpected number of informative features")
+
+    # Test that y ~= np.dot(X, c) + bias + N(0, 1.0)
+    assert_almost_equal(np.std(y - np.dot(X, c)), 1.0, decimal=1)
+
+
 def test_make_blobs(setup):
     cluster_stds = np.array([0.05, 0.2, 0.4])
     cluster_centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
diff --git a/mars/learn/utils/multiclass.py b/mars/learn/utils/multiclass.py
index 98dcbfc9b0..3e10b0d11f 100644
--- a/mars/learn/utils/multiclass.py
+++ b/mars/learn/utils/multiclass.py
@@ -433,7 +433,7 @@ def check_classification_targets(y):
     def check(t):
         if t not in ['binary', 'multiclass', 'multiclass-multioutput',
                      'multilabel-indicator', 'multilabel-sequences']:
-            raise ValueError("Unknown label type: %r" % y_type)
+            raise ValueError("Unknown label type: %r" % t)
         return t
 
     y_type = y_type.map_chunk(check, dtype=y_type.dtype)