Add make_regression support for learn module (#2515)
wjsi authored Oct 13, 2021
1 parent 3b61957 commit a08dda9
Showing 5 changed files with 186 additions and 4 deletions.
2 changes: 1 addition & 1 deletion mars/_version.py
@@ -16,7 +16,7 @@
import os
from typing import NamedTuple, Optional

-version_info = (0, 8, 0, 'b2')
+version_info = (0, 8, 0, 'rc1')
_num_index = max(idx if isinstance(v, int) else 0
                 for idx, v in enumerate(version_info))
__version__ = '.'.join(map(str, version_info[:_num_index + 1])) + \
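For context, the expression above keeps only the numeric components of `version_info` when building `__version__`: `_num_index` evaluates to the index of the last integer in the tuple. The continuation after `+ \` is cut off by the diff and is not reproduced here; a standalone sketch of just the numeric part:

# Worked example of the version logic above under the new tuple;
# only the numeric join is shown, since the diff elides the rest.
version_info = (0, 8, 0, 'rc1')
_num_index = max(idx if isinstance(v, int) else 0
                 for idx, v in enumerate(version_info))
# version_info[:_num_index + 1] -> (0, 8, 0)
print('.'.join(map(str, version_info[:_num_index + 1])))  # 0.8.0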
3 changes: 2 additions & 1 deletion mars/learn/datasets/__init__.py
@@ -12,4 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-from .samples_generator import make_classification, make_blobs, make_low_rank_matrix
+from .samples_generator import make_classification, make_regression, \
+    make_blobs, make_low_rank_matrix
134 changes: 134 additions & 0 deletions mars/learn/datasets/samples_generator.py
@@ -247,6 +247,140 @@ def make_classification(n_samples=100, n_features=20, n_informative=2,
    return X, y


def make_regression(
    n_samples=100,
    n_features=100,
    *,
    n_informative=10,
    n_targets=1,
    bias=0.0,
    effective_rank=None,
    tail_strength=0.5,
    noise=0.0,
    shuffle=True,
    coef=False,
    random_state=None,
):
    """Generate a random regression problem.

    The input set can either be well conditioned (by default) or have a low
    rank-fat tail singular profile. See :func:`make_low_rank_matrix` for
    more details.

    The output is generated by applying a (potentially biased) random linear
    regression model with `n_informative` nonzero regressors to the
    previously generated input and some gaussian centered noise with some
    adjustable scale.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, default=100
        The number of samples.
    n_features : int, default=100
        The number of features.
    n_informative : int, default=10
        The number of informative features, i.e., the number of features used
        to build the linear model used to generate the output.
    n_targets : int, default=1
        The number of regression targets, i.e., the dimension of the y output
        vector associated with a sample. By default, the output is a scalar.
    bias : float, default=0.0
        The bias term in the underlying linear model.
    effective_rank : int, default=None
        If not None:
            The approximate number of singular vectors required to explain
            most of the input data by linear combinations. Using this kind
            of singular spectrum in the input allows the generator to
            reproduce the correlations often observed in practice.
        If None:
            The input set is well conditioned, centered and gaussian with
            unit variance.
    tail_strength : float, default=0.5
        The relative importance of the fat noisy tail of the singular values
        profile if `effective_rank` is not None. When a float, it should be
        between 0 and 1.
    noise : float, default=0.0
        The standard deviation of the gaussian noise applied to the output.
    shuffle : bool, default=True
        Shuffle the samples and the features.
    coef : bool, default=False
        If True, the coefficients of the underlying linear model are
        returned.
    random_state : int, RandomState instance or None, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    X : tensor of shape (n_samples, n_features)
        The input samples.
    y : tensor of shape (n_samples,) or (n_samples, n_targets)
        The output values.
    coef : tensor of shape (n_features,) or (n_features, n_targets)
        The coefficient of the underlying linear model. It is returned only
        if coef is True.
    """
    n_informative = min(n_features, n_informative)
    generator = check_random_state(random_state)

    if effective_rank is None:
        # Randomly generate a well conditioned input set
        X = generator.randn(n_samples, n_features)

    else:
        # Randomly generate a low rank, fat tail input set
        X = make_low_rank_matrix(
            n_samples=n_samples,
            n_features=n_features,
            effective_rank=effective_rank,
            tail_strength=tail_strength,
            random_state=generator,
        )

    # Generate a ground truth model with only n_informative features being
    # non-zero (the other features are not correlated to y and should be
    # ignored by a sparsifying regularizer such as L1 or elastic net)
    ground_truth = mt.zeros((n_features, n_targets))
    ground_truth[:n_informative, :] = 100 * generator.rand(n_informative, n_targets)

    y = mt.dot(X, ground_truth) + bias

    # Add noise
    if noise > 0.0:
        y += generator.normal(scale=noise, size=y.shape)

    # Randomly permute samples and features
    if shuffle:
        X, y = util_shuffle(X, y, random_state=generator)

        indices = mt.arange(n_features)
        generator.shuffle(indices)
        X[:, :] = X[:, indices]
        ground_truth = ground_truth[indices]

    y = mt.squeeze(y)

    if coef:
        return X, y, mt.squeeze(ground_truth)

    else:
        return X, y


def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0,
               center_box=(-10.0, 10.0), shuffle=True, random_state=None):
    """Generate isotropic Gaussian blobs for clustering.
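For reference, a minimal usage sketch of the new API, distilled from the tests added in this commit. It assumes a default local Mars session so that `.execute()` can run without extra configuration; as in the tests, `mt.ExecutableTuple` batches the three lazy tensors into a single execution rather than three separate graph runs.

import numpy as np
from mars import tensor as mt
from mars.learn.datasets import make_regression

# Lazily build a regression problem where 3 of the 10 features carry signal.
X, y, coef = make_regression(n_samples=100, n_features=10, n_informative=3,
                             coef=True, noise=1.0, random_state=0)

# Execute all three tensors in one pass and pull the concrete results.
X, y, coef = mt.ExecutableTuple((X, y, coef)).execute().fetch()

print(X.shape, y.shape, coef.shape)  # (100, 10) (100,) (10,)
print(int(np.sum(coef != 0.0)))      # 3 informative coefficients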
49 changes: 48 additions & 1 deletion mars/learn/datasets/tests/test_samples_generator.py
@@ -22,7 +22,7 @@
from .... import tensor as mt
from ....tensor.linalg import svd
from ..samples_generator import make_low_rank_matrix, \
-    make_classification, make_blobs
+    make_classification, make_regression, make_blobs


def test_make_classification(setup):
@@ -134,6 +134,53 @@ def test_make_classification_informative_features(setup):
                            n_clusters_per_class=2)


def test_make_regression(setup):
    X, y, c = make_regression(
        n_samples=100,
        n_features=10,
        n_informative=3,
        effective_rank=5,
        coef=True,
        bias=0.0,
        noise=1.0,
        random_state=0,
    )
    X, y, c = mt.ExecutableTuple((X, y, c)).execute().fetch()

    assert X.shape == (100, 10), "X shape mismatch"
    assert y.shape == (100,), "y shape mismatch"
    assert c.shape == (10,), "coef shape mismatch"
    assert sum(c != 0.0) == 3, "Unexpected number of informative features"

    # Test that y ~= np.dot(X, c) + bias + N(0, 1.0).
    assert_almost_equal(np.std(y - np.dot(X, c)), 1.0, decimal=1)

    # Test with small number of features.
    X, y = make_regression(n_samples=100, n_features=1)  # n_informative=3
    assert X.shape == (100, 1)

def test_make_regression_multitarget(setup):
    X, y, c = make_regression(
        n_samples=100,
        n_features=10,
        n_informative=3,
        n_targets=3,
        coef=True,
        noise=1.0,
        random_state=0,
    )
    X, y, c = mt.ExecutableTuple((X, y, c)).execute().fetch()

    assert X.shape == (100, 10), "X shape mismatch"
    assert y.shape == (100, 3), "y shape mismatch"
    assert c.shape == (10, 3), "coef shape mismatch"
    np.testing.assert_array_equal(sum(c != 0.0), 3,
                                  "Unexpected number of informative features")

    # Test that y ~= np.dot(X, c) + bias + N(0, 1.0)
    assert_almost_equal(np.std(y - np.dot(X, c)), 1.0, decimal=1)


def test_make_blobs(setup):
    cluster_stds = np.array([0.05, 0.2, 0.4])
    cluster_centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
2 changes: 1 addition & 1 deletion mars/learn/utils/multiclass.py
@@ -433,7 +433,7 @@ def check_classification_targets(y):
    def check(t):
        if t not in ['binary', 'multiclass', 'multiclass-multioutput',
                     'multilabel-indicator', 'multilabel-sequences']:
-            raise ValueError("Unknown label type: %r" % y_type)
+            raise ValueError("Unknown label type: %r" % t)
        return t

    y_type = y_type.map_chunk(check, dtype=y_type.dtype)
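A note on this one-line fix: `check` runs once per chunk through `map_chunk`, so its argument `t` holds the concrete label type computed for that chunk, while `y_type` in the enclosing scope is the lazy tensor being mapped; the old message therefore interpolated the tensor object into the error instead of the offending label type.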
