Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MAINT, API Change trunk-classification into three separate functions for generating trunk, trunk-mix, trunk-overlap and marron-wand #227

Merged
merged 8 commits into from
Feb 23, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 58 additions & 18 deletions sktree/datasets/hyppo.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Optional

import numpy as np
from scipy.integrate import nquad
from scipy.stats import entropy, multivariate_normal
Expand Down Expand Up @@ -74,11 +76,12 @@ def make_trunk_classification(
n_dim=4096,
n_informative=256,
simulation: str = "trunk",
m_factor: int = -1,
mu_0: int = 0,
mu_1: int = 1,
rho: int = 0,
band_type: str = "ma",
return_params: bool = False,
mix: float = 0.5,
mix: Optional[float] = None,
seed=None,
):
"""Generate trunk and/or Marron-Wand datasets.
Expand Down Expand Up @@ -114,11 +117,14 @@ def make_trunk_classification(
When calling the Marron-Wand simulations, only the covariance parameters are considered
(`rho` and `band_type`). Means are taken from :footcite:`marron1992exact`.
By default 'trunk'.
m_factor : int, optional
The multiplicative factor to apply to the mean-vector of the first
distribution to obtain the mean-vector of the second distribution.
This is only used when ``simulation = trunk``.
By default -1.
mu_0 : int, optional
The mean of the first distribution. By default 0. The mean of the distribution will decrease
by a factor of ``sqrt(i)`` for each dimension ``i``. Not used if simulation is
one of the Marron-Wand simulations, or 'trunk_overlap'.
mu_1 : int, optional
The mean of the second distribution. By default 1. The mean of the distribution will decrease
by a factor of ``sqrt(i)`` for each dimension ``i``. Not used if simulation is
one of the Marron-Wand simulations, or 'trunk_overlap'.
rho : float, optional
The covariance value of the bands. By default 0, indicating that an identity matrix is used.
band_type : str
Expand All @@ -128,7 +134,7 @@ def make_trunk_classification(
Whether or not to return the distribution parameters of the classes normal distributions.
mix : float, optional
The probabilities associated with the mixture of Gaussians in the ``trunk-mix`` simulation.
By default 0.5.
By default None. Must be specified if ``simulation = trunk_mix``. Otherwise, it is ignored.
seed : int, optional
Random seed, by default None.

Expand All @@ -153,6 +159,31 @@ def make_trunk_classification(
The weight vector for the Marron-Wand simulations.
Returned if ``return_params`` is True.

Notes
-----
**Trunk**: The trunk simulation decreases the signal-to-noise ratio as the dimensionality
increases. This is implemented by decreasing the mean of the distribution by a factor of
``sqrt(i)`` for each dimension ``i``. Thus for instance if the means of distribution one
and two are 1 and -1 respectively, the means for the first dimension will be 1 and -1,
for the second dimension will be 1/sqrt(2) and -1/sqrt(2), and so on.

**Trunk Overlap**: The trunk overlap simulation generates two classes of data with the same
covariance matrix and mean vector of zeros.

**Trunk Mix**: The trunk mix simulation generates two classes of data with the same covariance
matrix. The first class (label 0) is generated from a multivariate Gaussian with a mean vector of
zeros and the second class is generated from a mixture of Gaussians with mean vectors
specified by ``mu_0`` and ``mu_1``. The mixture is specified by the ``mix`` parameter, which
is the probability of the first Gaussian in the mixture.

**Marron-Wand Simulations**: The Marron-Wand simulations generate two classes of data with the
setup specified in the paper.

Covariance: The covariance matrix among different dimensions is controlled by the ``rho`` parameter
and the ``band_type`` parameter. The ``band_type`` parameter controls the type of band to use, while
the ``rho`` parameter controls the specific scaling factor for the covariance matrix while going
from one dimension to the next.

References
----------
.. footbibliography::
Expand All @@ -162,10 +193,16 @@ def make_trunk_classification(
f"Number of informative dimensions {n_informative} must be less than number "
f"of dimensions, {n_dim}"
)
if mix is not None and simulation != "trunk_mix":
raise ValueError(
f"Mix should not be specified when simulation is not 'trunk_mix'. Simulation is {simulation}."
)
if mix is None and simulation == "trunk_mix":
raise ValueError("Mix must be specified when simulation is 'trunk_mix'.")
rng = np.random.default_rng(seed=seed)

mu_1 = np.array([1 / np.sqrt(i) for i in range(1, n_informative + 1)])
mu_0 = m_factor * mu_1
mu_1_vec = np.array([mu_1 / np.sqrt(i) for i in range(1, n_informative + 1)])
mu_0_vec = np.array([mu_0 / np.sqrt(i) for i in range(1, n_informative + 1)])

if rho != 0:
if band_type == "ma":
Expand All @@ -177,7 +214,7 @@ def make_trunk_classification(
else:
cov = np.identity(n_informative)

if mix < 0 or mix > 1:
if mix is not None and (mix < 0 or mix > 1): # type: ignore
raise ValueError("Mix must be between 0 and 1.")

# speed up computations for large multivariate normal matrix with SVD approximation
Expand All @@ -189,8 +226,8 @@ def make_trunk_classification(
if simulation == "trunk":
X = np.vstack(
(
rng.multivariate_normal(mu_1, cov, n_samples // 2, method=method),
rng.multivariate_normal(mu_0, cov, n_samples // 2, method=method),
rng.multivariate_normal(mu_1_vec, cov, n_samples // 2, method=method),
rng.multivariate_normal(mu_0_vec, cov, n_samples // 2, method=method),
)
)
elif simulation == "trunk_overlap":
Expand All @@ -205,8 +242,11 @@ def make_trunk_classification(
)
)
elif simulation == "trunk_mix":
mixture_idx = rng.choice(2, n_samples // 2, replace=True, shuffle=True, p=[mix, 1 - mix])
norm_params = [[mu_0, cov * (2 / 3) ** 2], [mu_1, cov * (2 / 3) ** 2]]
mixture_idx = rng.choice(2, n_samples // 2, replace=True, shuffle=True, p=[mix, 1 - mix]) # type: ignore

# When variance is 1, trunk-mix does not look bimodal at low dimensions.
# It is therefore set to (2/3)**2, which is consistent with the Marron-Wand bimodal simulation.
norm_params = [[mu_0_vec, cov * (2 / 3) ** 2], [mu_1_vec, cov * (2 / 3) ** 2]]
X_mixture = np.fromiter(
(
rng.multivariate_normal(*(norm_params[i]), size=1, method=method)
Expand Down Expand Up @@ -268,10 +308,10 @@ def make_trunk_classification(
if return_params:
returns = [X, y]
if simulation == "trunk":
returns += [[mu_0, mu_1], [cov, cov]]
elif simulation == "trunk-overlap":
returns += [[mu_0_vec, mu_1_vec], [cov, cov]]
elif simulation == "trunk_overlap":
returns += [[np.zeros(n_informative), np.zeros(n_informative)], [cov, cov]]
elif simulation == "trunk-mix":
elif simulation == "trunk_mix":
returns += [*list(zip(*norm_params)), X_mixture]
else:
returns += [*list(zip(*norm_params)), G, w]
Expand Down
31 changes: 27 additions & 4 deletions sktree/datasets/tests/test_hyppo.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def test_make_trunk_classification_custom_parameters():
n_samples=50,
n_dim=5,
n_informative=2,
m_factor=2,
mu_0=0,
rho=0.5,
band_type="ma",
return_params=False,
Expand Down Expand Up @@ -55,8 +55,13 @@ def test_make_trunk_classification_autoregressive_cov():

def test_make_trunk_classification_mixture():
# Test the mixture simulation with an explicit mix value
X, y, _, _ = make_trunk_classification(
n_samples=100, n_dim=10, n_informative=5, mix=0.5, return_params=True
[X, y, _, _, _] = make_trunk_classification(
n_samples=100,
n_dim=10,
n_informative=5,
simulation="trunk_mix",
mix=0.5,
return_params=True,
)
assert X.shape == (100, 10), X.shape
assert y.shape == (100,)
Expand All @@ -83,7 +88,7 @@ def test_make_trunk_classification_invalid_band_type():
def test_make_trunk_classification_invalid_mix():
# Test with an invalid mix value
with pytest.raises(ValueError, match="Mix must be between 0 and 1."):
make_trunk_classification(n_samples=50, rho=0.5, mix=2)
make_trunk_classification(n_samples=50, simulation="trunk_mix", rho=0.5, mix=2)


def test_make_trunk_classification_invalid_n_informative():
Expand All @@ -98,6 +103,19 @@ def test_make_trunk_classification_invalid_simulation_name():
make_trunk_classification(n_samples=50, rho=0.5, simulation=None)


def test_make_trunk_classification_errors_trunk_mix():
    """Check that ``mix`` and ``simulation`` are validated against each other.

    ``make_trunk_classification`` must raise ``ValueError`` when ``mix`` is
    given for a simulation other than ``'trunk_mix'``, and when ``mix`` is
    omitted for ``'trunk_mix'``.
    """
    import re  # local import: only needed to escape the expected messages

    # ``pytest.raises(match=...)`` interprets its argument as a regular
    # expression, so the literal messages are escaped to keep characters such
    # as "." from matching arbitrary text.

    # test with mix but not trunk_mix
    with pytest.raises(
        ValueError,
        match=re.escape(
            "Mix should not be specified when simulation is not 'trunk_mix'. Simulation is trunk."
        ),
    ):
        make_trunk_classification(n_samples=2, simulation="trunk", mix=0.5)

    # test without mix but trunk_mix
    with pytest.raises(
        ValueError,
        match=re.escape("Mix must be specified when simulation is 'trunk_mix'."),
    ):
        make_trunk_classification(n_samples=2, simulation="trunk_mix")


@pytest.mark.parametrize(
"simulation", ["trunk", "trunk_overlap", "trunk_mix", *MARRON_WAND_SIMS.keys()]
)
Expand All @@ -106,11 +124,16 @@ def test_make_trunk_classification_simulations(simulation):
n_samples = 100
n_dim = 10
n_informative = 10
if simulation == "trunk_mix":
mix = 0.5
else:
mix = None
X, y = make_trunk_classification(
n_samples=n_samples,
n_dim=n_dim,
n_informative=n_informative,
simulation=simulation,
mix=mix,
)
assert X.shape == (n_samples, n_dim)
assert y.shape == (n_samples,)
Expand Down
Loading