neurodata · adam2392 · Feb 23, 2024 · Feb 21, 2024 · Feb 21, 2024 · Feb 23, 2024
diff --git a/sktree/datasets/hyppo.py b/sktree/datasets/hyppo.py
@@ -1,3 +1,5 @@
+from typing import Optional
+
 import numpy as np
 from scipy.integrate import nquad
 from scipy.stats import entropy, multivariate_normal
@@ -74,11 +76,12 @@ def make_trunk_classification(
     n_dim=4096,
     n_informative=256,
     simulation: str = "trunk",
-    m_factor: int = -1,
+    mu_0: int = 0,
+    mu_1: int = 1,
     rho: int = 0,
     band_type: str = "ma",
     return_params: bool = False,
-    mix: float = 0.5,
+    mix: Optional[float] = None,
     seed=None,
 ):
     """Generate trunk and/or Marron-Wand datasets.
@@ -114,11 +117,14 @@ def make_trunk_classification(
         When calling the Marron-Wand simulations, only the covariance parameters are considered
         (`rho` and `band_type`). Means are taken from :footcite:`marron1992exact`.
         By default 'trunk'.
-    m_factor : int, optional
-        The multiplicative factor to apply to the mean-vector of the first
-        distribution to obtain the mean-vector of the second distribution.
-        This is only used when ``simulation = trunk``.
-        By default -1.
+    mu_0 : int, optional
+        The mean of the first distribution. By default 0. The mean of the distribution will decrease
+        by a factor of ``sqrt(i)`` for each dimension ``i``. Not used if simulation is
+        one of the Marron-Wand simulations, or 'trunk_overlap'.
+    mu_1 : int, optional
+        The mean of the second distribution. By default 1. The mean of the distribution will decrease
+        by a factor of ``sqrt(i)`` for each dimension ``i``. Not used if simulation is
+        one of the Marron-Wand simulations, or 'trunk_overlap'.
     rho : float, optional
         The covariance value of the bands. By default 0 indicating, an identity matrix is used.
     band_type : str
@@ -128,7 +134,7 @@ def make_trunk_classification(
         Whether or not to return the distribution parameters of the classes normal distributions.
     mix : int, optional
         The probabilities associated with the mixture of Gaussians in the ``trunk-mix`` simulation.
-        By default 0.5.
+        By default None. Required if ``simulation='trunk_mix'``; otherwise must be None.
     seed : int, optional
         Random seed, by default None.
 
@@ -153,6 +159,31 @@ def make_trunk_classification(
         The weight vector for the Marron-Wand simulations.
         Returned if ``return_params`` is True.
 
+    Notes
+    -----
+    **Trunk**: The trunk simulation decreases the signal-to-noise ratio as the dimensionality
+    increases. This is implemented by decreasing the mean of the distribution by a factor of
+    ``sqrt(i)`` for each dimension ``i``. Thus for instance if the means of distribution one
+    and two are 1 and -1 respectively, the means for the first dimension will be 1 and -1,
+    for the second dimension will be 1/sqrt(2) and -1/sqrt(2), and so on.
+
+    **Trunk Overlap**: The trunk overlap simulation generates two classes of data with the same
+    covariance matrix and mean vector of zeros.
+
+    **Trunk Mix**: The trunk mix simulation generates two classes of data with the same covariance
+    matrix. The first class (label 0) is generated from a multivariate Gaussian with a mean vector of
+    zeros and the second class is generated from a mixture of Gaussians with mean vectors
+    specified by ``mu_0`` and ``mu_1``. The mixture is specified by the ``mix`` parameter, which
+    is the probability of the first Gaussian in the mixture.
+
+    **Marron-Wand Simulations**: The Marron-Wand simulations generate two classes of data with the
+    setup specified in the paper.
+
+    Covariance: The covariance matrix among different dimensions is controlled by the ``rho`` parameter
+    and the ``band_type`` parameter. The ``band_type`` parameter controls the type of band to use, while
+    the ``rho`` parameter controls the specific scaling factor for the covariance matrix while going
+    from one dimension to the next.
+
     References
     ----------
     .. footbibliography::
@@ -162,10 +193,16 @@ def make_trunk_classification(
             f"Number of informative dimensions {n_informative} must be less than number "
             f"of dimensions, {n_dim}"
         )
+    if mix is not None and simulation != "trunk_mix":
+        raise ValueError(
+            f"Mix should not be specified when simulation is not 'trunk_mix'. Simulation is {simulation}."
+        )
+    if mix is None and simulation == "trunk_mix":
+        raise ValueError("Mix must be specified when simulation is 'trunk_mix'.")
     rng = np.random.default_rng(seed=seed)
 
-    mu_1 = np.array([1 / np.sqrt(i) for i in range(1, n_informative + 1)])
-    mu_0 = m_factor * mu_1
+    mu_1_vec = np.array([mu_1 / np.sqrt(i) for i in range(1, n_informative + 1)])
+    mu_0_vec = np.array([mu_0 / np.sqrt(i) for i in range(1, n_informative + 1)])
 
     if rho != 0:
         if band_type == "ma":
@@ -177,7 +214,7 @@ def make_trunk_classification(
     else:
         cov = np.identity(n_informative)
 
-    if mix < 0 or mix > 1:
+    if mix is not None and (mix < 0 or mix > 1):  # type: ignore
         raise ValueError("Mix must be between 0 and 1.")
 
     # speed up computations for large multivariate normal matrix with SVD approximation
@@ -189,8 +226,8 @@ def make_trunk_classification(
     if simulation == "trunk":
         X = np.vstack(
             (
-                rng.multivariate_normal(mu_1, cov, n_samples // 2, method=method),
-                rng.multivariate_normal(mu_0, cov, n_samples // 2, method=method),
+                rng.multivariate_normal(mu_1_vec, cov, n_samples // 2, method=method),
+                rng.multivariate_normal(mu_0_vec, cov, n_samples // 2, method=method),
             )
         )
     elif simulation == "trunk_overlap":
@@ -205,8 +242,11 @@ def make_trunk_classification(
             )
         )
     elif simulation == "trunk_mix":
-        mixture_idx = rng.choice(2, n_samples // 2, replace=True, shuffle=True, p=[mix, 1 - mix])
-        norm_params = [[mu_0, cov * (2 / 3) ** 2], [mu_1, cov * (2 / 3) ** 2]]
+        mixture_idx = rng.choice(2, n_samples // 2, replace=True, shuffle=True, p=[mix, 1 - mix])  # type: ignore
+
+        # When variance is 1, trunk-mix does not look bimodal at low dimensions.
+        # It is set to (2/3)**2 since that is consistent with Marron and Wand bimodal
+        norm_params = [[mu_0_vec, cov * (2 / 3) ** 2], [mu_1_vec, cov * (2 / 3) ** 2]]
         X_mixture = np.fromiter(
             (
                 rng.multivariate_normal(*(norm_params[i]), size=1, method=method)
@@ -268,10 +308,10 @@ def make_trunk_classification(
     if return_params:
         returns = [X, y]
         if simulation == "trunk":
-            returns += [[mu_0, mu_1], [cov, cov]]
-        elif simulation == "trunk-overlap":
+            returns += [[mu_0_vec, mu_1_vec], [cov, cov]]
+        elif simulation == "trunk_overlap":
             returns += [[np.zeros(n_informative), np.zeros(n_informative)], [cov, cov]]
-        elif simulation == "trunk-mix":
+        elif simulation == "trunk_mix":
             returns += [*list(zip(*norm_params)), X_mixture]
         else:
             returns += [*list(zip(*norm_params)), G, w]

diff --git a/sktree/datasets/tests/test_hyppo.py b/sktree/datasets/tests/test_hyppo.py
@@ -26,7 +26,7 @@ def test_make_trunk_classification_custom_parameters():
         n_samples=50,
         n_dim=5,
         n_informative=2,
-        m_factor=2,
+        mu_0=0,
         rho=0.5,
         band_type="ma",
         return_params=False,
@@ -55,8 +55,13 @@ def test_make_trunk_classification_autoregressive_cov():
 
 def test_make_trunk_classification_mixture():
     # Test with default parameters
-    X, y, _, _ = make_trunk_classification(
-        n_samples=100, n_dim=10, n_informative=5, mix=0.5, return_params=True
+    [X, y, _, _, _] = make_trunk_classification(
+        n_samples=100,
+        n_dim=10,
+        n_informative=5,
+        simulation="trunk_mix",
+        mix=0.5,
+        return_params=True,
     )
     assert X.shape == (100, 10), X.shape
     assert y.shape == (100,)
@@ -83,7 +88,7 @@ def test_make_trunk_classification_invalid_band_type():
 def test_make_trunk_classification_invalid_mix():
     # Test with an invalid band type
     with pytest.raises(ValueError, match="Mix must be between 0 and 1."):
-        make_trunk_classification(n_samples=50, rho=0.5, mix=2)
+        make_trunk_classification(n_samples=50, simulation="trunk_mix", rho=0.5, mix=2)
 
 
 def test_make_trunk_classification_invalid_n_informative():
@@ -98,6 +103,19 @@ def test_make_trunk_classification_invalid_simulation_name():
         make_trunk_classification(n_samples=50, rho=0.5, simulation=None)
 
 
+def test_make_trunk_classification_errors_trunk_mix():
+    # test with mix but not trunk_mix
+    with pytest.raises(
+        ValueError,
+        match="Mix should not be specified when simulation is not 'trunk_mix'. Simulation is trunk.",
+    ):
+        make_trunk_classification(n_samples=2, simulation="trunk", mix=0.5)
+
+    # test without mix but trunk_mix
+    with pytest.raises(ValueError, match="Mix must be specified when simulation is 'trunk_mix'."):
+        make_trunk_classification(n_samples=2, simulation="trunk_mix")
+
+
 @pytest.mark.parametrize(
     "simulation", ["trunk", "trunk_overlap", "trunk_mix", *MARRON_WAND_SIMS.keys()]
 )
@@ -106,11 +124,16 @@ def test_make_trunk_classification_simulations(simulation):
     n_samples = 100
     n_dim = 10
     n_informative = 10
+    if simulation == "trunk_mix":
+        mix = 0.5
+    else:
+        mix = None
     X, y = make_trunk_classification(
         n_samples=n_samples,
         n_dim=n_dim,
         n_informative=n_informative,
         simulation=simulation,
+        mix=mix,
     )
     assert X.shape == (n_samples, n_dim)
     assert y.shape == (n_samples,)