diff --git a/doc/api.rst b/doc/api.rst index 793adb7a5..f4ef89aed 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -168,6 +168,8 @@ those offered in scikit-learn. make_joint_factor_model make_quadratic_classification make_trunk_classification + make_trunk_mixture_classification + make_marron_wand_classification approximate_clf_mutual_information approximate_clf_mutual_information_with_monte_carlo diff --git a/doc/whats_new/v0.7.rst b/doc/whats_new/v0.7.rst index cc389e17e..1435da20d 100644 --- a/doc/whats_new/v0.7.rst +++ b/doc/whats_new/v0.7.rst @@ -27,6 +27,9 @@ Changelog by `Sambit Panda`_ (:pr:`#203`) - |Feature| :func:`sktree.stats.build_coleman_forest` and :func:`sktree.stats.build_permutation_forest` are added to compute p-values given an estimator and permutation-estimator, `Adam Li`_ (:pr:`#222`) +- |API| The trunk mixture and Marron-Wand simulations of :func:`sktree.datasets.make_trunk_classification` + are separated out into :func:`sktree.datasets.make_marron_wand_classification` and + :func:`sktree.datasets.make_trunk_mixture_classification`, `Adam Li`_ (:pr:`#227`) - |API| :class:`sktree.HonestForestClassifier` and :class:`sktree.tree.HonestTreeClassifier` now overwrite all parameters set by the underlying ``tree_estimator`` and allow you to directly pass any extra parameters that ``tree_estimator`` has compared to the original diff --git a/sktree/datasets/__init__.py b/sktree/datasets/__init__.py index cc2df6d84..29830b80a 100644 --- a/sktree/datasets/__init__.py +++ b/sktree/datasets/__init__.py @@ -1,7 +1,9 @@ from .hyppo import ( approximate_clf_mutual_information, approximate_clf_mutual_information_with_monte_carlo, + make_marron_wand_classification, make_quadratic_classification, make_trunk_classification, + make_trunk_mixture_classification, ) from .multiview import make_gaussian_mixture, make_joint_factor_model diff --git a/sktree/datasets/hyppo.py b/sktree/datasets/hyppo.py index 51ce32ea0..629178dbd 100644 --- a/sktree/datasets/hyppo.py 
+++ b/sktree/datasets/hyppo.py @@ -69,26 +69,23 @@ def make_quadratic_classification(n_samples: int, n_features: int, noise=False, } -def make_trunk_classification( +def make_marron_wand_classification( n_samples, n_dim=4096, n_informative=256, - simulation: str = "trunk", - m_factor: int = -1, + simulation: str = "gaussian", rho: int = 0, band_type: str = "ma", return_params: bool = False, - mix: float = 0.5, seed=None, ): - """Generate trunk and/or Marron-Wand datasets. - - For each dimension in the first distribution, there is a mean of :math:`1 / d`, where - ``d`` is the dimensionality. The covariance is the identity matrix. - The second distribution has a mean vector that is the negative of the first. - As ``d`` increases, the two distributions become closer and closer. + """Generate Marron-Wand binary classification dataset. - Full details for the trunk simulation can be found in :footcite:`trunk1982`. + The simulation is similar to that of :func:`sktree.datasets.make_trunk_classification` + where the first class is generated from a multivariate-Gaussians with mean vector of + 0's. The second class is generated from a mixture of Gaussians with mean vectors + specified by the Marron-Wand simulations, but as the dimensionality increases, the second + class distribution approaches the first class distribution by a factor of :math:`1 / sqrt(d)`. Full details for the Marron-Wand simulations can be found in :footcite:`marron1992exact`. @@ -106,19 +103,174 @@ def make_trunk_classification( The informative dimensions. All others for ``n_dim - n_informative`` are Gaussian noise. Default is 256. simulation : str, optional - Which simulation to run. Must be: 'trunk', 'trunk_overlap', 'trunk_mix', or one of the + Which simulation to run. 
Must be one of the following Marron-Wand simulations: 'gaussian', 'skewed_unimodal', 'strongly_skewed', 'kurtotic_unimodal', 'outlier', 'bimodal', 'separated_bimodal', 'skewed_bimodal', 'trimodal', 'claw', 'double_claw', 'asymmetric_claw', 'asymmetric_double_claw', 'smooth_comb', 'discrete_comb'. When calling the Marron-Wand simulations, only the covariance parameters are considered (`rho` and `band_type`). Means are taken from :footcite:`marron1992exact`. - By default 'trunk'. - m_factor : int, optional - The multiplicative factor to apply to the mean-vector of the first - distribution to obtain the mean-vector of the second distribution. - This is only used when ``simulation = trunk``. - By default -1. + By default 'gaussian'. + rho : float, optional + The covariance value of the bands. By default 0 indicating, an identity matrix is used. + band_type : str + The band type to use. For details, see Example 1 and 2 in :footcite:`Bickel_2008`. + Either 'ma', or 'ar'. + return_params : bool, optional + Whether or not to return the distribution parameters of the classes normal distributions. + seed : int, optional + Random seed, by default None. + + Returns + ------- + X : np.ndarray of shape (n_samples, n_dim), dtype=np.float64 + Trunk dataset as a dense array. + y : np.ndarray of shape (n_samples,), dtype=np.intp + Labels of the dataset. + G : np.ndarray of shape (n_samples, n_dim), dtype=np.float64 + The mixture of Gaussians for the Marron-Wand simulations. + Returned if ``return_params`` is True. + w : np.ndarray of shape (n_dim,), dtype=np.float64 + The weight vector for the Marron-Wand simulations. + Returned if ``return_params`` is True. + + Notes + ----- + **Marron-Wand Simulations**: The Marron-Wand simulations generate two classes of data with the + setup specified in the paper. + + Covariance: The covariance matrix among different dimensions is controlled by the ``rho`` parameter + and the ``band_type`` parameter. 
The ``band_type`` parameter controls the type of band to use, while + the ``rho`` parameter controls the specific scaling factor for the covariance matrix while going + from one dimension to the next. + + For each dimension in the first distribution, there is a mean of :math:`1 / d`, where + ``d`` is the dimensionality. The covariance is the identity matrix. + + The second distribution has a mean vector that is the negative of the first. + As ``d`` increases, the two distributions become closer and closer. + Full details for the trunk simulation can be found in :footcite:`trunk1982`. + + References + ---------- + .. footbibliography:: + """ + if n_dim < n_informative: + raise ValueError( + f"Number of informative dimensions {n_informative} must be less than number " + f"of dimensions, {n_dim}" + ) + if simulation not in MARRON_WAND_SIMS.keys(): + raise ValueError( + f"Simulation must be: trunk, trunk_overlap, trunk_mix, {MARRON_WAND_SIMS.keys()}" + ) + + rng = np.random.default_rng(seed=seed) + + if rho != 0: + if band_type == "ma": + cov = _moving_avg_cov(n_informative, rho) + elif band_type == "ar": + cov = _autoregressive_cov(n_informative, rho) + else: + raise ValueError(f'Band type {band_type} must be one of "ma", or "ar".') + else: + cov = np.identity(n_informative) + + # speed up computations for large multivariate normal matrix with SVD approximation + if n_informative > 1000: + mvg_sampling_method = "cholesky" + else: + mvg_sampling_method = "svd" + + mixture_idx = rng.choice( + len(MARRON_WAND_SIMS[simulation]), # type: ignore + size=n_samples // 2, + replace=True, + p=MARRON_WAND_SIMS[simulation], + ) + # the parameters used for each Gaussian in the mixture for each Marron Wand simulation + norm_params = MarronWandSims(n_dim=n_informative, cov=cov)(simulation) + G = np.fromiter( + ( + rng.multivariate_normal(*(norm_params[i]), size=1, method=mvg_sampling_method) + for i in mixture_idx + ), + dtype=np.dtype((float, n_informative)), + ) + + # as the 
dimensionality of the simulation increases, we are adding more and + # more noise to the data using the w parameter + w_vec = np.array([1.0 / np.sqrt(i) for i in range(1, n_informative + 1)]) + X = np.vstack( + ( + rng.multivariate_normal( + np.zeros(n_informative), cov, n_samples // 2, method=mvg_sampling_method + ), + (1 - w_vec) + * rng.multivariate_normal( + np.zeros(n_informative), cov, n_samples // 2, method=mvg_sampling_method + ) + + w_vec * G.reshape(n_samples // 2, n_informative), + ) + ) + if n_dim > n_informative: + X = np.hstack((X, rng.normal(loc=0, scale=1, size=(X.shape[0], n_dim - n_informative)))) + + y = np.concatenate((np.zeros(n_samples // 2), np.ones(n_samples // 2))) + + if return_params: + returns = [X, y] + returns += [*list(zip(*norm_params)), G, w_vec] + return returns + return X, y + + +def make_trunk_mixture_classification( + n_samples, + n_dim=4096, + n_informative=256, + mu_0: int = 0, + mu_1: int = 1, + rho: int = 0, + band_type: str = "ma", + return_params: bool = False, + mix: float = 0.5, + seed=None, +): + """Generate trunk mixture binary classification dataset. + + The first class is generated from a multivariate Gaussian with mean vector of + 0's. The second class is generated from a mixture of Gaussians with mean vectors + specified by ``mu_0`` and ``mu_1``. The mixture is specified by the ``mix`` parameter, + which is the probability of the first Gaussian in the mixture. + + For each dimension in the first distribution, there is a mean of :math:`1 / d`, where + ``d`` is the dimensionality. The covariance is the identity matrix. + The second distribution has a mean vector that is the negative of the first. + As ``d`` increases, the two distributions become closer and closer. + + Full details for the trunk simulation can be found in :footcite:`trunk1982`. + + Instead of the identity covariance matrix, one can implement a banded covariance matrix + that follows :footcite:`Bickel_2008`. 
+ + Parameters + ---------- + n_samples : int + Number of samples to generate. + n_dim : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 4096. + n_informative : int, optional + The informative dimensions. All others for ``n_dim - n_informative`` + are Gaussian noise. Default is 256. + mu_0 : int, optional + The mean of the first distribution. By default 0. The mean of the distribution will decrease + by a factor of ``sqrt(i)`` for each dimension ``i``. + mu_1 : int, optional + The mean of the second distribution. By default 1. The mean of the distribution will decrease + by a factor of ``sqrt(i)`` for each dimension ``i``. rho : float, optional The covariance value of the bands. By default 0 indicating, an identity matrix is used. band_type : str @@ -144,15 +296,28 @@ def make_trunk_classification( covs : list of ArrayLike of shape (n_dim, n_dim), dtype=np.float64 The covariance for each class. Returned if ``return_params`` is True. X_mixture : np.ndarray of shape (n_samples, n_dim), dtype=np.float64 - The mixture of Gaussians for the ``trunk_mix`` simulation. - Returned if ``return_params`` is True. - G : np.ndarray of shape (n_samples, n_dim), dtype=np.float64 - The mixture of Gaussians for the Marron-Wand simulations. - Returned if ``return_params`` is True. - w : np.ndarray of shape (n_dim,), dtype=np.float64 - The weight vector for the Marron-Wand simulations. + The mixture of Gaussians. Returned if ``return_params`` is True. + Notes + ----- + **Trunk**: The trunk simulation decreases the signal-to-noise ratio as the dimensionality + increases. This is implemented by decreasing the mean of the distribution by a factor of + ``sqrt(i)`` for each dimension ``i``. Thus for instance if the means of distribution one + and two are 1 and -1 respectively, the means for the first dimension will be 1 and -1, + for the second dimension will be 1/sqrt(2) and -1/sqrt(2), and so on. 
+ + **Trunk Mix**: The trunk mix simulation generates two classes of data with the same covariance + matrix. The first class (label 0) is generated from a multivariate-Gaussians with mean vector of + zeros and the second class is generated from a mixture of Gaussians with mean vectors + specified by ``mu_0`` and ``mu_1``. The mixture is specified by the ``mix`` parameter, which + is the probability of the first Gaussian in the mixture. + + Covariance: The covariance matrix among different dimensions is controlled by the ``rho`` parameter + and the ``band_type`` parameter. The ``band_type`` parameter controls the type of band to use, while + the ``rho`` parameter controls the specific scaling factor for the covariance matrix while going + from one dimension to the next. + References ---------- .. footbibliography:: @@ -162,10 +327,13 @@ def make_trunk_classification( f"Number of informative dimensions {n_informative} must be less than number " f"of dimensions, {n_dim}" ) + if mix < 0 or mix > 1: # type: ignore + raise ValueError("Mix must be between 0 and 1.") + rng = np.random.default_rng(seed=seed) - mu_1 = np.array([1 / np.sqrt(i) for i in range(1, n_informative + 1)]) - mu_0 = m_factor * mu_1 + mu_1_vec = np.array([mu_1 / np.sqrt(i) for i in range(1, n_informative + 1)]) + mu_0_vec = np.array([mu_0 / np.sqrt(i) for i in range(1, n_informative + 1)]) if rho != 0: if band_type == "ma": @@ -177,88 +345,156 @@ def make_trunk_classification( else: cov = np.identity(n_informative) - if mix < 0 or mix > 1: - raise ValueError("Mix must be between 0 and 1.") - # speed up computations for large multivariate normal matrix with SVD approximation if n_informative > 1000: method = "cholesky" else: method = "svd" - if simulation == "trunk": - X = np.vstack( - ( - rng.multivariate_normal(mu_1, cov, n_samples // 2, method=method), - rng.multivariate_normal(mu_0, cov, n_samples // 2, method=method), - ) - ) - elif simulation == "trunk_overlap": - X = np.vstack( - ( - 
rng.multivariate_normal( - np.zeros(n_informative), cov, n_samples // 2, method=method - ), - rng.multivariate_normal( - np.zeros(n_informative), cov, n_samples // 2, method=method - ), - ) - ) - elif simulation == "trunk_mix": - mixture_idx = rng.choice(2, n_samples // 2, replace=True, shuffle=True, p=[mix, 1 - mix]) - norm_params = [[mu_0, cov * (2 / 3) ** 2], [mu_1, cov * (2 / 3) ** 2]] - X_mixture = np.fromiter( - ( - rng.multivariate_normal(*(norm_params[i]), size=1, method=method) - for i in mixture_idx - ), - dtype=np.dtype((float, n_informative)), - ) + mixture_idx = rng.choice(2, n_samples // 2, replace=True, shuffle=True, p=[mix, 1 - mix]) # type: ignore - X = np.vstack( - ( - rng.multivariate_normal( - np.zeros(n_informative), cov * (2 / 3) ** 2, n_samples // 2, method=method - ), - X_mixture.reshape(n_samples // 2, n_informative), - ) - ) - elif simulation in MARRON_WAND_SIMS.keys(): - mixture_idx = rng.choice( - len(MARRON_WAND_SIMS[simulation]), # type: ignore - size=n_samples // 2, - replace=True, - p=MARRON_WAND_SIMS[simulation], - ) - # the parameters used for each Gaussian in the mixture for each Marron Wand simulation - norm_params = MarronWandSims(n_dim=n_informative, cov=cov)(simulation) - G = np.fromiter( - ( - rng.multivariate_normal(*(norm_params[i]), size=1, method=method) - for i in mixture_idx + # When variance is 1, trunk-mix does not look bimodal at low dimensions. 
+ # It is set to (2/3)**2 since that is consistent with the Marron-Wand bimodal simulation + norm_params = [[mu_0_vec, cov * (2 / 3) ** 2], [mu_1_vec, cov * (2 / 3) ** 2]] + X_mixture = np.fromiter( + (rng.multivariate_normal(*(norm_params[i]), size=1, method=method) for i in mixture_idx), + dtype=np.dtype((float, n_informative)), + ) + + X = np.vstack( + ( + rng.multivariate_normal( + np.zeros(n_informative), cov * (2 / 3) ** 2, n_samples // 2, method=method ), - dtype=np.dtype((float, n_informative)), + X_mixture.reshape(n_samples // 2, n_informative), ) + ) - # as the dimensionality of the simulations increasing, we are adding more and - # more noise to the data using the w parameter - w = mu_1 - X = np.vstack( - ( - rng.multivariate_normal( - np.zeros(n_informative), cov, n_samples // 2, method=method - ), - (1 - w) - * rng.multivariate_normal( - np.zeros(n_informative), cov, n_samples // 2, method=method - ) - + w * G.reshape(n_samples // 2, n_informative), - ) + if n_dim > n_informative: + X = np.hstack((X, rng.normal(loc=0, scale=1, size=(X.shape[0], n_dim - n_informative)))) + + y = np.concatenate((np.zeros(n_samples // 2), np.ones(n_samples // 2))) + + if return_params: + returns = [X, y] + returns += [*list(zip(*norm_params)), X_mixture] + return returns + return X, y + + +def make_trunk_classification( + n_samples, + n_dim=4096, + n_informative=256, + mu_0: int = 0, + mu_1: int = 1, + rho: int = 0, + band_type: str = "ma", + return_params: bool = False, + seed=None, +): + """Generate trunk binary classification dataset. + + For each dimension in the first distribution, there is a mean of :math:`1 / d`, where + ``d`` is the dimensionality. The covariance is the identity matrix. + The second distribution has a mean vector that is the negative of the first. + As ``d`` increases, the two distributions become closer and closer. + + Full details for the trunk simulation can be found in :footcite:`trunk1982`. 
+ + Instead of the identity covariance matrix, one can implement a banded covariance matrix + that follows :footcite:`Bickel_2008`. + + Parameters + ---------- + n_samples : int + Number of samples to generate. + n_dim : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 4096. + n_informative : int, optional + The informative dimensions. All others for ``n_dim - n_informative`` + are Gaussian noise. Default is 256. + mu_0 : int, optional + The mean of the first distribution. By default 0. The mean of the distribution will decrease + by a factor of ``sqrt(i)`` for each dimension ``i``. + mu_1 : int, optional + The mean of the second distribution. By default 1. The mean of the distribution will decrease + by a factor of ``sqrt(i)`` for each dimension ``i``. + rho : float, optional + The covariance value of the bands. By default 0, indicating an identity matrix is used. + band_type : str + The band type to use. For details, see Example 1 and 2 in :footcite:`Bickel_2008`. + Either 'ma', or 'ar'. + return_params : bool, optional + Whether or not to return the distribution parameters of the classes' normal distributions. + seed : int, optional + Random seed, by default None. + + Returns + ------- + X : np.ndarray of shape (n_samples, n_dim), dtype=np.float64 + Trunk dataset as a dense array. + y : np.ndarray of shape (n_samples,), dtype=np.intp + Labels of the dataset. + means : list of ArrayLike of shape (n_dim,), dtype=np.float64 + The mean vector for each class starting with class 0. + Returned if ``return_params`` is True. + covs : list of ArrayLike of shape (n_dim, n_dim), dtype=np.float64 + The covariance for each class. Returned if ``return_params`` is True. + + Notes + ----- + **Trunk**: The trunk simulation decreases the signal-to-noise ratio as the dimensionality + increases. This is implemented by decreasing the mean of the distribution by a factor of + ``sqrt(i)`` for each dimension ``i``. 
Thus for instance if the means of distribution one + and two are 1 and -1 respectively, the means for the first dimension will be 1 and -1, + for the second dimension will be 1/sqrt(2) and -1/sqrt(2), and so on. + + **Trunk Overlap**: The trunk overlap simulation generates two classes of data with the same + covariance matrix and mean vector of zeros. + + Covariance: The covariance matrix among different dimensions is controlled by the ``rho`` parameter + and the ``band_type`` parameter. The ``band_type`` parameter controls the type of band to use, while + the ``rho`` parameter controls the specific scaling factor for the covariance matrix while going + from one dimension to the next. + + References + ---------- + .. footbibliography:: + """ + if n_dim < n_informative: + raise ValueError( + f"Number of informative dimensions {n_informative} must be less than number " + f"of dimensions, {n_dim}" ) + rng = np.random.default_rng(seed=seed) + + mu_1_vec = np.array([mu_1 / np.sqrt(i) for i in range(1, n_informative + 1)]) + mu_0_vec = np.array([mu_0 / np.sqrt(i) for i in range(1, n_informative + 1)]) + + if rho != 0: + if band_type == "ma": + cov = _moving_avg_cov(n_informative, rho) + elif band_type == "ar": + cov = _autoregressive_cov(n_informative, rho) + else: + raise ValueError(f'Band type {band_type} must be one of "ma", or "ar".') else: - raise ValueError( - f"Simulation must be: trunk, trunk_overlap, trunk_mix, {MARRON_WAND_SIMS.keys()}" + cov = np.identity(n_informative) + + # speed up computations for large multivariate normal matrix with SVD approximation + if n_informative > 1000: + method = "cholesky" + else: + method = "svd" + + X = np.vstack( + ( + rng.multivariate_normal(mu_1_vec, cov, n_samples // 2, method=method), + rng.multivariate_normal(mu_0_vec, cov, n_samples // 2, method=method), ) + ) if n_dim > n_informative: X = np.hstack((X, rng.normal(loc=0, scale=1, size=(X.shape[0], n_dim - n_informative)))) @@ -267,14 +503,7 @@ def 
make_trunk_classification( if return_params: returns = [X, y] - if simulation == "trunk": - returns += [[mu_0, mu_1], [cov, cov]] - elif simulation == "trunk-overlap": - returns += [[np.zeros(n_informative), np.zeros(n_informative)], [cov, cov]] - elif simulation == "trunk-mix": - returns += [*list(zip(*norm_params)), X_mixture] - else: - returns += [*list(zip(*norm_params)), G, w] + returns += [[mu_0_vec, mu_1_vec], [cov, cov]] return returns return X, y diff --git a/sktree/datasets/tests/test_hyppo.py b/sktree/datasets/tests/test_hyppo.py index 5c20acedd..c32612ff8 100644 --- a/sktree/datasets/tests/test_hyppo.py +++ b/sktree/datasets/tests/test_hyppo.py @@ -5,8 +5,10 @@ from sktree.datasets import ( approximate_clf_mutual_information, approximate_clf_mutual_information_with_monte_carlo, + make_marron_wand_classification, make_quadratic_classification, make_trunk_classification, + make_trunk_mixture_classification, ) from sktree.datasets.hyppo import MARRON_WAND_SIMS @@ -20,13 +22,16 @@ def test_make_quadratic_classification_v(): assert len(x) == len(v) -def test_make_trunk_classification_custom_parameters(): +@pytest.mark.parametrize( + "trunk_gen", [make_trunk_classification, make_trunk_mixture_classification] +) +def test_make_trunk_classification_custom_parameters(trunk_gen): # Test with custom parameters - X, y = make_trunk_classification( + X, y = trunk_gen( n_samples=50, n_dim=5, n_informative=2, - m_factor=2, + mu_0=0, rho=0.5, band_type="ma", return_params=False, @@ -35,12 +40,15 @@ def test_make_trunk_classification_custom_parameters(): assert y.shape == (50,) -def test_make_trunk_classification_autoregressive_cov(): +@pytest.mark.parametrize( + "trunk_gen", [make_trunk_classification, make_trunk_mixture_classification] +) +def test_make_trunk_classification_autoregressive_cov(trunk_gen): # Test with default parameters n_dim = 10 n_informative = 10 rho = 0.5 - _, _, _, cov_list = make_trunk_classification( + data = trunk_gen( n_samples=100, n_dim=n_dim, 
n_informative=n_informative, @@ -48,18 +56,17 @@ def test_make_trunk_classification_autoregressive_cov(): band_type="ar", return_params=True, ) + cov_list = data[3] + if trunk_gen == make_trunk_classification: + assert len(data) == 4 + assert len(data[2]) == 2 + assert len(data[3]) == 2 + assert_array_equal(cov_list[0][0, :], [rho**idx for idx in range(n_dim)]) + elif trunk_gen == make_trunk_mixture_classification: + assert len(data) == 5 + assert_array_equal(cov_list[0][0, :], [rho**idx * (2.0 / 3) ** 2 for idx in range(n_dim)]) assert_array_equal(cov_list[0], cov_list[1]) assert cov_list[0].shape == (n_dim, n_dim) - assert_array_equal(cov_list[0][0, :], [rho**idx for idx in range(n_dim)]) - - -def test_make_trunk_classification_mixture(): - # Test with default parameters - X, y, _, _ = make_trunk_classification( - n_samples=100, n_dim=10, n_informative=5, mix=0.5, return_params=True - ) - assert X.shape == (100, 10), X.shape - assert y.shape == (100,) def test_make_trunk_classification_return_params(): @@ -74,39 +81,38 @@ def test_make_trunk_classification_return_params(): assert len(covs) == 2 -def test_make_trunk_classification_invalid_band_type(): +@pytest.mark.parametrize( + "trunk_gen", [make_trunk_classification, make_trunk_mixture_classification] +) +def test_make_trunk_generator_errors(trunk_gen): # Test with an invalid band type with pytest.raises(ValueError, match=r"Band type .* must be one of"): - make_trunk_classification(n_samples=50, rho=0.5, band_type="invalid_band_type") - + trunk_gen(n_samples=50, rho=0.5, band_type="invalid_band_type") -def test_make_trunk_classification_invalid_mix(): # Test with an invalid band type - with pytest.raises(ValueError, match="Mix must be between 0 and 1."): - make_trunk_classification(n_samples=50, rho=0.5, mix=2) + with pytest.raises(ValueError, match="Number of informative dimensions"): + trunk_gen(n_samples=50, n_dim=10, n_informative=11, rho=0.5) -def test_make_trunk_classification_invalid_n_informative(): 
+def test_make_trunk_mixture_errors(): # Test with an invalid band type - with pytest.raises(ValueError, match="Number of informative dimensions"): - make_trunk_classification(n_samples=50, n_dim=10, n_informative=11, rho=0.5, mix=2) + with pytest.raises(ValueError, match="Mix must be between 0 and 1."): + make_trunk_mixture_classification(n_samples=50, rho=0.5, mix=2) -def test_make_trunk_classification_invalid_simulation_name(): +def test_make_marron_wand_errors(): # Test with an invalid band type with pytest.raises(ValueError, match="Simulation must be"): - make_trunk_classification(n_samples=50, rho=0.5, simulation=None) + make_marron_wand_classification(n_samples=50, rho=0.5, simulation=None) -@pytest.mark.parametrize( - "simulation", ["trunk", "trunk_overlap", "trunk_mix", *MARRON_WAND_SIMS.keys()] -) -def test_make_trunk_classification_simulations(simulation): +@pytest.mark.parametrize("simulation", [*MARRON_WAND_SIMS.keys()]) +def test_make_marron_wand_simulations(simulation): # Test with default parameters n_samples = 100 n_dim = 10 n_informative = 10 - X, y = make_trunk_classification( + X, y = make_marron_wand_classification( n_samples=n_samples, n_dim=n_dim, n_informative=n_informative, diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py index ec146bda2..f263bcd90 100644 --- a/sktree/ensemble/_honest_forest.py +++ b/sktree/ensemble/_honest_forest.py @@ -156,7 +156,7 @@ class HonestForestClassifier(ForestClassifier, ForestClassifierMixin): ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. - bootstrap : bool, default=True + bootstrap : bool, default=False Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.