From e0300502a5fef905a05b6a028513f53f3932da73 Mon Sep 17 00:00:00 2001 From: YuxinB <99897042+YuxinB@users.noreply.github.com> Date: Thu, 12 Oct 2023 12:10:16 -0400 Subject: [PATCH 01/19] Startify sampling when split tran/test data --- sktree/stats/forestht.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index 4d6dc7b77..e57d72dba 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -161,7 +161,7 @@ def reset(self): self._is_fitted = False self._seeds = None - def _get_estimators_indices(self, sample_separate=False): + def _get_estimators_indices(self, y, sample_separate=False): indices = np.arange(self._n_samples_, dtype=int) # Get drawn indices along both sample and feature axes @@ -191,7 +191,7 @@ def _get_estimators_indices(self, sample_separate=False): # Operations accessing random_state must be performed identically # to those in `_parallel_build_trees()` indices_train, indices_test = train_test_split( - indices, test_size=self.test_size, shuffle=True, random_state=seed + indices, test_size=self.test_size, shuffle=True, stratify = y ,random_state=seed ) yield indices_train, indices_test @@ -206,13 +206,16 @@ def _get_estimators_indices(self, sample_separate=False): indices_train, indices_test = train_test_split( indices, test_size=self.test_size, + stratify = y, random_state=self._seeds, + ) + for _ in self.estimator_.estimators_: yield indices_train, indices_test - @property - def train_test_samples_(self): + + def train_test_samples_(self,y): """ The subset of drawn samples for each base estimator. @@ -229,7 +232,7 @@ def train_test_samples_(self): return [ (indices_train, indices_test) - for indices_train, indices_test in self._get_estimators_indices() + for indices_train, indices_test in self._get_estimators_indices(y) ] def _statistic( @@ -332,6 +335,9 @@ def statistic( if self._type_of_target_ is None: self._type_of_target_ = type_of_target(y) + # if self.sample_dataset_per_tree and not self.permute_per_tree: + # raise ValueError("sample_dataset_per_tree is only valid when permute_per_tree=True") + if covariate_index is None: self.estimator_ = self._get_estimator() estimator = self.estimator_ @@ -426,8 +432,7 @@ def test( y : ArrayLike of shape (n_samples, n_outputs) The target matrix. covariate_index : ArrayLike, optional of shape (n_covariates,) - The index array of covariates to shuffle, will shuffle all columns by - default (corresponding to None). + The index array of covariates to shuffle, by default None. metric : str, optional The metric to compute, by default "mse". n_repeats : int, optional @@ -463,9 +468,6 @@ def test( observe_stat = self.observe_stat_ # next permute the data - if covariate_index is None: - covariate_index = np.arange(X.shape[1], dtype=int) - permute_stat, permute_posteriors, permute_samples = self.statistic( X, y, @@ -493,7 +495,7 @@ def test( # If not sampling a new dataset per tree, then we may either be # permuting the covariate index per tree or per forest. If not permuting # there is only one train and test split, so we can just use that - _, indices_test = self.train_test_samples_[0] + _, indices_test = self.train_test_samples_(y)[0] indices_test = observe_samples y_test = y[indices_test, :] y_pred_proba_normal = observe_posteriors[:, indices_test, :] @@ -725,12 +727,12 @@ def _statistic( self._type_of_target_, ) for idx, (indices_train, indices_test) in enumerate( - self._get_estimators_indices(sample_separate=True) + self._get_estimators_indices(y,sample_separate=True) ) ) else: # fitting a forest will only get one unique train/test split - indices_train, indices_test = self.train_test_samples_[0] + indices_train, indices_test = self.train_test_samples_(y)[0] X_train, X_test = X[indices_train, :], X[indices_test, :] y_train, y_test = y[indices_train, :], y[indices_test, :] @@ -946,12 +948,13 @@ def _statistic( self._type_of_target_, ) for idx, (indices_train, indices_test) in enumerate( - self._get_estimators_indices(sample_separate=True) + self._get_estimators_indices(y,sample_separate=True) ) ) else: # fitting a forest will only get one unique train/test split - indices_train, indices_test = self.train_test_samples_[0] + indices_train, indices_test = self.train_test_samples_(y)[0] + X_train, X_test = X[indices_train, :], X[indices_test, :] y_train = y[indices_train, :] From 5d60959becd120746dcf2498745d3a3a2a31186e Mon Sep 17 00:00:00 2001 From: YuxinB <99897042+YuxinB@users.noreply.github.com> Date: Thu, 12 Oct 2023 15:48:49 -0400 Subject: [PATCH 02/19] Stratified_Sample, Let startify = None for Regressor --- sktree/stats/forestht.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index e57d72dba..d1beccde6 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -161,9 +161,26 @@ def reset(self): self._is_fitted = False self._seeds = None - def _get_estimators_indices(self, y, sample_separate=False): + def _get_estimators_indices(self, stratifier, sample_separate=False): + + # Check stratifier + # if stratifier is None, stratifier is regressor + if stratifier is not None: + if self._n_samples_ is not None and stratifier.shape[0] != self._n_samples_: + raise RuntimeError( + f"stratifier must have {self._n_samples_} samples, got {stratifier.shape[0]}. " + f"If running on a new dataset, call the 'reset' method." + ) + + if self._type_of_target_ is not None and type_of_target(stratifier) != self._type_of_target_: + raise RuntimeError( + f"stratifier must have type {self._type_of_target_}, got {type_of_target(stratifier)}. " + f"If running on a new dataset, call the 'reset' method." + ) + indices = np.arange(self._n_samples_, dtype=int) + # Get drawn indices along both sample and feature axes rng = np.random.default_rng(self.estimator_.random_state) @@ -191,7 +208,7 @@ def _get_estimators_indices(self, y, sample_separate=False): # Operations accessing random_state must be performed identically # to those in `_parallel_build_trees()` indices_train, indices_test = train_test_split( - indices, test_size=self.test_size, shuffle=True, stratify = y ,random_state=seed + indices, test_size=self.test_size, shuffle=True, stratify = stratifier ,random_state=seed ) yield indices_train, indices_test @@ -206,7 +223,7 @@ def _get_estimators_indices(self, y, sample_separate=False): indices_train, indices_test = train_test_split( indices, test_size=self.test_size, - stratify = y, + stratify = stratifier, random_state=self._seeds, ) @@ -215,7 +232,7 @@ def _get_estimators_indices(self, y, sample_separate=False): yield indices_train, indices_test - def train_test_samples_(self,y): + def train_test_samples_(self,stratifier): """ The subset of drawn samples for each base estimator. @@ -232,7 +249,7 @@ def train_test_samples_(self,y): return [ (indices_train, indices_test) - for indices_train, indices_test in self._get_estimators_indices(y) + for indices_train, indices_test in self._get_estimators_indices(stratifier = stratifier) ] def _statistic( @@ -495,7 +512,7 @@ def test( # If not sampling a new dataset per tree, then we may either be # permuting the covariate index per tree or per forest. If not permuting # there is only one train and test split, so we can just use that - _, indices_test = self.train_test_samples_(y)[0] + _, indices_test = self.train_test_samples_(stratifier=y)[0] indices_test = observe_samples y_test = y[indices_test, :] y_pred_proba_normal = observe_posteriors[:, indices_test, :] @@ -732,7 +749,7 @@ def _statistic( ) else: # fitting a forest will only get one unique train/test split - indices_train, indices_test = self.train_test_samples_(y)[0] + indices_train, indices_test = self.train_test_samples_(stratifier=None)[0] X_train, X_test = X[indices_train, :], X[indices_test, :] y_train, y_test = y[indices_train, :], y[indices_test, :] @@ -953,7 +970,7 @@ def _statistic( ) else: # fitting a forest will only get one unique train/test split - indices_train, indices_test = self.train_test_samples_(y)[0] + indices_train, indices_test = self.train_test_samples_(stratifier=y)[0] X_train, X_test = X[indices_train, :], X[indices_test, :] From 78837d2457d1eb947542048f0fc0842285e65f51 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Tue, 17 Oct 2023 10:21:05 -0400 Subject: [PATCH 03/19] FIX correct changes & black format --- sktree/stats/forestht.py | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index d1beccde6..00ae6371f 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -162,7 +162,7 @@ def reset(self): self._seeds = None def _get_estimators_indices(self, stratifier, sample_separate=False): - + # Check stratifier # if stratifier is None, stratifier is regressor if stratifier is not None: @@ -171,8 +171,11 @@ def _get_estimators_indices(self, stratifier, sample_separate=False): f"stratifier must have {self._n_samples_} samples, got {stratifier.shape[0]}. " f"If running on a new dataset, call the 'reset' method." ) - - if self._type_of_target_ is not None and type_of_target(stratifier) != self._type_of_target_: + + if ( + self._type_of_target_ is not None + and type_of_target(stratifier) != self._type_of_target_ + ): raise RuntimeError( f"stratifier must have type {self._type_of_target_}, got {type_of_target(stratifier)}. " f"If running on a new dataset, call the 'reset' method." @@ -180,7 +183,6 @@ def _get_estimators_indices(self, stratifier, sample_separate=False): indices = np.arange(self._n_samples_, dtype=int) - # Get drawn indices along both sample and feature axes rng = np.random.default_rng(self.estimator_.random_state) @@ -208,7 +210,11 @@ def _get_estimators_indices(self, stratifier, sample_separate=False): # Operations accessing random_state must be performed identically # to those in `_parallel_build_trees()` indices_train, indices_test = train_test_split( - indices, test_size=self.test_size, shuffle=True, stratify = stratifier ,random_state=seed + indices, + test_size=self.test_size, + shuffle=True, + stratify=stratifier, + random_state=seed, ) yield indices_train, indices_test @@ -223,16 +229,14 @@ def _get_estimators_indices(self, stratifier, sample_separate=False): indices_train, indices_test = train_test_split( indices, test_size=self.test_size, - stratify = stratifier, + stratify=stratifier, random_state=self._seeds, - ) - + for _ in self.estimator_.estimators_: yield indices_train, indices_test - - def train_test_samples_(self,stratifier): + def train_test_samples_(self, stratifier): """ The subset of drawn samples for each base estimator. @@ -249,7 +253,7 @@ def train_test_samples_(self,stratifier): return [ (indices_train, indices_test) - for indices_train, indices_test in self._get_estimators_indices(stratifier = stratifier) + for indices_train, indices_test in self._get_estimators_indices(stratifier=stratifier) ] def _statistic( @@ -352,9 +356,6 @@ def statistic( if self._type_of_target_ is None: self._type_of_target_ = type_of_target(y) - # if self.sample_dataset_per_tree and not self.permute_per_tree: - # raise ValueError("sample_dataset_per_tree is only valid when permute_per_tree=True") - if covariate_index is None: self.estimator_ = self._get_estimator() estimator = self.estimator_ @@ -449,7 +450,8 @@ def test( y : ArrayLike of shape (n_samples, n_outputs) The target matrix. covariate_index : ArrayLike, optional of shape (n_covariates,) - The index array of covariates to shuffle, by default None. + The index array of covariates to shuffle, will shuffle all columns by + default (corresponding to None). metric : str, optional The metric to compute, by default "mse". n_repeats : int, optional @@ -484,6 +486,9 @@ def test( observe_posteriors = self.observe_posteriors_ observe_stat = self.observe_stat_ + if covariate_index is None: + covariate_index = np.arange(X.shape[1], dtype=int) + # next permute the data permute_stat, permute_posteriors, permute_samples = self.statistic( X, @@ -744,7 +749,7 @@ def _statistic( self._type_of_target_, ) for idx, (indices_train, indices_test) in enumerate( - self._get_estimators_indices(y,sample_separate=True) + self._get_estimators_indices(y, sample_separate=True) ) ) else: @@ -965,13 +970,12 @@ def _statistic( self._type_of_target_, ) for idx, (indices_train, indices_test) in enumerate( - self._get_estimators_indices(y,sample_separate=True) + self._get_estimators_indices(y, sample_separate=True) ) ) else: # fitting a forest will only get one unique train/test split indices_train, indices_test = self.train_test_samples_(stratifier=y)[0] - X_train, X_test = X[indices_train, :], X[indices_test, :] y_train = y[indices_train, :] From 4f88518efc1504fd0b569b13700bf9885a3f1d05 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Tue, 17 Oct 2023 10:24:50 -0400 Subject: [PATCH 04/19] DOC modify warning text --- sktree/stats/forestht.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index 00ae6371f..371b16199 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -168,7 +168,8 @@ def _get_estimators_indices(self, stratifier, sample_separate=False): if stratifier is not None: if self._n_samples_ is not None and stratifier.shape[0] != self._n_samples_: raise RuntimeError( - f"stratifier must have {self._n_samples_} samples, got {stratifier.shape[0]}. " + f"Stratifier must have {self._n_samples_} samples, " + "got {stratifier.shape[0]}. " f"If running on a new dataset, call the 'reset' method." ) @@ -177,7 +178,8 @@ def _get_estimators_indices(self, stratifier, sample_separate=False): and type_of_target(stratifier) != self._type_of_target_ ): raise RuntimeError( - f"stratifier must have type {self._type_of_target_}, got {type_of_target(stratifier)}. " + f"Stratifier must have type {self._type_of_target_}, " + f"got {type_of_target(stratifier)}. " f"If running on a new dataset, call the 'reset' method." ) From ffb81368ff1a2fc0c0c7bc8496f170890ffbffaa Mon Sep 17 00:00:00 2001 From: YuxinB <99897042+YuxinB@users.noreply.github.com> Date: Tue, 17 Oct 2023 13:01:49 -0400 Subject: [PATCH 05/19] Add unit test for verifying stratified sampling --- requirements.txt | 1 + sktree/stats/tests/test_forestht.py | 44 +++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/requirements.txt b/requirements.txt index 92f3a6b2b..f3ec2a094 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ numpy>=1.25 scipy scikit-learn>=1.3 +black=22.12.0 diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index cecf34b8c..462cbab55 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -69,6 +69,50 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree): est.statistic(iris_X[:n_samples], iris_y[:n_samples], [0, 1.0], metric="mi") +@pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) +def test_featureimportance_forest_startified(sample_dataset_per_tree): + est = FeatureImportanceForestClassifier( + estimator=RandomForestClassifier( + n_estimators=10, + random_state=seed, + ), + permute_per_tree=True, + test_size=0.7, + random_state=seed, + sample_dataset_per_tree=sample_dataset_per_tree, + ) + n_samples = 100 + est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mse") + + + iris_X_class0 = iris_X[iris_y==0] + iris_X_class1 = iris_X[iris_y==1] + iris_y_class0 = iris_y[iris_y==0] + iris_y_class1 = iris_y[iris_y==1] + + assert ( + len(est.train_test_samples_(iris_y[:n_samples])[0][1]) == len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size + ), f"{len(est.train_test_samples_(iris_y[:n_samples])[0][1])} {len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size}" + assert len(est.train_test_samples_(iris_y[:n_samples])[0][0]) == est._n_samples_ - (len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size) + + est.test(iris_X[:n_samples], iris_y[:n_samples], [0, 1], n_repeats=10, metric="mse") + assert ( + len(est.train_test_samples_(iris_y[:n_samples])[0][1]) == len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size + ), f"{len(est.train_test_samples_(iris_y[:n_samples])[0][1])} {len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size}" + assert len(est.train_test_samples_(iris_y[:n_samples])[0][0]) == est._n_samples_ - (len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size) + + with pytest.raises(RuntimeError, match="Metric must be"): + est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi") + + # covariate index must be an iterable + with pytest.raises(RuntimeError, match="covariate_index must be an iterable"): + est.statistic(iris_X[:n_samples], iris_y[:n_samples], 0, metric="mi") + + # covariate index must be an iterable of ints + with pytest.raises(RuntimeError, match="Not all covariate_index"): + est.statistic(iris_X[:n_samples], iris_y[:n_samples], [0, 1.0], metric="mi") + + def test_featureimportance_forest_errors(): permute_per_tree = False sample_dataset_per_tree = True From 3ff6340d99828a783f990330e79875638c4f7eca Mon Sep 17 00:00:00 2001 From: YuxinB <99897042+YuxinB@users.noreply.github.com> Date: Tue, 17 Oct 2023 13:19:27 -0400 Subject: [PATCH 06/19] Correct Typo for Stratified --- sktree/stats/tests/test_forestht.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 462cbab55..68ecb8e4d 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -70,7 +70,7 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree): @pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) -def test_featureimportance_forest_startified(sample_dataset_per_tree): +def test_featureimportance_forest_stratified(sample_dataset_per_tree): est = FeatureImportanceForestClassifier( estimator=RandomForestClassifier( n_estimators=10, From 70a14a57a89d2f9b7e339f455018c4d1c608c1dd Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 17 Oct 2023 21:46:52 -0400 Subject: [PATCH 07/19] Fixed example and whatsnew Signed-off-by: Adam Li --- doc/whats_new/v0.3.rst | 3 +- ...t_MI_gigantic_hypothesis_testing_forest.py | 10 +++--- sktree/_lib/sklearn_fork | 2 +- sktree/stats/forestht.py | 35 ++++++++++++------- sktree/stats/tests/test_forestht.py | 31 ++++++++++------ 5 files changed, 51 insertions(+), 30 deletions(-) diff --git a/doc/whats_new/v0.3.rst b/doc/whats_new/v0.3.rst index fec97bb01..7b163ef19 100644 --- a/doc/whats_new/v0.3.rst +++ b/doc/whats_new/v0.3.rst @@ -15,6 +15,7 @@ Changelog - |Fix| Fixes a bug in consistency of train/test samples when ``random_state`` is not set in FeatureImportanceForestClassifier and FeatureImportanceForestRegressor, by `Adam Li`_ (:pr:`135`) - |Fix| Fixes a bug where covariate indices were not shuffled by default when running FeatureImportanceForestClassifier and FeatureImportanceForestRegressor test methods, by `Sambit Panda`_ (:pr:`140`) - |Enhancement| Add multi-view splitter for axis-aligned decision trees, by `Adam Li`_ (:pr:`129`) +- |Enhancement| Add stratified sampling option to ``FeatureImportance*`` via the ``stratify`` keyword argument, by `Yuxin Bai`_ (:pr:`143`) Code and Documentation Contributors ----------------------------------- @@ -24,4 +25,4 @@ the project since version inception, including: * `Adam Li`_ * `Sambit Panda`_ - +* `Yuxin Bai`_ diff --git a/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py b/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py index 423bc63dc..149580fb5 100644 --- a/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py +++ b/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py @@ -49,8 +49,8 @@ # We simulate the two feature sets, and the target variable. We then combine them # into a single dataset to perform hypothesis testing. -n_samples = 1000 -n_features_set = 500 +n_samples = 2000 +n_features_set = 20 mean = 1.0 sigma = 2.0 beta = 5.0 @@ -91,7 +91,7 @@ # computed as the proportion of samples in the null distribution that are less than the # observed test statistic. -n_estimators = 200 +n_estimators = 100 max_features = "sqrt" test_size = 0.2 n_repeats = 1000 @@ -103,12 +103,12 @@ max_features=max_features, tree_estimator=DecisionTreeClassifier(), random_state=seed, - honest_fraction=0.7, + honest_fraction=0.25, n_jobs=n_jobs, ), random_state=seed, test_size=test_size, - permute_per_tree=True, + permute_per_tree=False, sample_dataset_per_tree=False, ) diff --git a/sktree/_lib/sklearn_fork b/sktree/_lib/sklearn_fork index 6c7a5f44e..1adb20907 160000 --- a/sktree/_lib/sklearn_fork +++ b/sktree/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit 6c7a5f44eb4ec3bea5dd6a9e4d5db748d12b209e +Subproject commit 1adb209077f12adac8f760196ae5260abab0cbdd diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index 371b16199..140686aff 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -122,6 +122,7 @@ def __init__( test_size=0.2, permute_per_tree=True, sample_dataset_per_tree=True, + stratify=True, ): self.estimator = estimator self.random_state = random_state @@ -129,6 +130,7 @@ def __init__( self.test_size = test_size self.permute_per_tree = permute_per_tree self.sample_dataset_per_tree = sample_dataset_per_tree + self.stratify = stratify self.n_samples_test_ = None self._n_samples_ = None @@ -160,9 +162,9 @@ def reset(self): self.n_features_in_ = None self._is_fitted = False self._seeds = None + self._y = None - def _get_estimators_indices(self, stratifier, sample_separate=False): - + def _get_estimators_indices(self, stratifier=None, sample_separate=False): # Check stratifier # if stratifier is None, stratifier is regressor if stratifier is not None: @@ -173,6 +175,8 @@ def _get_estimators_indices(self, stratifier, sample_separate=False): f"If running on a new dataset, call the 'reset' method." ) + # Type of target should be one that fits a classifier as this is + # the only instance where stratification is needed. if ( self._type_of_target_ is not None and type_of_target(stratifier) != self._type_of_target_ @@ -238,7 +242,8 @@ def _get_estimators_indices(self, stratifier, sample_separate=False): for _ in self.estimator_.estimators_: yield indices_train, indices_test - def train_test_samples_(self, stratifier): + @property + def train_test_samples_(self): """ The subset of drawn samples for each base estimator. @@ -253,6 +258,9 @@ def train_test_samples_(self, stratifier): if self._n_samples_ is None: raise RuntimeError("The estimator must be fitted before accessing this attribute.") + # Stratifier uses a cached _y attribute if available + stratifier = self._y if is_classifier(self.estimator_) and self.stratify else None + return [ (indices_train, indices_test) for indices_train, indices_test in self._get_estimators_indices(stratifier=stratifier) @@ -365,6 +373,10 @@ def statistic( self.permuted_estimator_ = self._get_estimator() estimator = self.permuted_estimator_ + # Store a cache of the y variable + if is_classifier(self._get_estimator()): + self._y = y.copy() + # Infer type of target y if not hasattr(self, "_type_of_target"): self._type_of_target_ = type_of_target(y) @@ -519,7 +531,7 @@ def test( # If not sampling a new dataset per tree, then we may either be # permuting the covariate index per tree or per forest. If not permuting # there is only one train and test split, so we can just use that - _, indices_test = self.train_test_samples_(stratifier=y)[0] + _, indices_test = self.train_test_samples_[0] indices_test = observe_samples y_test = y[indices_test, :] y_pred_proba_normal = observe_posteriors[:, indices_test, :] @@ -658,6 +670,7 @@ def __init__( test_size=test_size, permute_per_tree=permute_per_tree, sample_dataset_per_tree=sample_dataset_per_tree, + stratify=False, ) def _get_estimator(self): @@ -750,13 +763,11 @@ def _statistic( self.permute_per_tree, self._type_of_target_, ) - for idx, (indices_train, indices_test) in enumerate( - self._get_estimators_indices(y, sample_separate=True) - ) + for idx, (indices_train, indices_test) in enumerate(self.train_test_samples_) ) else: # fitting a forest will only get one unique train/test split - indices_train, indices_test = self.train_test_samples_(stratifier=None)[0] + indices_train, indices_test = self.train_test_samples_[0] X_train, X_test = X[indices_train, :], X[indices_test, :] y_train, y_test = y[indices_train, :], y[indices_test, :] @@ -903,6 +914,7 @@ def __init__( test_size=0.2, permute_per_tree=True, sample_dataset_per_tree=True, + stratify=True, ): super().__init__( estimator=estimator, @@ -911,6 +923,7 @@ def __init__( test_size=test_size, permute_per_tree=permute_per_tree, sample_dataset_per_tree=sample_dataset_per_tree, + stratify=stratify, ) def _get_estimator(self): @@ -971,13 +984,11 @@ def _statistic( self.permute_per_tree, self._type_of_target_, ) - for idx, (indices_train, indices_test) in enumerate( - self._get_estimators_indices(y, sample_separate=True) - ) + for idx, (indices_train, indices_test) in enumerate(self.train_test_samples_) ) else: # fitting a forest will only get one unique train/test split - indices_train, indices_test = self.train_test_samples_(stratifier=y)[0] + indices_train, indices_test = self.train_test_samples_[0] X_train, X_test = X[indices_train, :], X[indices_test, :] y_train = y[indices_train, :] diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 68ecb8e4d..e50ed261c 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -84,22 +84,31 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): n_samples = 100 est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mse") - - iris_X_class0 = iris_X[iris_y==0] - iris_X_class1 = iris_X[iris_y==1] - iris_y_class0 = iris_y[iris_y==0] - iris_y_class1 = iris_y[iris_y==1] + iris_y_class0 = iris_y[iris_y == 0] + iris_y_class1 = iris_y[iris_y == 1] assert ( - len(est.train_test_samples_(iris_y[:n_samples])[0][1]) == len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size - ), f"{len(est.train_test_samples_(iris_y[:n_samples])[0][1])} {len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size}" - assert len(est.train_test_samples_(iris_y[:n_samples])[0][0]) == est._n_samples_ - (len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size) + len(est.train_test_samples_[0][1]) + == len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size + ), ( + f"{len(est.train_test_samples_[0][1])} " + f"{len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size}" + ) + assert len(est.train_test_samples_[0][0]) == est._n_samples_ - ( + len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size + ) est.test(iris_X[:n_samples], iris_y[:n_samples], [0, 1], n_repeats=10, metric="mse") assert ( - len(est.train_test_samples_(iris_y[:n_samples])[0][1]) == len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size - ), f"{len(est.train_test_samples_(iris_y[:n_samples])[0][1])} {len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size}" - assert len(est.train_test_samples_(iris_y[:n_samples])[0][0]) == est._n_samples_ - (len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size) + len(est.train_test_samples_[0][1]) + == len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size + ), ( + f"{len(est.train_test_samples_[0][1])} " + f"{len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size}" + ) + assert len(est.train_test_samples_[0][0]) == est._n_samples_ - ( + len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size + ) with pytest.raises(RuntimeError, match="Metric must be"): est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi") From f555e2cc8e2baa9acffaf11784293f9915107e50 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Wed, 18 Oct 2023 14:29:36 -0400 Subject: [PATCH 08/19] ENH correct tests & add coverage --- sktree/stats/forestht.py | 6 ++---- sktree/stats/tests/test_forestht.py | 32 ++++++++++++++--------------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index 140686aff..d2e44fd0a 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -363,6 +363,8 @@ def statistic( if self._n_samples_ is None: self._n_samples_, self.n_features_in_ = X.shape + + # Infer type of target y if self._type_of_target_ is None: self._type_of_target_ = type_of_target(y) @@ -377,10 +379,6 @@ def statistic( if is_classifier(self._get_estimator()): self._y = y.copy() - # Infer type of target y - if not hasattr(self, "_type_of_target"): - self._type_of_target_ = type_of_target(y) - # XXX: this can be improved as an extra fit can be avoided, by just doing error-checking # and then setting the internal meta data structures # first run a dummy fit on the samples to initialize the diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index e50ed261c..6e226a78f 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -82,44 +82,42 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): sample_dataset_per_tree=sample_dataset_per_tree, ) n_samples = 100 - est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mse") + est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi") iris_y_class0 = iris_y[iris_y == 0] iris_y_class1 = iris_y[iris_y == 1] assert ( len(est.train_test_samples_[0][1]) - == len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size + == sum(iris_y_class1) * est.test_size + sum(iris_y_class0) * est.test_size ), ( f"{len(est.train_test_samples_[0][1])} " - f"{len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size}" + f"{sum(iris_y_class1) * est.test_size + sum(iris_y_class0) * est.test_size}" ) assert len(est.train_test_samples_[0][0]) == est._n_samples_ - ( - len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size + sum(iris_y_class1) * est.test_size + sum(iris_y_class0) * est.test_size ) - est.test(iris_X[:n_samples], iris_y[:n_samples], [0, 1], n_repeats=10, metric="mse") + est.test(iris_X[:n_samples], iris_y[:n_samples], [0, 1], n_repeats=10, metric="mi") assert ( len(est.train_test_samples_[0][1]) - == len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size + == sum(iris_y_class1) * est.test_size + sum(iris_y_class0) * est.test_size ), ( f"{len(est.train_test_samples_[0][1])} " - f"{len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size}" + f"{sum(iris_y_class1) * est.test_size + sum(iris_y_class0) * est.test_size}" ) assert len(est.train_test_samples_[0][0]) == est._n_samples_ - ( - len(iris_y_class1) * est.test_size + len(iris_y_class0) * est.test_size + sum(iris_y_class1) * est.test_size + sum(iris_y_class0) * est.test_size ) - with pytest.raises(RuntimeError, match="Metric must be"): - est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi") + # Test if y has different shape + with pytest.raises(RuntimeError, match="Stratifier must have"): + est.statistic(iris_X[: n_samples - 1], iris_y[: n_samples - 1], metric="mi") - # covariate index must be an iterable - with pytest.raises(RuntimeError, match="covariate_index must be an iterable"): - est.statistic(iris_X[:n_samples], iris_y[:n_samples], 0, metric="mi") - - # covariate index must be an iterable of ints - with pytest.raises(RuntimeError, match="Not all covariate_index"): - est.statistic(iris_X[:n_samples], iris_y[:n_samples], [0, 1.0], metric="mi") + # Test if y has different type + with pytest.raises(RuntimeError, match="Stratifier must have type"): + iris_y = np.hstack((iris_y[:n_samples].reshape(-1, 1), iris_y[:n_samples].reshape(-1, 1))) + est.statistic(iris_X[:n_samples], iris_y, metric="mi") def test_featureimportance_forest_errors(): From 4595df33b9791b55540c670c1246a19dac0fdbdc Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Wed, 18 Oct 2023 14:31:38 -0400 Subject: [PATCH 09/19] FIX change n_samples for test to be valid --- sktree/stats/tests/test_forestht.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 6e226a78f..77450cf15 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -81,7 +81,7 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): random_state=seed, sample_dataset_per_tree=sample_dataset_per_tree, ) - n_samples = 100 + n_samples = 80 est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi") iris_y_class0 = iris_y[iris_y == 0] From 30b6d3e006f8a2d56d4bf44c4512898468aabd48 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Wed, 18 Oct 2023 14:36:45 -0400 Subject: [PATCH 10/19] DOC update name for MIGHT & black format --- ...rest.py => plot_MI_genuine_hypothesis_testing_forest.py} | 6 +++--- sktree/stats/tests/test_forestht.py | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) rename examples/hypothesis_testing/{plot_MI_gigantic_hypothesis_testing_forest.py => plot_MI_genuine_hypothesis_testing_forest.py} (96%) diff --git a/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py b/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py similarity index 96% rename from examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py rename to examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py index 149580fb5..e6831a9e7 100644 --- a/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py +++ b/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py @@ -1,7 +1,7 @@ """ -=========================================================== -Mutual Information for Gigantic Hypothesis Testing (MIGHT) -=========================================================== +========================================================= +Mutual Information for Genuine Hypothesis Testing (MIGHT) +========================================================= An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric multivariate hypothesis test, on simulated datasets. Here, we present a simulation diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 77450cf15..3d6015e30 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -116,8 +116,10 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): # Test if y has different type with pytest.raises(RuntimeError, match="Stratifier must have type"): - iris_y = np.hstack((iris_y[:n_samples].reshape(-1, 1), iris_y[:n_samples].reshape(-1, 1))) - est.statistic(iris_X[:n_samples], iris_y, metric="mi") + iris_y_new = np.hstack( + (iris_y[:n_samples].reshape(-1, 1), iris_y[:n_samples].reshape(-1, 1)) + ) + est.statistic(iris_X[:n_samples], iris_y_new, metric="mi") def test_featureimportance_forest_errors(): From 9a7459d06d47f3385d567014a31534daa091a7e3 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Wed, 18 Oct 2023 14:48:27 -0400 Subject: [PATCH 11/19] FIX update the test for stratification --- sktree/stats/tests/test_forestht.py | 31 +++++++++++------------------ 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 3d6015e30..8f9230d5f 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -81,33 +81,26 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): random_state=seed, sample_dataset_per_tree=sample_dataset_per_tree, ) - n_samples = 80 + n_samples = 100 est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi") iris_y_class0 = iris_y[iris_y == 0] iris_y_class1 = iris_y[iris_y == 1] - assert ( - len(est.train_test_samples_[0][1]) - == sum(iris_y_class1) * est.test_size + sum(iris_y_class0) * est.test_size - ), ( - f"{len(est.train_test_samples_[0][1])} " - f"{sum(iris_y_class1) * est.test_size + sum(iris_y_class0) * est.test_size}" - ) - assert len(est.train_test_samples_[0][0]) == est._n_samples_ - ( - sum(iris_y_class1) * est.test_size + sum(iris_y_class0) * est.test_size + _, indices_test = est.train_test_samples_[0] + y_test = y[indices_test, :] + + assert sum(y_test[y_test == 0]) == sum(y_test[y_test == 1]), ( + f"{sum(y_test[y_test==0])} " f"{sum(y_test[y_test==1])}" ) est.test(iris_X[:n_samples], iris_y[:n_samples], [0, 1], n_repeats=10, metric="mi") - assert ( - len(est.train_test_samples_[0][1]) - == sum(iris_y_class1) * est.test_size + sum(iris_y_class0) * est.test_size - ), ( - f"{len(est.train_test_samples_[0][1])} " - f"{sum(iris_y_class1) * est.test_size + sum(iris_y_class0) * est.test_size}" - ) - assert len(est.train_test_samples_[0][0]) == est._n_samples_ - ( - sum(iris_y_class1) * est.test_size + sum(iris_y_class0) * est.test_size + + _, indices_test = est.train_test_samples_[0] + y_test = y[indices_test, :] + + assert sum(y_test[y_test == 0]) == sum(y_test[y_test == 1]), ( + f"{sum(y_test[y_test==0])} " f"{sum(y_test[y_test==1])}" ) # Test if y has different shape From e0cbb60cbf5fe431a499c360508755207e194f3f Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Wed, 18 Oct 2023 19:03:13 -0400 Subject: [PATCH 12/19] FIX correct test variables --- sktree/stats/tests/test_forestht.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 8f9230d5f..bb0216aff 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -84,11 +84,8 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): n_samples = 100 est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi") - iris_y_class0 = iris_y[iris_y == 0] - iris_y_class1 = iris_y[iris_y == 1] - _, indices_test = est.train_test_samples_[0] - y_test = y[indices_test, :] + y_test = iris_y[indices_test, :] assert sum(y_test[y_test == 0]) == sum(y_test[y_test == 1]), ( f"{sum(y_test[y_test==0])} " f"{sum(y_test[y_test==1])}" @@ -97,7 +94,7 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): est.test(iris_X[:n_samples], iris_y[:n_samples], [0, 1], n_repeats=10, metric="mi") _, indices_test = est.train_test_samples_[0] - y_test = y[indices_test, :] + y_test = iris_y[indices_test, :] assert sum(y_test[y_test == 0]) == sum(y_test[y_test == 1]), ( f"{sum(y_test[y_test==0])} " f"{sum(y_test[y_test==1])}" From e248a7c39c8b5f6d8613f86b6bd4602bc04a2163 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Wed, 18 Oct 2023 20:17:18 -0400 Subject: [PATCH 13/19] FIX correct variable shape --- sktree/stats/tests/test_forestht.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index bb0216aff..692db2c27 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -85,7 +85,7 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi") _, indices_test = est.train_test_samples_[0] - y_test = iris_y[indices_test, :] + y_test = iris_y[indices_test] assert sum(y_test[y_test == 0]) == sum(y_test[y_test == 1]), ( f"{sum(y_test[y_test==0])} " f"{sum(y_test[y_test==1])}" @@ -94,7 +94,7 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): est.test(iris_X[:n_samples], iris_y[:n_samples], [0, 1], n_repeats=10, metric="mi") _, indices_test = est.train_test_samples_[0] - y_test = iris_y[indices_test, :] + y_test = iris_y[indices_test] assert sum(y_test[y_test == 0]) == sum(y_test[y_test == 1]), ( f"{sum(y_test[y_test==0])} " f"{sum(y_test[y_test==1])}" From 8ba06ef4643d0ca566d0469e9e712c7680cc4e68 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Thu, 19 Oct 2023 09:11:23 -0400 Subject: [PATCH 14/19] FIX correct test method --- sktree/stats/tests/test_forestht.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 692db2c27..351447e62 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -87,8 +87,8 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): _, indices_test = est.train_test_samples_[0] y_test = iris_y[indices_test] - assert sum(y_test[y_test == 0]) == sum(y_test[y_test == 1]), ( - f"{sum(y_test[y_test==0])} " f"{sum(y_test[y_test==1])}" + assert len(y_test[y_test == 0]) == len(y_test[y_test == 1]), ( + f"{len(y_test[y_test==0])} " f"{len(y_test[y_test==1])}" ) est.test(iris_X[:n_samples], iris_y[:n_samples], [0, 1], n_repeats=10, metric="mi") @@ -96,8 +96,8 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): _, indices_test = est.train_test_samples_[0] y_test = iris_y[indices_test] - assert sum(y_test[y_test == 0]) == sum(y_test[y_test == 1]), ( - f"{sum(y_test[y_test==0])} " f"{sum(y_test[y_test==1])}" + assert len(y_test[y_test == 0]) == len(y_test[y_test == 1]), ( + f"{len(y_test[y_test==0])} " f"{len(y_test[y_test==1])}" ) # Test if y has different shape From 5d516a74df29825af7a8c11ab7adf99db05c5bca Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Thu, 19 Oct 2023 09:26:51 -0400 Subject: [PATCH 15/19] FIX disable check_input for correct error --- sktree/stats/tests/test_forestht.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 351447e62..e810d653c 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -102,14 +102,16 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): # Test if y has different shape with pytest.raises(RuntimeError, match="Stratifier must have"): - est.statistic(iris_X[: n_samples - 1], iris_y[: n_samples - 1], metric="mi") + est.statistic( + iris_X[: n_samples - 1], iris_y[: n_samples - 1], metric="mi", check_input=False + ) # Test if y has different type with pytest.raises(RuntimeError, match="Stratifier must have type"): iris_y_new = np.hstack( (iris_y[:n_samples].reshape(-1, 1), iris_y[:n_samples].reshape(-1, 1)) ) - est.statistic(iris_X[:n_samples], iris_y_new, metric="mi") + est.statistic(iris_X[:n_samples], iris_y_new, metric="mi", check_input=False) def test_featureimportance_forest_errors(): From 735a10be530a9477d42e7512e2478841d88ded92 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Thu, 19 Oct 2023 09:38:54 -0400 Subject: [PATCH 16/19] FIX remove duplicate checks --- sktree/stats/forestht.py | 23 ----------------------- sktree/stats/tests/test_forestht.py | 13 ------------- 2 files changed, 36 deletions(-) diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index d2e44fd0a..89c02f5ad 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -165,28 +165,6 @@ def reset(self): self._y = None def _get_estimators_indices(self, stratifier=None, sample_separate=False): - # Check stratifier - # if stratifier is None, stratifier is regressor - if stratifier is not None: - if self._n_samples_ is not None and stratifier.shape[0] != self._n_samples_: - raise RuntimeError( - f"Stratifier must have {self._n_samples_} samples, " - "got {stratifier.shape[0]}. " - f"If running on a new dataset, call the 'reset' method." - ) - - # Type of target should be one that fits a classifier as this is - # the only instance where stratification is needed. - if ( - self._type_of_target_ is not None - and type_of_target(stratifier) != self._type_of_target_ - ): - raise RuntimeError( - f"Stratifier must have type {self._type_of_target_}, " - f"got {type_of_target(stratifier)}. " - f"If running on a new dataset, call the 'reset' method." - ) - indices = np.arange(self._n_samples_, dtype=int) # Get drawn indices along both sample and feature axes @@ -231,7 +209,6 @@ def _get_estimators_indices(self, stratifier=None, sample_separate=False): else: self._seeds = self.estimator_.random_state - # TODO: make random_state consistent indices_train, indices_test = train_test_split( indices, test_size=self.test_size, diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index e810d653c..e71e5e09b 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -100,19 +100,6 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): f"{len(y_test[y_test==0])} " f"{len(y_test[y_test==1])}" ) - # Test if y has different shape - with pytest.raises(RuntimeError, match="Stratifier must have"): - est.statistic( - iris_X[: n_samples - 1], iris_y[: n_samples - 1], metric="mi", check_input=False - ) - - # Test if y has different type - with pytest.raises(RuntimeError, match="Stratifier must have type"): - iris_y_new = np.hstack( - (iris_y[:n_samples].reshape(-1, 1), iris_y[:n_samples].reshape(-1, 1)) - ) - est.statistic(iris_X[:n_samples], iris_y_new, metric="mi", check_input=False) - def test_featureimportance_forest_errors(): permute_per_tree = False From 47857c32574b8f26423daf91a93856652c81042a Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Thu, 19 Oct 2023 10:47:14 -0400 Subject: [PATCH 17/19] DOC add docstring for stratify --- sktree/stats/forestht.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index 89c02f5ad..56a044c5c 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -122,7 +122,7 @@ def __init__( test_size=0.2, permute_per_tree=True, sample_dataset_per_tree=True, - stratify=True, + stratify=False, ): self.estimator = estimator self.random_state = random_state @@ -645,7 +645,6 @@ def __init__( test_size=test_size, permute_per_tree=permute_per_tree, sample_dataset_per_tree=sample_dataset_per_tree, - stratify=False, ) def _get_estimator(self): @@ -837,6 +836,9 @@ class FeatureImportanceForestClassifier(BaseForestHT): sample_dataset_per_tree : bool, default=False Whether to sample the dataset per tree or per forest. + stratify : bool, default=True + Whether to stratify the samples by class labels. + Attributes ---------- estimator_ : BaseForest From 35eb7767eb8fb4dce8289fb0f4d675756b1484b9 Mon Sep 17 00:00:00 2001 From: YuxinB <99897042+YuxinB@users.noreply.github.com> Date: Thu, 19 Oct 2023 13:18:36 -0400 Subject: [PATCH 18/19] Add contributor --- doc/whats_new/_contributors.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/whats_new/_contributors.rst b/doc/whats_new/_contributors.rst index 3e5ca2110..eb441d66d 100644 --- a/doc/whats_new/_contributors.rst +++ b/doc/whats_new/_contributors.rst @@ -26,3 +26,4 @@ .. _SUKI-O : https://github.com/SUKI-O .. _Ronan Perry : https://rflperry.github.io/ .. _Haoyin Xu : https://github.com/PSSF23 +.. _Yuxin Bai : https://github.com/YuxinB From 3332e9af42ca945d9e3587bfe28f679f69463c3f Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Thu, 19 Oct 2023 13:43:14 -0400 Subject: [PATCH 19/19] DOC update reference --- .../plot_MI_imbalanced_hyppo_testing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py index 882f80c3d..c8a5478a4 100644 --- a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py +++ b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py @@ -1,7 +1,7 @@ """ -=============================================================================== -Mutual Information for Gigantic Hypothesis Testing (MIGHT) with Imbalanced Data -=============================================================================== +============================================================================== +Mutual Information for Genuine Hypothesis Testing (MIGHT) with Imbalanced Data +============================================================================== Here, we demonstrate how to do hypothesis testing on highly imbalanced data in terms of their feature-set dimensionalities. @@ -17,7 +17,7 @@ For other examples of hypothesis testing, see the following: -- :ref:`sphx_glr_auto_examples_hypothesis_testing_plot_MI_gigantic_hypothesis_testing_forest.py` +- :ref:`sphx_glr_auto_examples_hypothesis_testing_plot_MI_genuine_hypothesis_testing_forest.py` - :ref:`sphx_glr_auto_examples_hypothesis_testing_plot_might_auc.py` For more information on the multi-view decision-tree, see