From a3a002dedbf02c8e262b308870566b7a7d3414a3 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 16 Oct 2023 14:29:47 -0400
Subject: [PATCH 01/26] Add option to permute per forest fraction

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 4d6dc7b77..48c7fb4aa 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -106,6 +106,7 @@ def _parallel_build_trees_and_compute_posteriors(
     posterior_arr[idx, indices_test, :] = y_pred  # posterior


+# TODO: add support for the permute_per_forest_fraction option
 class BaseForestHT(MetaEstimatorMixin):
     observe_samples_: ArrayLike
     observe_posteriors_: ArrayLike

From 5277a5bad8d799f68b5d809f5357b6d8bca97cb6 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 16 Oct 2023 16:03:31 -0400
Subject: [PATCH 02/26] Add sep parallel func for building and predicting

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 68 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 48c7fb4aa..2afa97c67 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -31,6 +31,74 @@
 )


+def _parallel_predict_proba(predict_proba, X, indices_test):
+    """
+    This is a utility function for joblib's Parallel.
+
+    It can't go locally in ForestClassifier or ForestRegressor, because joblib
+    complains that it cannot pickle it when placed there.
+    """
+    # each tree predicts proba with a list of outputs of shape (n_samples, n_classes[i])
+    prediction = predict_proba(X[indices_test, :], check_input=False)
+    return prediction
+
+
+def _parallel_build_trees_with_sepdata(
+    tree: Union[DecisionTreeClassifier, DecisionTreeRegressor],
+    n_trees: int,
+    idx: int,
+    indices_train: ArrayLike,
+    X: ArrayLike,
+    y: ArrayLike,
+    covariate_index,
+    bootstrap: bool,
+    max_samples,
+    sample_weight: ArrayLike = None,
+    class_weight=None,
+    missing_values_in_feature_mask=None,
+    classes=None,
+    random_state=None,
+):
+    """Parallel function to build trees, each on separately defined data.
+
+    This inherently assumes that the caller function defines the training
+    indices for each tree.
+    """
+    rng = np.random.default_rng(random_state)
+    X_train = X[indices_train, :]
+    y_train = y[indices_train, ...]
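+    # NOTE: the fancy indexing above returns copies, so the in-place covariate
+    # permutation below mutates only this tree's X_train, never the caller's X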
+
+    if bootstrap:
+        n_samples_bootstrap = _get_n_samples_bootstrap(
+            n_samples=X_train.shape[0], max_samples=max_samples
+        )
+    else:
+        n_samples_bootstrap = None
+
+    # individual tree permutation of the covariate index
+    if covariate_index is not None:
+        indices = np.arange(X_train.shape[0], dtype=int)
+        # perform permutation of covariates
+        index_arr = rng.choice(indices, size=(X_train.shape[0], 1), replace=False, shuffle=True)
+        perm_X_cov = X_train[index_arr, covariate_index]
+        X_train[:, covariate_index] = perm_X_cov
+
+    tree = _parallel_build_trees(
+        tree,
+        bootstrap,
+        X_train,
+        y_train,
+        sample_weight,
+        idx,
+        n_trees,
+        verbose=0,
+        class_weight=class_weight,
+        n_samples_bootstrap=n_samples_bootstrap,
+        missing_values_in_feature_mask=missing_values_in_feature_mask,
+        classes=classes,
+    )
+    return tree
+
 def _parallel_build_trees_and_compute_posteriors(
     forest: BaseForest,
     idx: int,

From df3a1b154a7e6e423591701c1587238d658d6f3b Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 16 Oct 2023 16:08:08 -0400
Subject: [PATCH 03/26] Finished adding

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 131 ++++++++++++++++++++++++++++-----------
 1 file changed, 95 insertions(+), 36 deletions(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 2afa97c67..6439d8dfe 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -779,24 +779,41 @@ def _statistic(
         # both sampling dataset per tree or permuting per tree requires us to bypass the
         # sklearn API to fit each tree individually
         if self.sample_dataset_per_tree or self.permute_per_tree:
-            Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
-                delayed(_parallel_build_trees_and_compute_posteriors)(
-                    estimator,
+            # Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
+            #     delayed(_parallel_build_trees_and_compute_posteriors)(
+            #         estimator,
+            #         idx,
+            #         indices_train,
+            #         indices_test,
+            #         X,
+            #         y,
+            #         covariate_index,
+            #         posterior_arr,
+            #         False,
+            #         self.permute_per_tree,
+            #         self._type_of_target_,
+            #     )
+            #     for idx, (indices_train, indices_test) in enumerate(
+            #         self._get_estimators_indices(sample_separate=True)
+            #     )
+            # )
+
+            trees = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
+                delayed(_parallel_build_trees_with_sepdata)(
+                    estimator.estimators_[idx],
+                    len(estimator.estimators_),
                     idx,
                     indices_train,
-                    indices_test,
                     X,
                     y,
                     covariate_index,
-                    posterior_arr,
-                    False,
-                    self.permute_per_tree,
-                    self._type_of_target_,
-                )
-                for idx, (indices_train, indices_test) in enumerate(
-                    self._get_estimators_indices(sample_separate=True)
+                    bootstrap=estimator.bootstrap,
+                    max_samples=estimator.max_samples,
+                    random_state=random_states[idx],
                 )
+                for idx, (indices_train, _) in enumerate(self._get_estimators_indices())
             )
+            estimator.estimators_ = trees
         else:
             # fitting a forest will only get one unique train/test split
             indices_train, indices_test = self.train_test_samples_[0]

             X_train, X_test = X[indices_train, :], X[indices_test, :]
             y_train, y_test = y[indices_train, :], y[indices_test, :]

             if covariate_index is not None:
@@ -820,15 +837,24 @@ def _statistic(
             estimator.fit(X_train, y_train)

         # construct posterior array for all trees (n_trees, n_samples_test, n_outputs)
-        for itree, tree in enumerate(estimator.estimators_):
-            posterior_arr[itree, indices_test, ...] = tree.predict(X_test).reshape(
-                -1, tree.n_outputs_
-            )
+        # for itree, tree in enumerate(estimator.estimators_):
+        #     posterior_arr[itree, indices_test, ...] = tree.predict(X_test).reshape(
+        #         -1, tree.n_outputs_
+        #     )

         # set variables to compute metric
         samples = indices_test
         y_true_final = y_test

+        # accumulate the predictions across all trees
+        all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
+                delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
+                for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
+                )
+        for itree, (proba, est_indices) in enumerate(zip(all_proba, self._get_estimators_indices())):
+            _, indices_test = est_indices
+            posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)
+
         # determine if there are any nans in the final posterior array, when
         # averaged over the trees
         samples = _non_nan_samples(posterior_arr)
@@ -1000,24 +1026,41 @@ def _statistic(
         # both sampling dataset per tree or permuting per tree requires us to bypass the
         # sklearn API to fit each tree individually
         if self.sample_dataset_per_tree or self.permute_per_tree:
-            Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
-                delayed(_parallel_build_trees_and_compute_posteriors)(
-                    estimator,
+            # Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
+            #     delayed(_parallel_build_trees_and_compute_posteriors)(
+            #         estimator,
+            #         idx,
+            #         indices_train,
+            #         indices_test,
+            #         X,
+            #         y,
+            #         covariate_index,
+            #         posterior_arr,
+            #         predict_posteriors,
+            #         self.permute_per_tree,
+            #         self._type_of_target_,
+            #     )
+            #     for idx, (indices_train, indices_test) in enumerate(
+            #         self._get_estimators_indices(sample_separate=True)
+            #     )
+            # )
+
+            trees = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
+                delayed(_parallel_build_trees_with_sepdata)(
+                    estimator.estimators_[idx],
+                    len(estimator.estimators_),
                     idx,
                     indices_train,
-                    indices_test,
                     X,
                     y,
                     covariate_index,
-                    posterior_arr,
-                    predict_posteriors,
-                    self.permute_per_tree,
-                    self._type_of_target_,
-                )
-                for idx, (indices_train, indices_test) in enumerate(
-                    self._get_estimators_indices(sample_separate=True)
+                    bootstrap=estimator.bootstrap,
+                    max_samples=estimator.max_samples,
+                    random_state=random_states[idx],
                 )
+                for idx, (indices_train, _) in enumerate(self._get_estimators_indices())
             )
+            estimator.estimators_ = trees
         else:
             # fitting a forest will only get one unique train/test split
             indices_train, indices_test = self.train_test_samples_[0]

             X_train, X_test = X[indices_train, :], X[indices_test, :]
             y_train = y[indices_train, :]

             if covariate_index is not None:
@@ -1041,19 +1084,35 @@ def _statistic(
             estimator.fit(X_train, y_train)

         # construct posterior array for all trees (n_trees, n_samples_test, n_outputs)
-        for itree, tree in enumerate(estimator.estimators_):
-            if predict_posteriors:
-                # XXX: currently assumes n_outputs_ == 1
-                posterior_arr[itree, indices_test, ...] = tree.predict_proba(X_test).reshape(
-                    -1, tree.n_classes_
-                )
-            else:
-                posterior_arr[itree, indices_test, ...] = tree.predict(X_test).reshape(
-                    -1, tree.n_outputs_
-                )
+        # for itree, tree in enumerate(estimator.estimators_):
+        #     if predict_posteriors:
+        #         # XXX: currently assumes n_outputs_ == 1
+        #         posterior_arr[itree, indices_test, ...] = tree.predict_proba(X_test).reshape(
+        #             -1, tree.n_classes_
+        #         )
+        #     else:
+        #         posterior_arr[itree, indices_test, ...] = tree.predict(X_test).reshape(
+        #             -1, tree.n_outputs_
+        #         )

         # set variables to compute metric
         samples = indices_test
+
+        # list of tree outputs. Each tree output is (n_samples, n_outputs), or (n_samples,)
+        if predict_posteriors:
+            all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
+                delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict_proba, X, indices_test)
+                for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
+            )
+        else:
+            all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
+                delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
+                for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
+            )
+        for itree, (proba, est_indices) in enumerate(zip(all_proba, self._get_estimators_indices())):
+            _, indices_test = est_indices
+            posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)
+
         if metric == "auc":
             # at this point, posterior_final is the predicted posterior for only the positive class
             # as more than one output is not supported.

From d46f1ad4615f2946a1a4bf1c7c058050ec350704 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 16 Oct 2023 16:13:53 -0400
Subject: [PATCH 04/26] Modify parallel building

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 6439d8dfe..6d00c4118 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -99,6 +99,7 @@ def _parallel_build_trees_with_sepdata(
     )
     return tree

+
 def _parallel_build_trees_and_compute_posteriors(
@@ -798,6 +799,11 @@ def _statistic(
             #     )
             # )

+            if self.permute_per_tree and covariate_index is not None:
+                random_states = [tree.random_state for tree in estimator.estimators_]
+            else:
+                random_states = [estimator.random_state] * len(estimator.estimators_)
+
             trees = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
                 delayed(_parallel_build_trees_with_sepdata)(
                     estimator.estimators_[idx],
@@ -818,7 +824,7 @@ def _statistic(
             # fitting a forest will only get one unique train/test split
             indices_train, indices_test = self.train_test_samples_[0]

-            X_train, X_test = X[indices_train, :], X[indices_test, :]
+            X_train, _ = X[indices_train, :], X[indices_test, :]
             y_train, y_test = y[indices_train, :], y[indices_test, :]

             if covariate_index is not None:
@@ -848,10 +854,12 @@ def _statistic(
         # accumulate the predictions across all trees
         all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
-                delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
-                for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
-                )
-        for itree, (proba, est_indices) in enumerate(zip(all_proba, self._get_estimators_indices())):
+            delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
+            for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
+        )
+        for itree, (proba, est_indices) in enumerate(
+            zip(all_proba, self._get_estimators_indices())
+        ):
             _, indices_test = est_indices
             posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)
@@ -1044,6 +1052,10 @@ def _statistic(
             #         self._get_estimators_indices(sample_separate=True)
             #     )
            #      # )
+            if self.permute_per_tree and covariate_index is not None:
+                random_states = [tree.random_state for tree in estimator.estimators_]
+            else:
+                random_states = [estimator.random_state] * len(estimator.estimators_)

             trees = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
                 delayed(_parallel_build_trees_with_sepdata)(
@@ -1065,7 +1077,7 @@ def _statistic(
             # fitting a forest will only get one unique train/test split
             indices_train, indices_test = self.train_test_samples_[0]

-            X_train, X_test = X[indices_train, :], X[indices_test, :]
+            X_train, _ = X[indices_train, :], X[indices_test, :]
             y_train = y[indices_train, :]

             if covariate_index is not None:
@@ -1101,15 +1113,21 @@ def _statistic(
         # list of tree outputs. Each tree output is (n_samples, n_outputs), or (n_samples,)
         if predict_posteriors:
             all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
-                delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict_proba, X, indices_test)
+                delayed(_parallel_predict_proba)(
+                    estimator.estimators_[idx].predict_proba, X, indices_test
+                )
                 for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
             )
         else:
             all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
-                delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
+                delayed(_parallel_predict_proba)(
+                    estimator.estimators_[idx].predict, X, indices_test
+                )
                 for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
             )
-        for itree, (proba, est_indices) in enumerate(zip(all_proba, self._get_estimators_indices())):
+        for itree, (proba, est_indices) in enumerate(
+            zip(all_proba, self._get_estimators_indices())
+        ):
             _, indices_test = est_indices
             posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)

From 17b01ac92b586b3b67c25c90d025c623b439461b Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 17 Oct 2023 10:14:28 -0400
Subject: [PATCH 05/26] New submodule

Signed-off-by: Adam Li
---
 sktree/_lib/sklearn_fork | 2 +-
 sktree/stats/forestht.py | 9 +++++++--
 sktree/tree/_oblique_tree.pxd | 2 +-
 sktree/tree/_oblique_tree.pyx | 7 +++++--
 sktree/tree/tests/test_tree.py | 18 ++++++++++++++++++
 sktree/tree/tests/test_unsupervised_tree.py | 2 ++
 .../tree/unsupervised/_unsup_oblique_tree.pyx | 7 +++++--
 7 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/sktree/_lib/sklearn_fork b/sktree/_lib/sklearn_fork
index 6c7a5f44e..1adb20907 160000
--- a/sktree/_lib/sklearn_fork
+++ b/sktree/_lib/sklearn_fork
@@ -1 +1 @@
-Subproject commit 6c7a5f44eb4ec3bea5dd6a9e4d5db748d12b209e
+Subproject commit 1adb209077f12adac8f760196ae5260abab0cbdd
diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 6d00c4118..5183d3d4f 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -20,6 +20,7 @@
     _parallel_build_trees,
 )
 from sktree.tree import DecisionTreeClassifier, DecisionTreeRegressor
+from sktree.tree._classes import DTYPE

 from .utils import (
     METRIC_FUNCTIONS,
@@ -315,7 +316,7 @@ def _statistic(
         raise NotImplementedError("Subclasses should implement this!")

     def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: ArrayLike = None):
-        X, y = check_X_y(X, y, ensure_2d=True, copy=True, multi_output=True)
+        X, y = check_X_y(X, y, ensure_2d=True, copy=True, multi_output=True, dtype=DTYPE)
         if y.ndim != 2:
             y = y.reshape(-1, 1)
@@ -1129,7 +1130,11 @@ def _statistic(
             zip(all_proba, self._get_estimators_indices())
         ):
             _, indices_test = est_indices
-            posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)
+
+            if predict_posteriors:
+                posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_classes_)
+            else:
+                posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)

         if metric == "auc":
             # at this point, posterior_final is the predicted posterior for only the positive class
diff --git a/sktree/tree/_oblique_tree.pxd b/sktree/tree/_oblique_tree.pxd
index db7813946..f85af4ce1 100644
--- a/sktree/tree/_oblique_tree.pxd
+++ b/sktree/tree/_oblique_tree.pxd
@@ -42,7 +42,7 @@ cdef class ObliqueTree(Tree):
     ) noexcept nogil
     cdef void _compute_feature_importances(
         self,
-        cnp.float64_t[:] importances,
+        float64_t[:] importances,
         Node* node
     ) noexcept nogil

diff --git a/sktree/tree/_oblique_tree.pyx b/sktree/tree/_oblique_tree.pyx
index 99a3d6fc0..559231dae 100644
--- a/sktree/tree/_oblique_tree.pyx
+++ b/sktree/tree/_oblique_tree.pyx
@@ -217,11 +217,14 @@ cdef class ObliqueTree(Tree):
         self.proj_vec_weights.resize(capacity)
         self.proj_vec_indices.resize(capacity)

-        # value memory is initialised to 0 to enable classifier argmax
         if capacity > self.capacity:
+            # value memory is initialised to 0 to enable classifier argmax
             memset(<void*>(self.value + self.capacity * self.value_stride), 0,
                    (capacity - self.capacity) * self.value_stride * sizeof(float64_t))

+            # node memory is initialised to 0 to ensure deterministic pickle (padding in Node struct)
+            memset(<void*>(self.nodes + self.capacity), 0, (capacity - self.capacity) * sizeof(Node))
+
         # if capacity smaller than node_count, adjust the counter
         if capacity < self.node_count:
             self.node_count = capacity
@@ -286,7 +289,7 @@ cdef class ObliqueTree(Tree):

     cdef void _compute_feature_importances(
         self,
-        cnp.float64_t[:] importances,
+        float64_t[:] importances,
         Node* node
     ) noexcept nogil:
         """Compute feature importances from a Node in the Tree.
diff --git a/sktree/tree/tests/test_tree.py b/sktree/tree/tests/test_tree.py
index 478aa6d5b..821793aa4 100644
--- a/sktree/tree/tests/test_tree.py
+++ b/sktree/tree/tests/test_tree.py
@@ -1,3 +1,5 @@
+import pickle
+
 import numpy as np
 import pytest
 from numpy.testing import assert_allclose
@@ -526,3 +528,19 @@ def test_balance_property(criterion, Tree):
     reg = Tree(criterion=criterion)
     reg.fit(X, y)
     assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y))
+
+
+# TODO: this does not work properly it seems
+@pytest.mark.skip(reason="Regression in non-deterministic pickle.")
+@pytest.mark.parametrize("Tree", ALL_TREES.values())
+def test_deterministic_pickle(Tree):
+    # Non-regression test for:
+    # https://github.com/scikit-learn/scikit-learn/issues/27268
+    # Uninitialised memory would lead to the two pickle strings being different.
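+    # the memsets added to the tree resize routines in this patch zero that
+    # freshly allocated node memory, so two identically-seeded fits should
+    # serialise byte-for-byte identically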
+    tree1 = Tree(random_state=0).fit(iris.data, iris.target)
+    tree2 = Tree(random_state=0).fit(iris.data, iris.target)
+
+    pickle1 = pickle.dumps(tree1)
+    pickle2 = pickle.dumps(tree2)
+
+    assert pickle1 == pickle2, f"Failed with {Tree}"
diff --git a/sktree/tree/tests/test_unsupervised_tree.py b/sktree/tree/tests/test_unsupervised_tree.py
index 2dba0fab8..c48e32ac4 100644
--- a/sktree/tree/tests/test_unsupervised_tree.py
+++ b/sktree/tree/tests/test_unsupervised_tree.py
@@ -1,3 +1,5 @@
+import pickle
+
 import numpy as np
 import pytest
 from sklearn import datasets
diff --git a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx
index 51f6e6202..66e43d3fd 100644
--- a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx
+++ b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx
@@ -197,11 +197,14 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree):
         self.proj_vec_weights.resize(capacity)
         self.proj_vec_indices.resize(capacity)

-        # value memory is initialised to 0 to enable classifier argmax
         if capacity > self.capacity:
+            # value memory is initialised to 0 to enable classifier argmax
             memset(<void*>(self.value + self.capacity * self.value_stride), 0,
                    (capacity - self.capacity) * self.value_stride * sizeof(float64_t))
-
+
+            # node memory is initialised to 0 to ensure deterministic pickle (padding in Node struct)
+            memset(<void*>(self.nodes + self.capacity), 0, (capacity - self.capacity) * sizeof(Node))
+
         # if capacity smaller than node_count, adjust the counter
         if capacity < self.node_count:
             self.node_count = capacity

From 16122d3dff5d7351bd8f54f92a136961b5edb796 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 17 Oct 2023 10:24:04 -0400
Subject: [PATCH 06/26] Add additional pickle test

Signed-off-by: Adam Li
---
 sktree/tree/tests/test_all_trees.py | 13 ++++++----
 sktree/tree/tests/test_tree.py | 24 ++++++++++++-------
 sktree/tree/tests/test_unsupervised_tree.py | 2 --
 .../tree/unsupervised/_unsup_oblique_tree.pyx | 4 ++--
 4 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/sktree/tree/tests/test_all_trees.py b/sktree/tree/tests/test_all_trees.py
index a5608fda5..66a9ea307 100644
--- a/sktree/tree/tests/test_all_trees.py
+++ b/sktree/tree/tests/test_all_trees.py
@@ -1,7 +1,7 @@
 import joblib
 import numpy as np
 import pytest
-from numpy.testing import assert_almost_equal, assert_array_almost_equal, assert_array_equal
+from numpy.testing import assert_almost_equal, assert_array_equal
 from sklearn.base import is_classifier
 from sklearn.datasets import make_blobs
 from sklearn.tree._tree import TREE_LEAF
@@ -53,9 +53,14 @@ def assert_tree_equal(d, s, message):

     assert_almost_equal(d.impurity, s.impurity, err_msg=message + ": inequal impurity")

-    assert_array_almost_equal(
-        d.value[external], s.value[external], err_msg=message + ": inequal value"
-    )
+    assert_array_equal(d.value[external], s.value[external], err_msg=message + ": inequal value")
+
+    if hasattr(d, "get_projection_matrix"):
+        assert_array_equal(
+            d.get_projection_matrix(),
+            s.get_projection_matrix(),
+            err_msg=message + ": inequal projection matrix",
+        )


 X_small = np.array(
diff --git a/sktree/tree/tests/test_tree.py b/sktree/tree/tests/test_tree.py
index 821793aa4..94cdfef62 100644
--- a/sktree/tree/tests/test_tree.py
+++ b/sktree/tree/tests/test_tree.py
@@ -1,4 +1,5 @@
 import pickle
+import sys

 import numpy as np
 import pytest
@@ -22,6 +23,8 @@
     UnsupervisedObliqueDecisionTree,
 )

+from .test_all_trees import assert_tree_equal
+
 CLUSTER_CRITERIONS = ("twomeans", "fastbic")
 REG_CRITERIONS = ("squared_error", "absolute_error", "friedman_mse", "poisson")
 CLF_CRITERIONS = ("gini", "entropy")
@@ -143,12 +146,6 @@
 digits.target = digits.target[perm]


-def assert_tree_equal(d, s, message):
-    assert s.node_count == d.node_count, "{0}: inequal number of node ({1} != {2})".format(
-        message, s.node_count, d.node_count
-    )
-
-
 def test_pickle_splitters():
     """Test that splitters are picklable."""
     import tempfile
@@ -530,8 +527,6 @@ def test_balance_property(criterion, Tree):
     assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y))


-# TODO: this does not work properly it seems
-@pytest.mark.skip(reason="Regression in non-deterministic pickle.")
 @pytest.mark.parametrize("Tree", ALL_TREES.values())
 def test_deterministic_pickle(Tree):
     # Non-regression test for:
@@ -543,4 +538,15 @@ def test_deterministic_pickle(Tree):
     pickle1 = pickle.dumps(tree1)
     pickle2 = pickle.dumps(tree2)

-    assert pickle1 == pickle2, f"Failed with {Tree}"
+    pickle1_tree = pickle.loads(pickle1)
+    pickle2_tree = pickle.loads(pickle2)
+    assert_tree_equal(
+        pickle1_tree.tree_,
+        pickle2_tree.tree_,
+        "The trees of the original and loaded classifiers are not equal.",
+    )
+    assert sys.getsizeof(pickle1_tree) == sys.getsizeof(pickle2_tree)
+
+    # TODO: this does not work properly it seems
+    if Tree not in (list(CLF_TREES.values()) + list(REG_TREES.values())):
+        assert pickle1 == pickle2, f"Failed with {Tree}"
diff --git a/sktree/tree/tests/test_unsupervised_tree.py b/sktree/tree/tests/test_unsupervised_tree.py
index c48e32ac4..2dba0fab8 100644
--- a/sktree/tree/tests/test_unsupervised_tree.py
+++ b/sktree/tree/tests/test_unsupervised_tree.py
@@ -1,5 +1,3 @@
-import pickle
-
 import numpy as np
 import pytest
 from sklearn import datasets
diff --git a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx
index 66e43d3fd..e72802f66 100644
--- a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx
+++ b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx
@@ -201,10 +201,10 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree):
             # value memory is initialised to 0 to enable classifier argmax
             memset(<void*>(self.value + self.capacity * self.value_stride), 0,
                    (capacity - self.capacity) * self.value_stride * sizeof(float64_t))
-
+
             # node memory is initialised to 0 to ensure deterministic pickle (padding in Node struct)
             memset(<void*>(self.nodes + self.capacity), 0, (capacity - self.capacity) * sizeof(Node))
-
+
             # if capacity smaller than node_count, adjust the counter
             if capacity < self.node_count:
                 self.node_count = capacity

From 7d42ac7c3335fda615e01f019414ebddb5666cea Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 17 Oct 2023 10:26:25 -0400
Subject: [PATCH 07/26] Add changelog

Signed-off-by: Adam Li
---
 doc/whats_new/v0.3.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/whats_new/v0.3.rst b/doc/whats_new/v0.3.rst
index fec97bb01..6eef2ef89 100644
--- a/doc/whats_new/v0.3.rst
+++ b/doc/whats_new/v0.3.rst
@@ -15,6 +15,7 @@ Changelog
 - |Fix| Fixes a bug in consistency of train/test samples when ``random_state`` is not set in FeatureImportanceForestClassifier and FeatureImportanceForestRegressor, by `Adam Li`_ (:pr:`135`)
 - |Fix| Fixes a bug where covariate indices were not shuffled by default when running FeatureImportanceForestClassifier and FeatureImportanceForestRegressor test methods, by `Sambit Panda`_ (:pr:`140`)
 - |Enhancement| Add multi-view splitter for axis-aligned decision trees, by `Adam Li`_ (:pr:`129`)
+- |API| ``FeatureImportanceForest*`` now has a hyperparameter to control the number of permutations done per forest, ``permute_per_forest_fraction``, by `Adam Li`_ (:pr:`145`)

 Code and Documentation Contributors
 -----------------------------------

From 4423377c89f631b36520b9fcfd2b36de2f336d16 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 17 Oct 2023 10:52:28 -0400
Subject: [PATCH 08/26] Remove unnecessary comments

Signed-off-by: Adam Li
---
 sktree/tree/_oblique_tree.pyx | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/sktree/tree/_oblique_tree.pyx b/sktree/tree/_oblique_tree.pyx
index 559231dae..ecd257862 100644
--- a/sktree/tree/_oblique_tree.pyx
+++ b/sktree/tree/_oblique_tree.pyx
@@ -272,10 +272,7 @@ cdef class ObliqueTree(Tree):
         # get the index of the node
         cdef intp_t node_id = node - self.nodes

-        # cdef intp_t n_projections = proj_vec_indices.size()
         # compute projection of the data based on trained tree
-        # proj_vec_weights = self.proj_vec_weights[node_id]
-        # proj_vec_indices = self.proj_vec_indices[node_id]
         for j in range(0, self.proj_vec_indices[node_id].size()):
             feature_index = self.proj_vec_indices[node_id][j]
             weight = self.proj_vec_weights[node_id][j]

From 5730b326ff1f61d5421f0b4b8159cff80efd8895 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 17 Oct 2023 14:36:12 -0400
Subject: [PATCH 09/26] Remove extra LOC

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 76 ----------------------------------------
 1 file changed, 76 deletions(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 5183d3d4f..2492a9e92 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -101,82 +101,6 @@ def _parallel_build_trees_with_sepdata(
     return tree


-def _parallel_build_trees_and_compute_posteriors(
-    forest: BaseForest,
-    idx: int,
-    indices_train: ArrayLike,
-    indices_test: ArrayLike,
-    X: ArrayLike,
-    y: ArrayLike,
-    covariate_index,
-    posterior_arr: ArrayLike,
-    predict_posteriors: bool,
-    permute_per_tree: bool,
-    type_of_target,
-    sample_weight: ArrayLike = None,
-    class_weight=None,
-    missing_values_in_feature_mask=None,
-    classes=None,
-):
-    """Parallel function to build trees and compute posteriors.
-
-    This inherently assumes that the caller function defines the indices
-    for the training and testing data for each tree.
-    """
-    tree: Union[DecisionTreeClassifier, DecisionTreeRegressor] = forest.estimators_[idx]
-    if permute_per_tree and covariate_index is not None:
-        random_state = tree.random_state
-    else:
-        random_state = forest.random_state
-
-    X_train = X[indices_train, :]
-    y_train = y[indices_train, ...]
-    rng = np.random.default_rng(random_state)
-
-    if forest.bootstrap:
-        n_samples_bootstrap = _get_n_samples_bootstrap(
-            n_samples=X_train.shape[0], max_samples=forest.max_samples
-        )
-    else:
-        n_samples_bootstrap = None
-
-    # individual tree permutation of y labels
-    if covariate_index is not None:
-        indices = np.arange(X_train.shape[0], dtype=int)
-        # perform permutation of covariates
-        index_arr = rng.choice(indices, size=(X_train.shape[0], 1), replace=False, shuffle=True)
-        perm_X_cov = X_train[index_arr, covariate_index]
-        X_train[:, covariate_index] = perm_X_cov
-
-    if type_of_target == "binary":
-        y_train = y_train.ravel()
-
-    tree = _parallel_build_trees(
-        tree,
-        forest.bootstrap,
-        X_train,
-        y_train,
-        sample_weight,
-        idx,
-        len(forest.estimators_),
-        verbose=0,
-        class_weight=class_weight,
-        n_samples_bootstrap=n_samples_bootstrap,
-        missing_values_in_feature_mask=missing_values_in_feature_mask,
-        classes=classes,
-    )
-
-    if predict_posteriors:
-        # XXX: currently assumes n_outputs_ == 1
-        y_pred = tree.predict_proba(X[indices_test, :]).reshape(-1, tree.n_classes_)
-    else:
-        y_pred = tree.predict(X[indices_test, :]).reshape(-1, tree.n_outputs_)
-
-    # Fill test set posteriors & set rest NaN
-    posterior_arr[idx, indices_test, :] = y_pred  # posterior
-
-
-# TODO: add support for the permute_per_forest_fraction option
 class BaseForestHT(MetaEstimatorMixin):
     observe_samples_: ArrayLike
     observe_posteriors_: ArrayLike

From 6f978cba406548d0485e6687da2d32e0973448ac Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 17 Oct 2023 16:43:48 -0400
Subject: [PATCH 10/26] Fix pvalue

Signed-off-by: Adam Li
---
 .../plot_MI_gigantic_hypothesis_testing_forest.py | 6 +++---
 sktree/stats/forestht.py | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py b/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py
index 423bc63dc..71db71b5c 100644
--- a/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py
+++ b/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py
@@ -108,7 +108,7 @@
     ),
     random_state=seed,
     test_size=test_size,
-    permute_per_tree=True,
+    permute_per_tree=False,
     sample_dataset_per_tree=False,
 )
@@ -120,7 +120,7 @@
 stat, pvalue = est.test(
     X, y, covariate_index=np.arange(n_features_set, dtype=int), metric="mi", n_repeats=n_repeats
 )
-print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}")
+print(f"Estimated MI difference for the important feature set: {stat} with Pvalue: {pvalue}")

 # we test for the second feature set, which is unimportant and thus should return a pvalue > 0.05
 stat, pvalue = est.test(
@@ -130,7 +130,7 @@
     metric="mi",
     n_repeats=n_repeats,
 )
-print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}")
+print(f"Estimated MI difference for the unimportant feature set: {stat} with Pvalue: {pvalue}")

 # %%
 # References
diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 2492a9e92..f7ec46ad5 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -115,8 +115,8 @@ def __init__(
         random_state=None,
         verbose=0,
         test_size=0.2,
-        permute_per_tree=True,
-        sample_dataset_per_tree=True,
+        permute_per_tree=False,
+        sample_dataset_per_tree=False,
     ):
         self.estimator = estimator
         self.random_state = random_state
@@ -617,8 +617,8 @@ def __init__(
         random_state=None,
         verbose=0,
         test_size=0.2,
-        permute_per_tree=True,
-        sample_dataset_per_tree=True,
+        permute_per_tree=False,
+        sample_dataset_per_tree=False,
     ):
         super().__init__(
             estimator=estimator,
@@ -903,8 +903,8 @@ def __init__(
         random_state=None,
         verbose=0,
         test_size=0.2,
-        permute_per_tree=True,
-        sample_dataset_per_tree=True,
+        permute_per_tree=False,
+        sample_dataset_per_tree=False,
     ):
         super().__init__(
             estimator=estimator,

From 58d5365c896396525aaca95936e54fadd166303c Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 17 Oct 2023 16:45:06 -0400
Subject: [PATCH 11/26] Lint

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index f7ec46ad5..9f0edcb32 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -11,7 +11,6 @@
 from sklearn.utils.validation import _is_fitted, check_X_y

 from sktree._lib.sklearn.ensemble._forest import (
-    BaseForest,
     ForestClassifier,
     ForestRegressor,
     RandomForestClassifier,

From f5f282aca67eec88ccb559b83dcae3455319e544 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 17 Oct 2023 21:21:35 -0400
Subject: [PATCH 12/26] Start work on permute fraction of forest

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 9f0edcb32..c08259593 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -116,6 +116,7 @@ def __init__(
         test_size=0.2,
         permute_per_tree=False,
         sample_dataset_per_tree=False,
+        permute_forest_fraction=None,
     ):
         self.estimator = estimator
         self.random_state = random_state
@@ -123,7 +124,8 @@ def __init__(
         self.test_size = test_size
         self.permute_per_tree = permute_per_tree
         self.sample_dataset_per_tree = sample_dataset_per_tree
-
+        self.permute_forest_fraction = permute_forest_fraction
+
         self.n_samples_test_ = None
         self._n_samples_ = None
         self._metric = None
@@ -618,6 +620,7 @@ def __init__(
         test_size=0.2,
         permute_per_tree=False,
         sample_dataset_per_tree=False,
+        permute_forest_fraction=None,
     ):
         super().__init__(
             estimator=estimator,
@@ -626,6 +629,7 @@ def __init__(
             test_size=test_size,
             permute_per_tree=permute_per_tree,
             sample_dataset_per_tree=sample_dataset_per_tree,
+            permute_forest_fraction=permute_forest_fraction
         )

     def _get_estimator(self):
@@ -904,6 +908,7 @@ def __init__(
         test_size=0.2,
         permute_per_tree=False,
         sample_dataset_per_tree=False,
+        permute_forest_fraction=None,
     ):
         super().__init__(
             estimator=estimator,
@@ -912,6 +917,7 @@ def __init__(
             test_size=test_size,
             permute_per_tree=permute_per_tree,
             sample_dataset_per_tree=sample_dataset_per_tree,
+            permute_forest_fraction=permute_forest_fraction
         )

     def _get_estimator(self):

From 3674cc20809d30566eefb915ed7d39b67684cf13 Mon Sep 17 00:00:00 2001
From: Haoyin Xu
Date: Mon, 23 Oct 2023 13:32:07 -0400
Subject: [PATCH 13/26] FIX add stratifi

---
 sktree/stats/forestht.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index ab29afdd9..5ee738327 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -1064,17 +1064,17 @@ def _statistic(
                 delayed(_parallel_predict_proba)(
                     estimator.estimators_[idx].predict_proba, X, indices_test
                 )
-                for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
+                for idx, (_, indices_test) in enumerate(self._get_estimators_indices(y))
             )
         else:
             all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
                 delayed(_parallel_predict_proba)(
                     estimator.estimators_[idx].predict, X, indices_test
                 )
-                for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
+                for idx, (_, indices_test) in enumerate(self._get_estimators_indices(y))
             )
         for itree, (proba, est_indices) in enumerate(
-            zip(all_proba, self._get_estimators_indices())
+            zip(all_proba, self._get_estimators_indices(y))
         ):
             _, indices_test = est_indices

From 0f27d018fbc3786a4f23adf5c0931ef4b01faa47 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 24 Oct 2023 10:37:56 -0400
Subject: [PATCH 14/26] Try stash

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 25 +++++++++++++------------
 sktree/stats/tests/test_forestht.py | 1 -
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 5ee738327..08d3908d5 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -177,13 +177,13 @@ def _get_estimators_indices(self, stratifier=None, sample_separate=False):
                     self._seeds.append(tree.random_state)
             seeds = self._seeds

-        if sample_separate:
-            if self._perm_seeds is None:
-                new_rng = np.random.default_rng(np.random.randint(0, 1e6))
-                self._perm_seeds = new_rng.integers(
-                    low=0, high=np.iinfo(np.int32).max, size=len(self.estimator_.estimators_)
-                )
-            seeds = self._perm_seeds
+        # if sample_separate:
+        #     if self._perm_seeds is None:
+        #         new_rng = np.random.default_rng(np.random.randint(0, 1e6))
+        #         self._perm_seeds = new_rng.integers(
+        #             low=0, high=np.iinfo(np.int32).max, size=len(self.estimator_.estimators_)
+        #         )
+        #     seeds = self._perm_seeds

         for idx, tree in enumerate(self.estimator_.estimators_):
             seed = seeds[idx]
@@ -207,6 +207,7 @@ def _get_estimators_indices(self, stratifier=None, sample_separate=False):
             indices_train, indices_test = train_test_split(
                 indices,
+                shuffle=True,
                 test_size=self.test_size,
                 stratify=stratifier,
                 random_state=self._seeds,
             )
@@ -796,10 +797,10 @@ def _statistic(
         # accumulate the predictions across all trees
         all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
             delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
-            for idx, (_, indices_test) in enumerate(self._get_estimators_indices())
+            for idx, (_, indices_test) in enumerate(self.train_test_samples_)
         )
         for itree, (proba, est_indices) in enumerate(
-            zip(all_proba, self._get_estimators_indices())
+            zip(all_proba, self.train_test_samples_)
         ):
             _, indices_test = est_indices
             posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)
@@ -1064,17 +1065,17 @@ def _statistic(
                 delayed(_parallel_predict_proba)(
                     estimator.estimators_[idx].predict_proba, X, indices_test
                 )
-                for idx, (_, indices_test) in enumerate(self._get_estimators_indices(y))
+                for idx, (_, indices_test) in enumerate(self.train_test_samples_)
             )
         else:
             all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
                 delayed(_parallel_predict_proba)(
                     estimator.estimators_[idx].predict, X, indices_test
                 )
-                for idx, (_, indices_test) in enumerate(self._get_estimators_indices(y))
+                for idx, (_, indices_test) in enumerate(self.train_test_samples_)
             )
         for itree, (proba, est_indices) in enumerate(
-            zip(all_proba, self._get_estimators_indices(y))
+            zip(all_proba, self.train_test_samples_)
         ):
             _, indices_test = est_indices
diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py
index e71e5e09b..d5eade80d 100644
--- a/sktree/stats/tests/test_forestht.py
+++ b/sktree/stats/tests/test_forestht.py
@@ -475,7 +475,6 @@ def test_small_dataset_dependent(seed):
     y = np.vstack(
         [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))]
     )  # Binary classification
-    print(X.shape, y.shape)

     clf = FeatureImportanceForestClassifier(
         estimator=HonestForestClassifier(

From e8947088dac06fcbb3ee9fe13b3b7b0d0e01e576 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 24 Oct 2023 11:33:52 -0400
Subject: [PATCH 15/26] Update and address permute forest fraction

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 62 +++++++++++--------
 sktree/stats/tests/test_forestht.py | 93 ++++++++++++++++-------------
 sktree/tree/_honest_tree.py | 1 +
 3 files changed, 89 insertions(+), 67 deletions(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 08d3908d5..847557ac0 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -114,19 +114,19 @@ def __init__(
         random_state=None,
         verbose=0,
         test_size=0.2,
-        permute_per_tree=False,
-        sample_dataset_per_tree=False,
         stratify=False,
+        sample_dataset_per_tree=False,
         permute_forest_fraction=None,
     ):
         self.estimator = estimator
         self.random_state = random_state
         self.verbose = verbose
         self.test_size = test_size
-        self.permute_per_tree = permute_per_tree
+        self.stratify = stratify
+
+        # XXX: possibly removing these parameters
         self.sample_dataset_per_tree = sample_dataset_per_tree
         self.permute_forest_fraction = permute_forest_fraction
-        self.stratify = stratify

         self.n_samples_test_ = None
         self._n_samples_ = None
@@ -166,15 +166,33 @@ def _get_estimators_indices(self, stratifier=None, sample_separate=False):
         # Get drawn indices along both sample and feature axes
         rng = np.random.default_rng(self.estimator_.random_state)

-        if self.sample_dataset_per_tree:
+        if self.permute_forest_fraction is None:
+            permute_forest_fraction = 0.0
+        else:
+            permute_forest_fraction = self.permute_forest_fraction
+
+        # TODO: consolidate how we "sample/permute" per subset of the forest
+        if self.sample_dataset_per_tree or permute_forest_fraction > 0.0:
+            # sample random seeds
             if self._seeds is None:
                 self._seeds = []
+                self._n_permutations = 0

-                for tree in self.estimator_.estimators_:
-                    if tree.random_state is None:
-                        self._seeds.append(rng.integers(low=0, high=np.iinfo(np.int32).max))
-                    else:
-                        self._seeds.append(tree.random_state)
+                num_trees_per_seed = max(
+                    int(permute_forest_fraction * len(self.estimator_.estimators_)), 1
+                )
+                for tree_idx, tree in enumerate(self.estimator_.estimators_):
+                    if tree_idx == 0 or tree_idx % num_trees_per_seed == 0:
+                        if tree.random_state is None:
+                            seed = rng.integers(low=0, high=np.iinfo(np.int32).max)
+                        else:
+                            seed = tree.random_state
+
+                        self._n_permutations += 1
+                    self._seeds.append(seed)
+
+            # now that we have the random seeds, we can sample the train/test indices
+            # deterministically
             seeds = self._seeds

             # if sample_separate:
@@ -650,7 +650,7 @@ def __init__(
         random_state=None,
         verbose=0,
         test_size=0.2,
-        permute_per_tree=False,
+        # permute_per_tree=False,
         sample_dataset_per_tree=False,
         permute_forest_fraction=None,
     ):
@@ -659,7 +659,7 @@ def __init__(
             random_state=random_state,
             verbose=verbose,
             test_size=test_size,
-            permute_per_tree=permute_per_tree,
+            # permute_per_tree=permute_per_tree,
             sample_dataset_per_tree=sample_dataset_per_tree,
             permute_forest_fraction=permute_forest_fraction,
         )
@@ -721,7 +739,7 @@ def _statistic(
         # both sampling dataset per tree or permuting per tree requires us to bypass the
         # sklearn API to fit each tree individually
-        if self.sample_dataset_per_tree or self.permute_per_tree:
+        if self.sample_dataset_per_tree or self.permute_forest_fraction:
             # Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
             #     delayed(_parallel_build_trees_and_compute_posteriors)(
             #         estimator,
@@ -759,7 +777,7 @@ def _statistic(
             #     )
             # )

-            if self.permute_per_tree and covariate_index is not None:
+            if self.permute_forest_fraction and covariate_index is not None:
                 random_states = [tree.random_state for tree in estimator.estimators_]
             else:
                 random_states = [estimator.random_state] * len(estimator.estimators_)
@@ -799,9 +817,7 @@ def _statistic(
             delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
             for idx, (_, indices_test) in enumerate(self.train_test_samples_)
         )
-        for itree, (proba, est_indices) in enumerate(
-            zip(all_proba, self.train_test_samples_)
-        ):
+        for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)):
             _, indices_test = est_indices
             posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)
@@ -923,9 +939,10 @@ def __init__(
         random_state=None,
         verbose=0,
         test_size=0.2,
-        permute_per_tree=False,
+        # permute_per_tree=False,
         sample_dataset_per_tree=False,
         stratify=True,
         permute_forest_fraction=None,
     ):
@@ -933,7 +949,7 @@ def __init__(
             random_state=random_state,
             verbose=verbose,
             test_size=test_size,
-            permute_per_tree=permute_per_tree,
+            # permute_per_tree=permute_per_tree,
             sample_dataset_per_tree=sample_dataset_per_tree,
             stratify=stratify,
             permute_forest_fraction=permute_forest_fraction,
         )
@@ -982,7 +998,7 @@ def _statistic(
         # both sampling dataset per tree or permuting per tree requires us to bypass the
         # sklearn API to fit each tree individually
-        if self.sample_dataset_per_tree or self.permute_per_tree:
+        if self.sample_dataset_per_tree or self.permute_forest_fraction:
@@ -1001,7 +1017,7 @@ def _statistic(
-            if self.permute_per_tree and covariate_index is not None:
+            if self.permute_forest_fraction and covariate_index is not None:
                 random_states = [tree.random_state for tree in estimator.estimators_]
             else:
                 random_states = [estimator.random_state] * len(estimator.estimators_)
@@ -1074,9 +1090,7 @@ def _statistic(
-        for itree, (proba, est_indices) in enumerate(
-            zip(all_proba, self.train_test_samples_)
-        ):
+        for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)):
             _, indices_test = est_indices
diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py
index d5eade80d..6c6668563 100644
--- a/sktree/stats/tests/test_forestht.py
+++ b/sktree/stats/tests/test_forestht.py
@@ -33,17 +33,18 @@
 @pytest.mark.parametrize("sample_dataset_per_tree", [True, False])
 def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree):
+    n_samples = 50
     est = FeatureImportanceForestClassifier(
         estimator=RandomForestClassifier(
             n_estimators=10,
             random_state=seed,
         ),
-        permute_per_tree=True,
+        # permute_per_tree=True,
+        permute_forest_fraction=1.0 / n_samples,
         test_size=0.7,
         random_state=seed,
         sample_dataset_per_tree=sample_dataset_per_tree,
     )
-    n_samples = 50
     est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mse")

     assert (
@@ -71,17 +72,18 @@
 @pytest.mark.parametrize("sample_dataset_per_tree", [True, False])
 def test_featureimportance_forest_stratified(sample_dataset_per_tree):
+    n_samples = 100
     est = FeatureImportanceForestClassifier(
         estimator=RandomForestClassifier(
             n_estimators=10,
             random_state=seed,
         ),
-        permute_per_tree=True,
+        # permute_per_tree=True,
+        permute_forest_fraction=1.0 / n_samples,
         test_size=0.7,
         random_state=seed,
         sample_dataset_per_tree=sample_dataset_per_tree,
     )
-    n_samples = 100
     est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi")

     _, indices_test = est.train_test_samples_[0]
@@ -102,14 +104,14 @@

 def test_featureimportance_forest_errors():
-    permute_per_tree = False
+    # permute_per_tree = False
     sample_dataset_per_tree = True
     est = FeatureImportanceForestClassifier(
         estimator=RandomForestClassifier(
             n_estimators=10,
         ),
         test_size=0.5,
-        permute_per_tree=permute_per_tree,
+        permute_forest_fraction=None,
         sample_dataset_per_tree=sample_dataset_per_tree,
     )
     with pytest.raises(RuntimeError, match="The estimator must be fitted"):
         est.train_test_samples_
@@ -139,15 +141,15 @@
     ],
 )
 @pytest.mark.parametrize(
-    "permute_per_tree",
+    "permute_forest_fraction",
     [
-        True,
-        False,
+        None,
+        0.5,
     ],
 )
 @pytest.mark.parametrize("sample_dataset_per_tree", [True, False])
 def test_iris_pauc_statistic(
-    criterion, honest_prior, estimator, permute_per_tree, sample_dataset_per_tree
+    criterion, honest_prior, estimator, permute_forest_fraction, sample_dataset_per_tree
 ):
     limit = 0.1
     max_features = "sqrt"
@@ -168,14 +170,14 @@
         ),
         test_size=test_size,
         sample_dataset_per_tree=sample_dataset_per_tree,
-        permute_per_tree=permute_per_tree,
+        permute_forest_fraction=permute_forest_fraction,
     )
     # now add completely uninformative feature
     X = np.hstack((iris_X, rng.standard_normal(size=(iris_X.shape[0], 4))))

     # test for unimportant feature set
     clf.reset()
-    if sample_dataset_per_tree and not permute_per_tree:
+    if sample_dataset_per_tree and permute_forest_fraction is None:
         # test in another test
         return
@@ -219,7 +221,8 @@
                 n_estimators=10,
             ),
             random_state=seed,
-            permute_per_tree=False,
+            # permute_per_tree=False,
+            permute_forest_fraction=None,
             sample_dataset_per_tree=False,
         ),
     ],
@@ -323,19 +326,33 @@ def test_pickle(tmpdir):
         assert_array_equal(getattr(clf, attr), getattr(clf_pickle, attr))


-@pytest.mark.parametrize("permute_per_tree", [True, False], ids=["permute_per_tree", "no_permute"])
+@pytest.mark.parametrize(
+    "permute_forest_fraction",
+    [
+        None,
+        # 0.5
+    ],
+    ids=[
+        "no_permute"
+        # "permute_forest_fraction",
+    ],
+)
 @pytest.mark.parametrize(
     "sample_dataset_per_tree", [True, False], ids=["sample_dataset_per_tree", "no_sample_dataset"]
 )
-def test_sample_size_consistency_of_estimator_indices_(permute_per_tree, sample_dataset_per_tree):
+def test_sample_size_consistency_of_estimator_indices_(
+    permute_forest_fraction, sample_dataset_per_tree
+):
     """Test that the test-sample indices are what is expected."""
     clf = FeatureImportanceForestClassifier(
         estimator=HonestForestClassifier(
             n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2
         ),
         test_size=0.5,
-        permute_per_tree=permute_per_tree,
+        # permute_per_tree=permute_per_tree,
+        permute_forest_fraction=permute_forest_fraction,
         sample_dataset_per_tree=sample_dataset_per_tree,
+        stratify=False,
     )

     n_samples = 100
@@ -346,6 +363,7 @@
     _, posteriors, samples = clf.statistic(
         X, y, covariate_index=None, return_posteriors=True, metric="mi"
     )
+    print(clf._seeds)
     if sample_dataset_per_tree:
         # check the non-nans
         non_nan_idx = _non_nan_samples(posteriors)
@@ -367,7 +385,11 @@
             f"{set(sorted_est_samples_idx) - set(sorted_sample_idx)}",
         )
     else:
-        assert_array_equal(samples, sorted(clf.train_test_samples_[0][1]))
+        assert_array_equal(
+            samples,
+            sorted(clf.train_test_samples_[0][1]),
+            err_msg=f"Samples {set(samples) - set(sorted(clf.train_test_samples_[0][1]))}.",
+        )
         assert len(_non_nan_samples(posteriors)) == len(samples)
@@ -384,7 +406,8 @@ def test_permute_per_tree_samples_consistency_with_sklearnforest(seed, sample_dataset_per_tree):
             n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2
         ),
         test_size=0.5,
-        permute_per_tree=True,
+        # permute_per_tree=True,
+        permute_forest_fraction=1.0 / n_samples,
         sample_dataset_per_tree=sample_dataset_per_tree,
     )
     other_clf = FeatureImportanceForestClassifier(
         estimator=HonestForestClassifier(
             n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2
         ),
         test_size=0.5,
-        permute_per_tree=False,
+        # permute_per_tree=False,
+        permute_forest_fraction=None,
         sample_dataset_per_tree=sample_dataset_per_tree,
     )
@@ -405,8 +429,8 @@
         assert_array_equal(clf.train_test_samples_[idx][0], estimator_train_test_indices[idx][0])
         assert_array_equal(clf.train_test_samples_[idx][1], estimator_train_test_indices[idx][1])

-    # if the sample_dataset_per_tree, then the indices should be different across all
-    if sample_dataset_per_tree:
+    # if sample_dataset_per_tree is set, then the indices should be different across all trees
+    if sample_dataset_per_tree or clf.permute_forest_fraction > 0.0:
         for indices, other_indices in combinations(clf.train_test_samples_, 2):
             assert not np.array_equal(indices[0], other_indices[0])
             assert not np.array_equal(indices[1], other_indices[1])
@@ -426,7 +450,7 @@
     )

     # when seed is passed, the indices should be deterministic
-    if seed is not None:
+    if seed is not None and sample_dataset_per_tree:
         assert_array_equal(
             clf.train_test_samples_[idx][0], other_clf.train_test_samples_[idx][0]
         )
@@ -448,7 +472,8 @@ def test_small_dataset_independent(seed):
             n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.5
         ),
         test_size=0.2,
-        permute_per_tree=False,
+        # permute_per_tree=False,
+        permute_forest_fraction=None,
         sample_dataset_per_tree=False,
     )
     stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi")
@@ -481,7 +506,8 @@
             n_estimators=50, random_state=seed, n_jobs=1, honest_fraction=0.5
         ),
         test_size=0.2,
-        permute_per_tree=False,
+        # permute_per_tree=False,
+        permute_forest_fraction=None,
         sample_dataset_per_tree=False,
     )
     stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi")
@@ -491,22 +517,3 @@

     stat, pvalue = clf.test(X, y, metric="mi")
     assert pvalue <= 0.05
-
-
-# @pytest.mark.monitor_test
-# def test_memory_usage():
-#     n_samples = 1000
-#     n_features = 5000
-#     X = rng.uniform(size=(n_samples, n_features))
-#     y = rng.integers(0, 2, size=n_samples)  # Binary classification
-
-#     clf = FeatureImportanceForestClassifier(
-#         estimator=HonestForestClassifier(
-#             n_estimators=10, random_state=seed, n_jobs=-1, honest_fraction=0.5
-#         ),
-#         test_size=0.2,
-#         permute_per_tree=False,
-#         sample_dataset_per_tree=False,
-#     )
-
-#     stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi")
diff --git a/sktree/tree/_honest_tree.py b/sktree/tree/_honest_tree.py
index 7d534ddb9..b8bd2528b 100644
--- a/sktree/tree/_honest_tree.py
+++ b/sktree/tree/_honest_tree.py
@@ -528,6 +528,7 @@ def _fit(

         nonzero_indices = np.where(_sample_weight > 0)[0]

+        # TODO: perhaps we want to stratify this split
         self.structure_indices_ = rng.choice(
             nonzero_indices,
             int((1 - self.honest_fraction) * len(nonzero_indices)),

From 2887909692f3ef7a55bbd6f6b1ba113f110ed9d4 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 24 Oct 2023 13:16:17 -0400
Subject: [PATCH 16/26] WIP

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 108 +++++++++++++---------------
 sktree/stats/tests/test_forestht.py | 17 ++---
 2 files changed, 104 insertions(+), 29 deletions(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 847557ac0..c1f9026fd 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -10,6 +10,7 @@
 from sklearn.utils.multiclass import type_of_target
 from sklearn.utils.validation import _is_fitted, check_X_y

+from sktree import HonestForestClassifier
 from sktree._lib.sklearn.ensemble._forest import (
     ForestClassifier,
     ForestRegressor,
@@ -117,6 +118,7 @@ def __init__(
         stratify=False,
         sample_dataset_per_tree=False,
         permute_forest_fraction=None,
+        train_test_split=True,
     ):
         self.estimator = estimator
         self.random_state = random_state
@@ -124,6 +126,7 @@ def __init__(
         self.test_size = test_size

         self.stratify = stratify
+        self.train_test_split = train_test_split
         # XXX: possibly removing these parameters
         self.sample_dataset_per_tree = sample_dataset_per_tree
         self.permute_forest_fraction = permute_forest_fraction
@@ -250,6 +253,10 @@ def train_test_samples_(self):
         if self._n_samples_ is None:
             raise RuntimeError("The estimator must be fitted before accessing this attribute.")

+        # we are not train/test splitting, then
+        if not self.train_test_split:
+            return [(np.arange(self._n_samples_, dtype=int), np.array([], dtype=int)) for _ in range(len(self.estimator_.estimators_))]
+
         # Stratifier uses a cached _y attribute if available
         stratifier = self._y if is_classifier(self.estimator_) and self.stratify else None
@@ -303,6 +310,9 @@ def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: ArrayLike = None):
                 f"y must have type {self._type_of_target_}, got {type_of_target(y)}. "
                 f"If running on a new dataset, call the 'reset' method."
             )
+
+        if not self.train_test_split and not isinstance(self.estimator, HonestForestClassifier):
+            raise RuntimeError(f'Train test split must occur if not using honest forest classifier.')

         return X, y, covariate_index
@@ -592,12 +602,16 @@ class FeatureImportanceForestRegressor(BaseForestHT):
     test_size : float, default=0.2
         Proportion of samples per tree to use for the test set.

-    permute_per_tree : bool, default=True
-        Whether to permute the covariate index per tree or per forest.
-
     sample_dataset_per_tree : bool, default=False
         Whether to sample the dataset per tree or per forest.

+    permute_forest_fraction : float, default=None
+        The fraction of trees to permute the covariate index for. If None, then
+        just one permutation is performed.
+
+    train_test_split : bool, default=True
+        Whether to split the dataset before passing to the forest.
+
     Attributes
     ----------
     estimator_ : BaseForest
@@ -653,6 +667,7 @@ def __init__(
         # permute_per_tree=False,
         sample_dataset_per_tree=False,
         permute_forest_fraction=None,
+        train_test_split=True,
     ):
         super().__init__(
             estimator=estimator,
@@ -662,6 +677,7 @@ def __init__(
             # permute_per_tree=permute_per_tree,
             sample_dataset_per_tree=sample_dataset_per_tree,
             permute_forest_fraction=permute_forest_fraction,
+            train_test_split=train_test_split,
         )

     def _get_estimator(self):
@@ -740,25 +756,6 @@ def _statistic(
         # both sampling dataset per tree or permuting per tree requires us to bypass the
         # sklearn API to fit each tree individually
         if self.sample_dataset_per_tree or self.permute_forest_fraction:
-            # Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")(
-            #     delayed(_parallel_build_trees_and_compute_posteriors)(
-            #         estimator,
-            #         idx,
-            #         indices_train,
-            #         indices_test,
-            #         X,
-            #         y,
-            #         covariate_index,
-            #         posterior_arr,
-            #         False,
-            #         self.permute_per_tree,
-            #         self._type_of_target_,
-            #     )
-            #     for idx, (indices_train, indices_test) in enumerate(
-            #         self._get_estimators_indices(sample_separate=True)
-            #     )
-            # )
-
             if self.permute_forest_fraction and covariate_index is not None:
                 random_states = [tree.random_state for tree in estimator.estimators_]
             else:
                 random_states = [estimator.random_state] * len(estimator.estimators_)
@@ -802,24 +799,29 @@ def _statistic(
             estimator.fit(X_train, y_train)

-        # construct posterior array for all trees (n_trees, n_samples_test, n_outputs)
-        # for itree, tree in enumerate(estimator.estimators_):
-        #     posterior_arr[itree, indices_test, ...] = tree.predict(X_test).reshape(
-        #         -1, tree.n_outputs_
-        #     )
-
         # set variables to compute metric
         samples = indices_test
         y_true_final = y_test

-        # accumulate the predictions across all trees
-        all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
-            delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
-            for idx, (_, indices_test) in enumerate(self.train_test_samples_)
-        )
-        for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)):
-            _, indices_test = est_indices
-            posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)
+        # TODO: probably a more elegant way of doing this
+        if self.train_test_split:
+            # accumulate the predictions across all trees
+            all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
+                delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
+                for idx, (_, indices_test) in enumerate(self.train_test_samples_)
+            )
+            for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)):
+                _, indices_test = est_indices
+                posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)
+        else:
+            # accumulate the predictions across all trees
+            all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)(
+                delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test)
+                for idx, honest_tree in enumerate(estimator.estimators_)
+            )
+            for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)):
+                _, indices_test = est_indices
+                posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_)

         # determine if there are any nans in the final posterior array, when
         # averaged over the trees
@@ -880,14 +882,18 @@ class FeatureImportanceForestClassifier(BaseForestHT):
     test_size : float, default=0.2
         Proportion of samples per tree to use for the test set.
- permute_per_tree : bool, default=True - Whether to permute the covariate index per tree or per forest. + stratify : bool, default=True + Whether to stratify the samples by class labels. sample_dataset_per_tree : bool, default=False Whether to sample the dataset per tree or per forest. - stratify : bool, default=True - Whether to stratify the samples by class labels. + permute_forest_fraction : float, default=None + The fraction of trees to permute the covariate index for. If None, then + just one permutation is performed. + + train_test_split : bool, default=True + Whether to split the data into train/test before passing to the forest. Attributes ---------- @@ -940,9 +946,10 @@ def __init__( verbose=0, test_size=0.2, # permute_per_tree=False, - sample_dataset_per_tree=False, stratify=True, + sample_dataset_per_tree=False, permute_forest_fraction=None, + train_test_split=True, ): super().__init__( estimator=estimator, @@ -952,6 +959,7 @@ def __init__( # permute_per_tree=permute_per_tree, sample_dataset_per_tree=sample_dataset_per_tree, stratify=stratify, + train_test_split=train_test_split, permute_forest_fraction=permute_forest_fraction, ) @@ -999,24 +1007,6 @@ def _statistic( # both sampling dataset per tree or permuting per tree requires us to bypass the # sklearn API to fit each tree individually if self.sample_dataset_per_tree or self.permute_forest_fraction: - # Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")( - # delayed(_parallel_build_trees_and_compute_posteriors)( - # estimator, - # idx, - # indices_train, - # indices_test, - # X, - # y, - # covariate_index, - # posterior_arr, - # predict_posteriors, - # self.permute_per_tree, - # self._type_of_target_, - # ) - # for idx, (indices_train, indices_test) in enumerate( - # self._get_estimators_indices(sample_separate=True) - # ) - # ) if self.permute_forest_fraction and covariate_index is not None: random_states = [tree.random_state for tree in estimator.estimators_] else: diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 6c6668563..4343a27af 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -328,14 +328,8 @@ def test_pickle(tmpdir): @pytest.mark.parametrize( "permute_forest_fraction", - [ - None, - # 0.5 - ], - ids=[ - "no_permute" - # "permute_forest_fraction", - ], + [None, 0.5], + ids=["no_permute", "permute_forest_fraction"], ) @pytest.mark.parametrize( "sample_dataset_per_tree", [True, False], ids=["sample_dataset_per_tree", "no_sample_dataset"] @@ -363,11 +357,12 @@ def test_sample_size_consistency_of_estimator_indices_( _, posteriors, samples = clf.statistic( X, y, covariate_index=None, return_posteriors=True, metric="mi" ) - print(clf._seeds) - if sample_dataset_per_tree: + + if sample_dataset_per_tree or permute_forest_fraction is not None: # check the non-nans non_nan_idx = _non_nan_samples(posteriors) - assert clf.n_samples_test_ == n_samples, f"{clf.n_samples_test_} != {n_samples}" + if sample_dataset_per_tree: + assert clf.n_samples_test_ == n_samples, f"{clf.n_samples_test_} != {n_samples}" sorted_sample_idx = sorted(np.unique(samples)) sorted_est_samples_idx = sorted( From 739c7bef58586b73f43fae52bff30d611a375f29 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 24 Oct 2023 17:35:38 -0400 Subject: [PATCH 17/26] Adding ability to turn off train/test split Signed-off-by: Adam Li --- sktree/stats/forestht.py | 79 ++++++++++++++++++----------- sktree/stats/tests/test_forestht.py | 54 ++++++++++++++++++++ 2 
files changed, 104 insertions(+), 29 deletions(-) diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index c1f9026fd..7f35ff9f4 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -10,7 +10,6 @@ from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import _is_fitted, check_X_y -from sktree import HonestForestClassifier from sktree._lib.sklearn.ensemble._forest import ( ForestClassifier, ForestRegressor, @@ -19,6 +18,7 @@ _get_n_samples_bootstrap, _parallel_build_trees, ) +from sktree.ensemble._honest_forest import HonestForestClassifier from sktree.tree import DecisionTreeClassifier, DecisionTreeRegressor from sktree.tree._classes import DTYPE @@ -253,9 +253,12 @@ def train_test_samples_(self): if self._n_samples_ is None: raise RuntimeError("The estimator must be fitted before accessing this attribute.") - # we are not train/test splitting, then + # we are not train/test splitting, then if not self.train_test_split: - return [(np.arange(self._n_samples_, dtype=int), np.array([], dtype=int)) for _ in range(len(self.estimator_.estimators_))] + return [ + (np.arange(self._n_samples_, dtype=int), np.array([], dtype=int)) + for _ in range(len(self.estimator_.estimators_)) + ] # Stratifier uses a cached _y attribute if available stratifier = self._y if is_classifier(self.estimator_) and self.stratify else None @@ -310,9 +313,11 @@ def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: ArrayLike = f"y must have type {self._type_of_target_}, got {type_of_target(y)}. " f"If running on a new dataset, call the 'reset' method." ) - + if not self.train_test_split and not isinstance(self.estimator, HonestForestClassifier): - raise RuntimeError(f'Train test split must occur if not using honest forest classifier.') + raise RuntimeError( + "Train test split must occur if not using honest forest classifier." + ) return X, y, covariate_index @@ -807,21 +812,24 @@ def _statistic( if self.train_test_split: # accumulate the predictions across all trees all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( - delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test) + delayed(_parallel_predict_proba)( + estimator.estimators_[idx].predict, X, indices_test + ) for idx, (_, indices_test) in enumerate(self.train_test_samples_) ) for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)): _, indices_test = est_indices posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_) else: + all_indices = np.arange(self._n_samples_, dtype=int) + # accumulate the predictions across all trees all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( - delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, indices_test) - for idx, honest_tree in enumerate(estimator.estimators_) + delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, all_indices) + for idx in range(len(estimator.estimators_)) ) - for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)): - _, indices_test = est_indices - posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_) + for itree, proba in enumerate(all_proba): + posterior_arr[itree, ...] 
= proba.reshape(-1, estimator.n_outputs_) # determine if there are any nans in the final posterior array, when # averaged over the trees @@ -1050,29 +1058,37 @@ def _statistic( y_train = y_train.ravel() estimator.fit(X_train, y_train) - # construct posterior array for all trees (n_trees, n_samples_test, n_outputs) - # for itree, tree in enumerate(estimator.estimators_): - # if predict_posteriors: - # # XXX: currently assumes n_outputs_ == 1 - # posterior_arr[itree, indices_test, ...] = tree.predict_proba(X_test).reshape( - # -1, tree.n_classes_ - # ) - # else: - # posterior_arr[itree, indices_test, ...] = tree.predict(X_test).reshape( - # -1, tree.n_outputs_ - # ) - # set variables to compute metric samples = indices_test # list of tree outputs. Each tree output is (n_samples, n_outputs), or (n_samples,) if predict_posteriors: - all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( - delayed(_parallel_predict_proba)( - estimator.estimators_[idx].predict_proba, X, indices_test + # all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( + # delayed(_parallel_predict_proba)( + # estimator.estimators_[idx].predict_proba, X, indices_test + # ) + # for idx, (_, indices_test) in enumerate(self.train_test_samples_) + # ) + + # TODO: probably a more elegant way of doing this + if self.train_test_split: + # accumulate the predictions across all trees + all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( + delayed(_parallel_predict_proba)( + estimator.estimators_[idx].predict_proba, X, indices_test + ) + for idx, (_, indices_test) in enumerate(self.train_test_samples_) + ) + else: + all_indices = np.arange(self._n_samples_, dtype=int) + + # accumulate the predictions across all trees + all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( + delayed(_parallel_predict_proba)( + estimator.estimators_[idx].predict_proba, X, all_indices + ) + for idx in range(len(estimator.estimators_)) ) - for idx, (_, indices_test) in enumerate(self.train_test_samples_) - ) else: all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( delayed(_parallel_predict_proba)( @@ -1084,7 +1100,12 @@ def _statistic( _, indices_test = est_indices if predict_posteriors: - posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_classes_) + if self.train_test_split: + posterior_arr[itree, indices_test, ...] = proba.reshape( + -1, estimator.n_classes_ + ) + else: + posterior_arr[itree, ...] = proba.reshape(-1, estimator.n_classes_) else: posterior_arr[itree, indices_test, ...] 
= proba.reshape(-1, estimator.n_outputs_)

diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py
index 4343a27af..39d5d9468 100644
--- a/sktree/stats/tests/test_forestht.py
+++ b/sktree/stats/tests/test_forestht.py
@@ -512,3 +512,56 @@ def test_small_dataset_dependent(seed):
     stat, pvalue = clf.test(X, y, metric="mi")
     assert pvalue <= 0.05
+
+
+@flaky(max_runs=3)
+def test_no_traintest_split():
+    n_samples = 500
+    n_features = 5
+    rng = np.random.default_rng(seed)
+
+    X = rng.uniform(size=(n_samples // 2, n_features))
+    X2 = X * 2
+    X = np.vstack([X, X2])
+    y = np.vstack(
+        [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))]
+    )  # Binary classification
+
+    clf = FeatureImportanceForestClassifier(
+        estimator=HonestForestClassifier(
+            n_estimators=50,
+            max_features=n_features,
+            random_state=seed,
+            n_jobs=1,
+            honest_fraction=0.5,
+        ),
+        test_size=0.2,
+        train_test_split=False,
+        permute_forest_fraction=None,
+        sample_dataset_per_tree=False,
+    )
+    stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi")
+
+    # since no train-test split, the training is all the data and the testing is none of the data
+    assert_array_equal(clf.train_test_samples_[0][0], np.arange(n_samples))
+    assert_array_equal(clf.train_test_samples_[0][1], np.array([]))
+
+    assert ~np.isnan(pvalue)
+    assert ~np.isnan(stat)
+    assert pvalue <= 0.05, f"{pvalue}"
+
+    stat, pvalue = clf.test(X, y, metric="mi")
+    assert pvalue <= 0.05, f"{pvalue}"
+
+    X = rng.uniform(size=(n_samples, n_features))
+    y = rng.integers(0, 2, size=n_samples)  # Binary classification
+    clf.reset()
+
+    stat, pvalue = clf.test(X, y, metric="mi")
+    assert_almost_equal(stat, 0.0, decimal=1)
+    assert pvalue > 0.05, f"{pvalue}"
+
+    stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi")
+    assert ~np.isnan(pvalue)
+    assert ~np.isnan(stat)
+    assert pvalue > 0.05, f"{pvalue}"

From 36c5582be3e3df51bfffd3eb6c10e67cd2c5e2b2 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Wed, 8 Nov 2023 09:09:01 -0500
Subject: [PATCH 18/26] Fix type check

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py          | 22 ++++++++++------------
 sktree/stats/permutationforest.py |  5 +++--
 sktree/stats/utils.py             |  6 +++---
 3 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 7f35ff9f4..130739ace 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -1,4 +1,4 @@
-from typing import Callable, Tuple, Union
+from typing import Optional, Callable, Tuple, Union

 import numpy as np
 from joblib import Parallel, delayed
@@ -54,7 +54,7 @@ def _parallel_build_trees_with_sepdata(
     covariate_index,
     bootstrap: bool,
     max_samples,
-    sample_weight: ArrayLike = None,
+    sample_weight: Optional[ArrayLike] = None,
     class_weight=None,
     missing_values_in_feature_mask=None,
     classes=None,
@@ -76,6 +76,7 @@ def _parallel_build_trees_with_sepdata(
     else:
         n_samples_bootstrap = None

+    # XXX: this currently creates a copy of X_train on RAM, which is not ideal
     # individual tree permutation of y labels
     if covariate_index is not None:
         indices = np.arange(X_train.shape[0], dtype=int)
@@ -280,7 +281,7 @@ def _statistic(
     ):
         raise NotImplementedError("Subclasses should implement this!")

-    def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: ArrayLike = None):
+    def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: Optional[ArrayLike] = None):
         X, y = check_X_y(X, y, ensure_2d=True, copy=True, multi_output=True,
dtype=DTYPE) if y.ndim != 2: y = y.reshape(-1, 1) @@ -315,9 +316,7 @@ def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: ArrayLike = ) if not self.train_test_split and not isinstance(self.estimator, HonestForestClassifier): - raise RuntimeError( - "Train test split must occur if not using honest forest classifier." - ) + raise RuntimeError("Train test split must occur if not using honest forest classifier.") return X, y, covariate_index @@ -325,7 +324,7 @@ def statistic( self, X: ArrayLike, y: ArrayLike, - covariate_index: ArrayLike = None, + covariate_index: Optional[ArrayLike] = None, metric="mi", return_posteriors: bool = False, check_input: bool = True, @@ -444,7 +443,7 @@ def test( self, X, y, - covariate_index: ArrayLike = None, + covariate_index: Optional[ArrayLike] = None, metric: str = "mi", n_repeats: int = 1000, return_posteriors: bool = True, @@ -612,7 +611,8 @@ class FeatureImportanceForestRegressor(BaseForestHT): permute_forest_fraction : float, default=None The fraction of trees to permute the covariate index for. If None, then - just one permutation is performed. + just one permutation is performed. If sampling a permutation per tree + is desirable, then the fraction should be set to ``1. / n_estimators``. train_test_split : bool, default=True Whether to split the dataset before passing to the forest. @@ -669,7 +669,6 @@ def __init__( random_state=None, verbose=0, test_size=0.2, - # permute_per_tree=False, sample_dataset_per_tree=False, permute_forest_fraction=None, train_test_split=True, @@ -679,7 +678,6 @@ def __init__( random_state=random_state, verbose=verbose, test_size=test_size, - # permute_per_tree=permute_per_tree, sample_dataset_per_tree=sample_dataset_per_tree, permute_forest_fraction=permute_forest_fraction, train_test_split=train_test_split, @@ -698,7 +696,7 @@ def statistic( self, X: ArrayLike, y: ArrayLike, - covariate_index: ArrayLike = None, + covariate_index: Optional[ArrayLike] = None, metric="mse", return_posteriors: bool = False, check_input: bool = True, diff --git a/sktree/stats/permutationforest.py b/sktree/stats/permutationforest.py index 4a27f539f..78a2437a4 100644 --- a/sktree/stats/permutationforest.py +++ b/sktree/stats/permutationforest.py @@ -10,6 +10,7 @@ from sktree._lib.sklearn.ensemble._forest import BaseForest, ForestClassifier, ForestRegressor from .utils import METRIC_FUNCTIONS, REGRESSOR_METRICS, _compute_null_distribution_perm +from typing import Optional class BasePermutationForest(MetaEstimatorMixin): @@ -62,7 +63,7 @@ def _statistic( estimator: BaseForest, X: ArrayLike, y: ArrayLike, - covariate_index: ArrayLike = None, + covariate_index: Optional[ArrayLike] = None, metric="mse", return_posteriors: bool = False, seed=None, @@ -117,7 +118,7 @@ def statistic( self, X: ArrayLike, y: ArrayLike, - covariate_index: ArrayLike = None, + covariate_index: Optional[ArrayLike] = None, metric="mse", return_posteriors: bool = False, check_input: bool = True, diff --git a/sktree/stats/utils.py b/sktree/stats/utils.py index f21dd00a8..7ff7a9f05 100644 --- a/sktree/stats/utils.py +++ b/sktree/stats/utils.py @@ -1,4 +1,4 @@ -from typing import Tuple +from typing import Optional, Tuple import numpy as np from numpy.typing import ArrayLike @@ -112,7 +112,7 @@ def _compute_null_distribution_perm( est: ForestClassifier, metric: str = "mse", n_repeats: int = 1000, - seed: int = None, + seed: Optional[int] = None, ) -> ArrayLike: """Compute null distribution using permutation method. 
@@ -173,7 +173,7 @@ def _compute_null_distribution_coleman( y_pred_proba_perm: ArrayLike, metric: str = "mse", n_repeats: int = 1000, - seed: int = None, + seed: Optional[int] = None, ) -> Tuple[ArrayLike, ArrayLike]: """Compute null distribution using Coleman method. From 1ff7a5cb1aa6d5e5b38d2159afd7b4036c6cef85 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 8 Nov 2023 09:10:29 -0500 Subject: [PATCH 19/26] Fix typing Signed-off-by: Adam Li --- sktree/experimental/mutual_info.py | 2 +- sktree/stats/forestht.py | 2 +- sktree/stats/permutationforest.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sktree/experimental/mutual_info.py b/sktree/experimental/mutual_info.py index a1820e715..85c8f0a83 100644 --- a/sktree/experimental/mutual_info.py +++ b/sktree/experimental/mutual_info.py @@ -147,7 +147,7 @@ def mutual_info_ksg( algorithm="kd_tree", n_jobs: int = -1, transform: str = "rank", - random_seed: int = None, + random_seed: Optional[int] = None, ): """Compute the generalized (conditional) mutual information KSG estimate. diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index 130739ace..f0518dc5e 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -1,4 +1,4 @@ -from typing import Optional, Callable, Tuple, Union +from typing import Callable, Optional, Tuple, Union import numpy as np from joblib import Parallel, delayed diff --git a/sktree/stats/permutationforest.py b/sktree/stats/permutationforest.py index 78a2437a4..edaa0878d 100644 --- a/sktree/stats/permutationforest.py +++ b/sktree/stats/permutationforest.py @@ -1,3 +1,5 @@ +from typing import Optional + import numpy as np from numpy.typing import ArrayLike from sklearn.base import MetaEstimatorMixin, clone, is_classifier @@ -10,7 +12,6 @@ from sktree._lib.sklearn.ensemble._forest import BaseForest, ForestClassifier, ForestRegressor from .utils import METRIC_FUNCTIONS, REGRESSOR_METRICS, _compute_null_distribution_perm -from typing import Optional class BasePermutationForest(MetaEstimatorMixin): From c9c22e91218deda424b5a47d43b30744082da50a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 8 Nov 2023 09:31:07 -0500 Subject: [PATCH 20/26] Fix ci Signed-off-by: Adam Li --- ...ot_MI_genuine_hypothesis_testing_forest.py | 6 --- .../plot_MI_imbalanced_hyppo_testing.py | 8 --- examples/hypothesis_testing/plot_might_auc.py | 2 +- .../hypothesis_testing/plot_might_mv_auc.py | 4 +- sktree/stats/forestht.py | 22 ++++++++- sktree/stats/tests/test_forestht.py | 49 ++++++++++++++++--- 6 files changed, 65 insertions(+), 26 deletions(-) diff --git a/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py b/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py index cf69eb8e8..2ddec2289 100644 --- a/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py +++ b/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py @@ -108,14 +108,8 @@ ), random_state=seed, test_size=test_size, - permute_per_tree=False, - sample_dataset_per_tree=False, ) -print( - f"Permutation per tree: {est.permute_per_tree} and sampling dataset per tree: " - f"{est.sample_dataset_per_tree}" -) # we test for the first feature set, which is important and thus should return a pvalue < 0.05 stat, pvalue = est.test( X, y, covariate_index=np.arange(n_features_set, dtype=int), metric="mi", n_repeats=n_repeats diff --git a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py index 
c8a5478a4..4d068fb8b 100644
--- a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py
+++ b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py
@@ -132,16 +132,10 @@ def make_multiview_classification(
     ),
     random_state=seed,
     test_size=test_size,
-    permute_per_tree=False,
-    sample_dataset_per_tree=False,
 )

 mv_results = dict()

-print(
-    f"Permutation per tree: {est.permute_per_tree} and sampling dataset per tree: "
-    f"{est.sample_dataset_per_tree}"
-)
 # we test for the first feature set, which is important and thus should return a pvalue < 0.05
 stat, pvalue = est.test(
     X, y, covariate_index=np.arange(10, dtype=int), metric="mi", n_repeats=n_repeats
 )
@@ -177,8 +171,6 @@ def make_multiview_classification(
     ),
     random_state=seed,
     test_size=test_size,
-    permute_per_tree=False,
-    sample_dataset_per_tree=False,
 )

 rf_results = dict()
diff --git a/examples/hypothesis_testing/plot_might_auc.py b/examples/hypothesis_testing/plot_might_auc.py
index fd032972a..74ea79abd 100644
--- a/examples/hypothesis_testing/plot_might_auc.py
+++ b/examples/hypothesis_testing/plot_might_auc.py
@@ -84,7 +84,7 @@
     ),
     random_state=seed,
     test_size=test_size,
-    permute_per_tree=True,
+    permute_forest_fraction=1.0 / n_estimators,
     sample_dataset_per_tree=True,
 )
diff --git a/examples/hypothesis_testing/plot_might_mv_auc.py b/examples/hypothesis_testing/plot_might_mv_auc.py
index d4e573d82..2a492eb7c 100644
--- a/examples/hypothesis_testing/plot_might_mv_auc.py
+++ b/examples/hypothesis_testing/plot_might_mv_auc.py
@@ -72,7 +72,7 @@
     ),
     random_state=seed,
     test_size=test_size,
-    permute_per_tree=True,
+    permute_forest_fraction=1.0 / n_estimators,
     sample_dataset_per_tree=True,
 )

@@ -104,7 +104,7 @@
     ),
     random_state=seed,
     test_size=test_size,
-    permute_per_tree=True,
+    permute_forest_fraction=1.0 / n_estimators,
     sample_dataset_per_tree=True,
 )
diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index f0518dc5e..81426aa51 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -144,7 +144,18 @@ def __init__(

     @property
     def n_estimators(self):
-        return self.estimator_.n_estimators
+        # prefer the fitted forest, then the fitted permuted forest, and
+        # only then fall back to the subclass's default (unfitted) estimator
+        try:
+            return self.estimator_.n_estimators
+        except Exception:
+            try:
+                return self.permuted_estimator_.n_estimators
+            except Exception:
+                return self._get_estimator().n_estimators
+
+    def _get_estimator(self):
+        pass

     def reset(self):
         class_attributes = dir(type(self))
@@ -318,6 +326,18 @@ def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: Optional[Arr
         if not self.train_test_split and not isinstance(self.estimator, HonestForestClassifier):
             raise RuntimeError("Train test split must occur if not using honest forest classifier.")

+        if self.permute_forest_fraction is not None and self.permute_forest_fraction < 0.0:
+            raise RuntimeError("permute_forest_fraction must be non-negative.")
+
+        if (
+            self.permute_forest_fraction is not None
+            and self.permute_forest_fraction * self.n_estimators < 1.0
+        ):
+            raise RuntimeError(
+                "permute_forest_fraction must be greater than 1./n_estimators, "
+                f"got {self.permute_forest_fraction}."
+ ) + return X, y, covariate_index def statistic( diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 39d5d9468..f1dbd1bf4 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -34,13 +34,14 @@ @pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree): n_samples = 50 + n_estimators = 10 est = FeatureImportanceForestClassifier( estimator=RandomForestClassifier( - n_estimators=10, + n_estimators=n_estimators, random_state=seed, ), # permute_per_tree=True, - permute_forest_fraction=1.0 / n_samples, + permute_forest_fraction=1.0 / n_estimators, test_size=0.7, random_state=seed, sample_dataset_per_tree=sample_dataset_per_tree, @@ -69,17 +70,48 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree): with pytest.raises(RuntimeError, match="Not all covariate_index"): est.statistic(iris_X[:n_samples], iris_y[:n_samples], [0, 1.0], metric="mi") + with pytest.raises( + RuntimeError, match="permute_forest_fraction must be greater than 1./n_estimators" + ): + est = FeatureImportanceForestClassifier( + estimator=RandomForestClassifier( + n_estimators=n_estimators, + random_state=seed, + ), + # permute_per_tree=True, + permute_forest_fraction=1.0 / n_samples, + test_size=0.7, + random_state=seed, + sample_dataset_per_tree=sample_dataset_per_tree, + ) + est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mse") + + with pytest.raises(RuntimeError, match="permute_forest_fraction must be non-negative."): + est = FeatureImportanceForestClassifier( + estimator=RandomForestClassifier( + n_estimators=n_estimators, + random_state=seed, + ), + # permute_per_tree=True, + permute_forest_fraction=-1.0 / n_estimators, + test_size=0.7, + random_state=seed, + sample_dataset_per_tree=sample_dataset_per_tree, + ) + est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mse") + @pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) def test_featureimportance_forest_stratified(sample_dataset_per_tree): n_samples = 100 + n_estimators = 10 est = FeatureImportanceForestClassifier( estimator=RandomForestClassifier( - n_estimators=10, + n_estimators=n_estimators, random_state=seed, ), # permute_per_tree=True, - permute_forest_fraction=1.0 / n_samples, + permute_forest_fraction=1.0 / n_estimators, test_size=0.7, random_state=seed, sample_dataset_per_tree=sample_dataset_per_tree, @@ -106,9 +138,10 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): def test_featureimportance_forest_errors(): # permute_per_tree = False sample_dataset_per_tree = True + n_estimators = 10 est = FeatureImportanceForestClassifier( estimator=RandomForestClassifier( - n_estimators=10, + n_estimators=n_estimators, ), test_size=0.5, permute_forest_fraction=None, @@ -395,14 +428,14 @@ def test_permute_per_tree_samples_consistency_with_sklearnforest(seed, sample_da n_features = 5 X = rng.uniform(size=(n_samples, n_features)) y = rng.integers(0, 2, size=n_samples) # Binary classification - + n_estimators = 10 clf = FeatureImportanceForestClassifier( estimator=HonestForestClassifier( - n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2 + n_estimators=n_estimators, random_state=seed, n_jobs=1, honest_fraction=0.2 ), test_size=0.5, # permute_per_tree=True, - permute_forest_fraction=1.0 / n_samples, + permute_forest_fraction=1.0 / n_estimators, sample_dataset_per_tree=sample_dataset_per_tree, ) other_clf = 
FeatureImportanceForestClassifier( From 2a6cda3eb9bc83ea84b90a1297a9494c6458bb9a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 8 Nov 2023 09:49:32 -0500 Subject: [PATCH 21/26] Remove fluff Signed-off-by: Adam Li --- sktree/stats/forestht.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index 81426aa51..fa5f88200 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -207,13 +207,6 @@ def _get_estimators_indices(self, stratifier=None, sample_separate=False): # deterministically seeds = self._seeds - # if sample_separate: - # if self._perm_seeds is None: - # new_rng = np.random.default_rng(np.random.randint(0, 1e6)) - # self._perm_seeds = new_rng.integers( - # low=0, high=np.iinfo(np.int32).max, size=len(self.estimator_.estimators_) - # ) - # seeds = self._perm_seeds for idx, tree in enumerate(self.estimator_.estimators_): seed = seeds[idx] From b183db18cea0e9683f8fe1ad9e5cabfc1625b2cc Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 8 Nov 2023 10:04:45 -0500 Subject: [PATCH 22/26] Remove any mention of permute_per_tree Signed-off-by: Adam Li --- sktree/stats/forestht.py | 2 -- sktree/stats/tests/test_coleman.py | 13 +++++-------- sktree/stats/tests/test_forestht.py | 21 +++++---------------- 3 files changed, 10 insertions(+), 26 deletions(-) diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index fa5f88200..86a5bb32f 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -964,7 +964,6 @@ def __init__( random_state=None, verbose=0, test_size=0.2, - # permute_per_tree=False, stratify=True, sample_dataset_per_tree=False, permute_forest_fraction=None, @@ -975,7 +974,6 @@ def __init__( random_state=random_state, verbose=verbose, test_size=test_size, - # permute_per_tree=permute_per_tree, sample_dataset_per_tree=sample_dataset_per_tree, stratify=stratify, train_test_split=train_test_split, diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py index 676b7a817..d0b40402a 100644 --- a/sktree/stats/tests/test_coleman.py +++ b/sktree/stats/tests/test_coleman.py @@ -52,7 +52,7 @@ n_jobs=-1, ), "random_state": seed, - "permute_per_tree": False, + "permute_per_forest_fraction": None, "sample_dataset_per_tree": False, }, 300, # n_samples @@ -69,7 +69,7 @@ # n_estimators=150, # n_jobs=-1, # ), - # "permute_per_tree": True, + # "permute_per_forest_fraction": 1. / 150, # "sample_dataset_per_tree": True, # }, # 300, # n_samples @@ -86,7 +86,7 @@ n_jobs=-1, ), # "random_state": seed, - "permute_per_tree": True, + "permute_per_forest_fraction": 1.0 / 125, "sample_dataset_per_tree": False, }, 300, # n_samples @@ -171,8 +171,6 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) n_estimators=100, n_jobs=-1, ), - # "random_state": seed, - "permute_per_tree": False, "sample_dataset_per_tree": False, }, 600, # n_samples @@ -190,7 +188,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) # n_jobs=-1, # ), # # "random_state": seed, - # "permute_per_tree": True, + # "permute_per_forest_fraction": 1. 
/ 150, # "sample_dataset_per_tree": True, # }, # 600, # n_samples @@ -206,8 +204,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) n_estimators=100, n_jobs=-1, ), - # "random_state": seed, - "permute_per_tree": True, + "permute_per_forest_fraction": 1.0 / 100, "sample_dataset_per_tree": False, }, 600, # n_samples diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index f1dbd1bf4..a242193b1 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -40,8 +40,7 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree): n_estimators=n_estimators, random_state=seed, ), - # permute_per_tree=True, - permute_forest_fraction=1.0 / n_estimators, + permute_forest_fraction=1.0 / n_estimators * 5, test_size=0.7, random_state=seed, sample_dataset_per_tree=sample_dataset_per_tree, @@ -78,7 +77,6 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree): n_estimators=n_estimators, random_state=seed, ), - # permute_per_tree=True, permute_forest_fraction=1.0 / n_samples, test_size=0.7, random_state=seed, @@ -92,8 +90,7 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree): n_estimators=n_estimators, random_state=seed, ), - # permute_per_tree=True, - permute_forest_fraction=-1.0 / n_estimators, + permute_forest_fraction=-1.0 / n_estimators * 5, test_size=0.7, random_state=seed, sample_dataset_per_tree=sample_dataset_per_tree, @@ -110,8 +107,7 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): n_estimators=n_estimators, random_state=seed, ), - # permute_per_tree=True, - permute_forest_fraction=1.0 / n_estimators, + permute_forest_fraction=1.0 / n_estimators * 5, test_size=0.7, random_state=seed, sample_dataset_per_tree=sample_dataset_per_tree, @@ -136,7 +132,6 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): def test_featureimportance_forest_errors(): - # permute_per_tree = False sample_dataset_per_tree = True n_estimators = 10 est = FeatureImportanceForestClassifier( @@ -187,7 +182,7 @@ def test_iris_pauc_statistic( limit = 0.1 max_features = "sqrt" n_repeats = 200 - n_estimators = 100 + n_estimators = 25 test_size = 0.2 # Check consistency on dataset iris. 
@@ -254,7 +249,6 @@ def test_iris_pauc_statistic( n_estimators=10, ), random_state=seed, - # permute_per_tree=False, permute_forest_fraction=None, sample_dataset_per_tree=False, ), @@ -376,7 +370,6 @@ def test_sample_size_consistency_of_estimator_indices_( n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2 ), test_size=0.5, - # permute_per_tree=permute_per_tree, permute_forest_fraction=permute_forest_fraction, sample_dataset_per_tree=sample_dataset_per_tree, stratify=False, @@ -423,7 +416,7 @@ def test_sample_size_consistency_of_estimator_indices_( @pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) @pytest.mark.parametrize("seed", [None, 0]) -def test_permute_per_tree_samples_consistency_with_sklearnforest(seed, sample_dataset_per_tree): +def test_sample_per_tree_samples_consistency_with_sklearnforest(seed, sample_dataset_per_tree): n_samples = 100 n_features = 5 X = rng.uniform(size=(n_samples, n_features)) @@ -434,7 +427,6 @@ def test_permute_per_tree_samples_consistency_with_sklearnforest(seed, sample_da n_estimators=n_estimators, random_state=seed, n_jobs=1, honest_fraction=0.2 ), test_size=0.5, - # permute_per_tree=True, permute_forest_fraction=1.0 / n_estimators, sample_dataset_per_tree=sample_dataset_per_tree, ) @@ -443,7 +435,6 @@ def test_permute_per_tree_samples_consistency_with_sklearnforest(seed, sample_da n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2 ), test_size=0.5, - # permute_per_tree=False, permute_forest_fraction=None, sample_dataset_per_tree=sample_dataset_per_tree, ) @@ -500,7 +491,6 @@ def test_small_dataset_independent(seed): n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.5 ), test_size=0.2, - # permute_per_tree=False, permute_forest_fraction=None, sample_dataset_per_tree=False, ) @@ -534,7 +524,6 @@ def test_small_dataset_dependent(seed): n_estimators=50, random_state=seed, n_jobs=1, honest_fraction=0.5 ), test_size=0.2, - # permute_per_tree=False, permute_forest_fraction=None, sample_dataset_per_tree=False, ) From 59ae89be645c6e0089989fcef47c76d876adc1f0 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 9 Nov 2023 11:47:19 -0500 Subject: [PATCH 23/26] Fix slow test Signed-off-by: Adam Li --- sktree/stats/tests/test_coleman.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py index d0b40402a..889da8af6 100644 --- a/sktree/stats/tests/test_coleman.py +++ b/sktree/stats/tests/test_coleman.py @@ -52,7 +52,7 @@ n_jobs=-1, ), "random_state": seed, - "permute_per_forest_fraction": None, + "permute_forest_fraction": None, "sample_dataset_per_tree": False, }, 300, # n_samples @@ -69,7 +69,7 @@ # n_estimators=150, # n_jobs=-1, # ), - # "permute_per_forest_fraction": 1. / 150, + # "permute_forest_fraction": 1. / 150, # "sample_dataset_per_tree": True, # }, # 300, # n_samples @@ -86,7 +86,7 @@ n_jobs=-1, ), # "random_state": seed, - "permute_per_forest_fraction": 1.0 / 125, + "permute_forest_fraction": 1.0 / 125, "sample_dataset_per_tree": False, }, 300, # n_samples @@ -188,7 +188,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) # n_jobs=-1, # ), # # "random_state": seed, - # "permute_per_forest_fraction": 1. / 150, + # "permute_forest_fraction": 1. 
/ 150, # "sample_dataset_per_tree": True, # }, # 600, # n_samples @@ -204,7 +204,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) n_estimators=100, n_jobs=-1, ), - "permute_per_forest_fraction": 1.0 / 100, + "permute_forest_fraction": 1.0 / 100, "sample_dataset_per_tree": False, }, 600, # n_samples From 2e1d53b88eb67592ad4ff3a42ca81c6a4fc66e0c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 9 Nov 2023 12:04:14 -0500 Subject: [PATCH 24/26] Try to fix slow Signed-off-by: Adam Li --- sktree/stats/tests/test_coleman.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py index 889da8af6..7f73e6a32 100644 --- a/sktree/stats/tests/test_coleman.py +++ b/sktree/stats/tests/test_coleman.py @@ -199,7 +199,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) FeatureImportanceForestClassifier, { "estimator": RandomForestClassifier( - max_features="sqrt", + max_features=0.3, # random_state=seed, n_estimators=100, n_jobs=-1, From f3aa7d76e016be9c69d673a85f41f2233302fa42 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 9 Nov 2023 12:33:18 -0500 Subject: [PATCH 25/26] Update Signed-off-by: Adam Li --- sktree/stats/tests/test_coleman.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py index 7f73e6a32..2ce888afe 100644 --- a/sktree/stats/tests/test_coleman.py +++ b/sktree/stats/tests/test_coleman.py @@ -199,12 +199,12 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) FeatureImportanceForestClassifier, { "estimator": RandomForestClassifier( - max_features=0.3, + max_features='sqrt', # random_state=seed, n_estimators=100, n_jobs=-1, ), - "permute_forest_fraction": 1.0 / 100, + "permute_forest_fraction": 0.3, "sample_dataset_per_tree": False, }, 600, # n_samples From 9d0f2db30f996c9d078c0b034661d16e0150e9bd Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 9 Nov 2023 13:04:56 -0500 Subject: [PATCH 26/26] Lint Signed-off-by: Adam Li --- sktree/stats/tests/test_coleman.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py index 2ce888afe..69efbe378 100644 --- a/sktree/stats/tests/test_coleman.py +++ b/sktree/stats/tests/test_coleman.py @@ -199,7 +199,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) FeatureImportanceForestClassifier, { "estimator": RandomForestClassifier( - max_features='sqrt', + max_features="sqrt", # random_state=seed, n_estimators=100, n_jobs=-1,
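
Taken together, this series leaves the hypothesis-testing estimators with three interacting options: ``train_test_split`` (added and enforced in PATCH 17, "Adding ability to turn off train/test split"), ``permute_forest_fraction`` with its validation (PATCH 20), and the pre-existing ``sample_dataset_per_tree``. Below is a minimal usage sketch of the end state, modeled on the ``test_no_traintest_split`` test above. The public import path ``sktree.stats.FeatureImportanceForestClassifier`` and the toy dataset are illustrative assumptions, not taken verbatim from the diffs.

import numpy as np

from sktree import HonestForestClassifier
from sktree.stats import FeatureImportanceForestClassifier  # assumed public path

seed = 12345
rng = np.random.default_rng(seed)
n_samples, n_features, n_estimators = 200, 5, 50

# toy data: class 1 doubles the scale of every feature, so all features
# carry signal about y
X = rng.uniform(size=(n_samples // 2, n_features))
X = np.vstack([X, X * 2])
y = np.concatenate([np.zeros(n_samples // 2), np.ones(n_samples // 2)])

est = FeatureImportanceForestClassifier(
    estimator=HonestForestClassifier(
        n_estimators=n_estimators,
        honest_fraction=0.5,
        random_state=seed,
        n_jobs=1,
    ),
    # with an honest forest, each tree's held-out "honest" half already acts
    # as an evaluation set, so the external train/test split can be disabled;
    # PATCH 17 raises a RuntimeError if this is attempted with a non-honest
    # forest
    train_test_split=False,
    # None performs a single permutation for the whole forest; a value such
    # as 1.0 / n_estimators instead permutes per tree, and PATCH 20 rejects
    # negative fractions or fractions below 1.0 / n_estimators
    permute_forest_fraction=None,
    sample_dataset_per_tree=False,
    random_state=seed,
)

# features 1 and 2 carry signal here, so the p-value should be small
stat, pvalue = est.test(X, y, covariate_index=[1, 2], metric="mi")
print(f"MI statistic: {stat:.3f}, p-value: {pvalue:.3f}")

Note that with ``train_test_split=False`` the ``test_size`` argument is effectively unused: as the diffs above show, ``train_test_samples_`` then returns every sample as training data and an empty test index array for each tree.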