diff --git a/doc/whats_new/v0.4.rst b/doc/whats_new/v0.4.rst index 6270ad13c..1a4de76c7 100644 --- a/doc/whats_new/v0.4.rst +++ b/doc/whats_new/v0.4.rst @@ -12,7 +12,8 @@ Version 0.4 Changelog --------- -- + +- |API| ``FeatureImportanceForest*`` now has a hyperparameter ``permute_forest_fraction`` to control how many permutations are performed per forest, by `Adam Li`_ (:pr:`145`) Code and Documentation Contributors ----------------------------------- diff --git a/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py b/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py index e6831a9e7..2ddec2289 100644 --- a/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py +++ b/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py @@ -108,19 +108,13 @@ ), random_state=seed, test_size=test_size, - permute_per_tree=False, - sample_dataset_per_tree=False, ) -print( - f"Permutation per tree: {est.permute_per_tree} and sampling dataset per tree: " - f"{est.sample_dataset_per_tree}" -) # we test for the first feature set, which is important and thus should return a pvalue < 0.05 stat, pvalue = est.test( X, y, covariate_index=np.arange(n_features_set, dtype=int), metric="mi", n_repeats=n_repeats ) -print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") +print(f"Estimated MI difference for the important feature set: {stat} with Pvalue: {pvalue}") # we test for the second feature set, which is unimportant and thus should return a pvalue > 0.05 stat, pvalue = est.test( @@ -130,7 +124,7 @@ metric="mi", n_repeats=n_repeats, ) -print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}") +print(f"Estimated MI difference for the unimportant feature set: {stat} with Pvalue: {pvalue}") # %% # References diff --git a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py index dbcb23ccb..fe37c8e33 100644 --- a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py +++ b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py @@ -134,16 +134,10 @@ def make_multiview_classification( ), random_state=seed, test_size=test_size, - permute_per_tree=False, - sample_dataset_per_tree=False, ) mv_results = dict() -print( - f"Permutation per tree: {est.permute_per_tree} and sampling dataset per tree: " - f"{est.sample_dataset_per_tree}" -) # we test for the first feature set, which is important and thus should return a pvalue < 0.05 stat, pvalue = est.test( X, y, covariate_index=np.arange(10, dtype=int), metric="mi", n_repeats=n_repeats ) @@ -179,8 +173,6 @@ def make_multiview_classification( ), random_state=seed, test_size=test_size, - permute_per_tree=False, - sample_dataset_per_tree=False, ) rf_results = dict() diff --git a/examples/hypothesis_testing/plot_might_auc.py b/examples/hypothesis_testing/plot_might_auc.py index fd032972a..74ea79abd 100644 --- a/examples/hypothesis_testing/plot_might_auc.py +++ b/examples/hypothesis_testing/plot_might_auc.py @@ -84,7 +84,7 @@ ), random_state=seed, test_size=test_size, - permute_per_tree=True, + permute_forest_fraction=1.0 / n_estimators, sample_dataset_per_tree=True, ) diff --git a/examples/hypothesis_testing/plot_might_mv_auc.py b/examples/hypothesis_testing/plot_might_mv_auc.py index d4e573d82..2a492eb7c 100644 --- a/examples/hypothesis_testing/plot_might_mv_auc.py +++ b/examples/hypothesis_testing/plot_might_mv_auc.py @@ -72,7 +72,7 @@ ), random_state=seed, test_size=test_size, - 
permute_per_tree=True, + permute_forest_fraction=1.0 / n_estimators, sample_dataset_per_tree=True, ) @@ -104,7 +104,7 @@ ), random_state=seed, test_size=test_size, - permute_per_tree=True, + permute_forest_fraction=1.0 / n_estimators, sample_dataset_per_tree=True, ) diff --git a/sktree/experimental/mutual_info.py b/sktree/experimental/mutual_info.py index a1820e715..85c8f0a83 100644 --- a/sktree/experimental/mutual_info.py +++ b/sktree/experimental/mutual_info.py @@ -147,7 +147,7 @@ def mutual_info_ksg( algorithm="kd_tree", n_jobs: int = -1, transform: str = "rank", - random_seed: int = None, + random_seed: Optional[int] = None, ): """Compute the generalized (conditional) mutual information KSG estimate. diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index 56a044c5c..86a5bb32f 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -1,4 +1,4 @@ -from typing import Callable, Tuple, Union +from typing import Callable, Optional, Tuple, Union import numpy as np from joblib import Parallel, delayed @@ -11,7 +11,6 @@ from sklearn.utils.validation import _is_fitted, check_X_y from sktree._lib.sklearn.ensemble._forest import ( - BaseForest, ForestClassifier, ForestRegressor, RandomForestClassifier, @@ -19,7 +18,9 @@ _get_n_samples_bootstrap, _parallel_build_trees, ) +from sktree.ensemble._honest_forest import HonestForestClassifier from sktree.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sktree.tree._classes import DTYPE from .utils import ( METRIC_FUNCTIONS, @@ -31,45 +32,51 @@ ) -def _parallel_build_trees_and_compute_posteriors( - forest: BaseForest, +def _parallel_predict_proba(predict_proba, X, indices_test): + """ + This is a utility function for joblib's Parallel. + + It cannot be defined inside ForestClassifier or ForestRegressor, because + joblib is unable to pickle functions that are defined locally within a class. + """ + # each tree predicts probabilities as a list of (n_samples, n_classes[i]) arrays, one per output + prediction = predict_proba(X[indices_test, :], check_input=False) + return prediction + + +def _parallel_build_trees_with_sepdata( + tree: Union[DecisionTreeClassifier, DecisionTreeRegressor], + n_trees: int, idx: int, indices_train: ArrayLike, - indices_test: ArrayLike, X: ArrayLike, y: ArrayLike, covariate_index, - posterior_arr: ArrayLike, - predict_posteriors: bool, - permute_per_tree: bool, - type_of_target, - sample_weight: ArrayLike = None, + bootstrap: bool, + max_samples, + sample_weight: Optional[ArrayLike] = None, class_weight=None, missing_values_in_feature_mask=None, classes=None, + random_state=None, ): """Parallel function to build trees on separate subsets of the data. This inherently assumes that the caller function defines the training indices for each tree. """ - tree: Union[DecisionTreeClassifier, DecisionTreeRegressor] = forest.estimators_[idx] - if permute_per_tree and covariate_index is not None: - random_state = tree.random_state - else: - random_state = forest.random_state - + rng = np.random.default_rng(random_state) X_train = X[indices_train, :] y_train = y[indices_train, ...] 
- rng = np.random.default_rng(random_state) - if forest.bootstrap: + if bootstrap: n_samples_bootstrap = _get_n_samples_bootstrap( - n_samples=X_train.shape[0], max_samples=forest.max_samples + n_samples=X_train.shape[0], max_samples=max_samples ) else: n_samples_bootstrap = None + # XXX: this currently creates a copy of X_train in RAM, which is not ideal # individual tree permutation of y labels if covariate_index is not None: indices = np.arange(X_train.shape[0], dtype=int) @@ -78,32 +85,21 @@ def _parallel_build_trees_and_compute_posteriors( perm_X_cov = X_train[index_arr, covariate_index] X_train[:, covariate_index] = perm_X_cov - if type_of_target == "binary": - y_train = y_train.ravel() - tree = _parallel_build_trees( tree, - forest.bootstrap, + bootstrap, X_train, y_train, sample_weight, idx, - len(forest.estimators_), + n_trees, verbose=0, class_weight=class_weight, n_samples_bootstrap=n_samples_bootstrap, missing_values_in_feature_mask=missing_values_in_feature_mask, classes=classes, ) - - if predict_posteriors: - # XXX: currently assumes n_outputs_ == 1 - y_pred = tree.predict_proba(X[indices_test, :]).reshape(-1, tree.n_classes_) - else: - y_pred = tree.predict(X[indices_test, :]).reshape(-1, tree.n_outputs_) - - # Fill test set posteriors & set rest NaN - posterior_arr[idx, indices_test, :] = y_pred # posterior + return tree class BaseForestHT(MetaEstimatorMixin): @@ -120,18 +116,22 @@ def __init__( random_state=None, verbose=0, test_size=0.2, - permute_per_tree=True, - sample_dataset_per_tree=True, stratify=False, + sample_dataset_per_tree=False, + permute_forest_fraction=None, + train_test_split=True, ): self.estimator = estimator self.random_state = random_state self.verbose = verbose self.test_size = test_size - self.permute_per_tree = permute_per_tree - self.sample_dataset_per_tree = sample_dataset_per_tree self.stratify = stratify + self.train_test_split = train_test_split + # XXX: possibly removing these parameters + self.sample_dataset_per_tree = sample_dataset_per_tree + self.permute_forest_fraction = permute_forest_fraction + self.n_samples_test_ = None self._n_samples_ = None self._metric = None @@ -144,7 +144,15 @@ def __init__( @property def n_estimators(self): - return self.estimator_.n_estimators + try: + return self.estimator_.n_estimators + except AttributeError: + # ``estimator_`` only exists after fitting; fall back to the + # default estimator constructed by the subclass + return self._get_estimator().n_estimators + + def _get_estimator(self): + raise NotImplementedError("Subclasses should implement this!") def reset(self): class_attributes = dir(type(self)) @@ -170,24 +178,35 @@ def _get_estimators_indices(self, stratifier=None, sample_separate=False): # Get drawn indices along both sample and feature axes rng = np.random.default_rng(self.estimator_.random_state) - if self.sample_dataset_per_tree: + if self.permute_forest_fraction is None: + permute_forest_fraction = 0.0 + else: + permute_forest_fraction = self.permute_forest_fraction + + # TODO: consolidate how we "sample/permute" per subset of the forest + if self.sample_dataset_per_tree or permute_forest_fraction > 0.0: + # sample random seeds if self._seeds is None: self._seeds = [] + self._n_permutations = 0 - for tree in self.estimator_.estimators_: - if tree.random_state is None: - self._seeds.append(rng.integers(low=0, high=np.iinfo(np.int32).max)) - else: - self._seeds.append(tree.random_state) + num_trees_per_seed = max( + int(permute_forest_fraction * len(self.estimator_.estimators_)), 1 + ) + for tree_idx, tree in enumerate(self.estimator_.estimators_): + if tree_idx == 0 or tree_idx % num_trees_per_seed == 0:
+ if tree.random_state is None: + seed = rng.integers(low=0, high=np.iinfo(np.int32).max) + else: + seed = tree.random_state + + self._n_permutations += 1 + self._seeds.append(seed) + + # now that we have the random seeds, we can sample the train/test indices + # deterministically seeds = self._seeds - if sample_separate: - if self._perm_seeds is None: - new_rng = np.random.default_rng(np.random.randint(0, 1e6)) - self._perm_seeds = new_rng.integers( - low=0, high=np.iinfo(np.int32).max, size=len(self.estimator_.estimators_) - ) - seeds = self._perm_seeds for idx, tree in enumerate(self.estimator_.estimators_): seed = seeds[idx] @@ -211,6 +230,7 @@ def _get_estimators_indices(self, stratifier=None, sample_separate=False): indices_train, indices_test = train_test_split( indices, + shuffle=True, test_size=self.test_size, stratify=stratifier, random_state=self._seeds, @@ -235,6 +255,13 @@ def train_test_samples_(self): if self._n_samples_ is None: raise RuntimeError("The estimator must be fitted before accessing this attribute.") + # we are not train/test splitting, then + if not self.train_test_split: + return [ + (np.arange(self._n_samples_, dtype=int), np.array([], dtype=int)) + for _ in range(len(self.estimator_.estimators_)) + ] + # Stratifier uses a cached _y attribute if available stratifier = self._y if is_classifier(self.estimator_) and self.stratify else None @@ -255,8 +282,8 @@ def _statistic( ): raise NotImplementedError("Subclasses should implement this!") - def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: ArrayLike = None): - X, y = check_X_y(X, y, ensure_2d=True, copy=True, multi_output=True) + def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: Optional[ArrayLike] = None): + X, y = check_X_y(X, y, ensure_2d=True, copy=True, multi_output=True, dtype=DTYPE) if y.ndim != 2: y = y.reshape(-1, 1) @@ -289,13 +316,28 @@ def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: ArrayLike = f"If running on a new dataset, call the 'reset' method." ) + if not self.train_test_split and not isinstance(self.estimator, HonestForestClassifier): + raise RuntimeError("Train test split must occur if not using honest forest classifier.") + + if self.permute_forest_fraction is not None and self.permute_forest_fraction < 0.0: + raise RuntimeError("permute_forest_fraction must be non-negative.") + + if ( + self.permute_forest_fraction is not None + and self.permute_forest_fraction * self.n_estimators < 1.0 + ): + raise RuntimeError( + "permute_forest_fraction must be greater than 1./n_estimators, " + f"got {self.permute_forest_fraction}." + ) + return X, y, covariate_index def statistic( self, X: ArrayLike, y: ArrayLike, - covariate_index: ArrayLike = None, + covariate_index: Optional[ArrayLike] = None, metric="mi", return_posteriors: bool = False, check_input: bool = True, @@ -414,7 +456,7 @@ def test( self, X, y, - covariate_index: ArrayLike = None, + covariate_index: Optional[ArrayLike] = None, metric: str = "mi", n_repeats: int = 1000, return_posteriors: bool = True, @@ -577,12 +619,17 @@ class FeatureImportanceForestRegressor(BaseForestHT): test_size : float, default=0.2 Proportion of samples per tree to use for the test set. - permute_per_tree : bool, default=True - Whether to permute the covariate index per tree or per forest. - sample_dataset_per_tree : bool, default=False Whether to sample the dataset per tree or per forest. + permute_forest_fraction : float, default=None + The fraction of trees to permute the covariate index for. 
If None, then + just one permutation is performed. If sampling a permutation per tree + is desirable, then the fraction should be set to ``1. / n_estimators``. + + train_test_split : bool, default=True + Whether to split the dataset before passing to the forest. + Attributes ---------- estimator_ : BaseForest @@ -635,16 +682,18 @@ def __init__( random_state=None, verbose=0, test_size=0.2, - permute_per_tree=True, - sample_dataset_per_tree=True, + sample_dataset_per_tree=False, + permute_forest_fraction=None, + train_test_split=True, ): super().__init__( estimator=estimator, random_state=random_state, verbose=verbose, test_size=test_size, - permute_per_tree=permute_per_tree, sample_dataset_per_tree=sample_dataset_per_tree, + permute_forest_fraction=permute_forest_fraction, + train_test_split=train_test_split, ) def _get_estimator(self): @@ -660,7 +709,7 @@ def statistic( self, X: ArrayLike, y: ArrayLike, - covariate_index: ArrayLike = None, + covariate_index: Optional[ArrayLike] = None, metric="mse", return_posteriors: bool = False, check_input: bool = True, @@ -722,28 +771,33 @@ def _statistic( # both sampling dataset per tree or permuting per tree requires us to bypass the # sklearn API to fit each tree individually - if self.sample_dataset_per_tree or self.permute_per_tree: - Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")( - delayed(_parallel_build_trees_and_compute_posteriors)( - estimator, + if self.sample_dataset_per_tree or self.permute_forest_fraction: + if self.permute_forest_fraction and covariate_index is not None: + random_states = [tree.random_state for tree in estimator.estimators_] + else: + random_states = [estimator.random_state] * len(estimator.estimators_) + + trees = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")( + delayed(_parallel_build_trees_with_sepdata)( + estimator.estimators_[idx], + len(estimator.estimators_), idx, indices_train, - indices_test, X, y, covariate_index, - posterior_arr, - False, - self.permute_per_tree, - self._type_of_target_, + bootstrap=estimator.bootstrap, + max_samples=estimator.max_samples, + random_state=random_states[idx], ) - for idx, (indices_train, indices_test) in enumerate(self.train_test_samples_) + for idx, (indices_train, _) in enumerate(self.train_test_samples_) ) + estimator.estimators_ = trees else: # fitting a forest will only get one unique train/test split indices_train, indices_test = self.train_test_samples_[0] - X_train, X_test = X[indices_train, :], X[indices_test, :] + X_train, _ = X[indices_train, :], X[indices_test, :] y_train, y_test = y[indices_train, :], y[indices_test, :] if covariate_index is not None: @@ -761,16 +815,33 @@ def _statistic( y_train = y_train.ravel() estimator.fit(X_train, y_train) - # construct posterior array for all trees (n_trees, n_samples_test, n_outputs) - for itree, tree in enumerate(estimator.estimators_): - posterior_arr[itree, indices_test, ...] 
= tree.predict(X_test).reshape( - -1, tree.n_outputs_ - ) - # set variables to compute metric samples = indices_test y_true_final = y_test + # TODO: probably a more elegant way of doing this + if self.train_test_split: + # accumulate the predictions across all trees + all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( + delayed(_parallel_predict_proba)( + estimator.estimators_[idx].predict, X, indices_test + ) + for idx, (_, indices_test) in enumerate(self.train_test_samples_) + ) + for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)): + _, indices_test = est_indices + posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_) + else: + all_indices = np.arange(self._n_samples_, dtype=int) + + # accumulate the predictions across all trees + all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( + delayed(_parallel_predict_proba)(estimator.estimators_[idx].predict, X, all_indices) + for idx in range(len(estimator.estimators_)) + ) + for itree, proba in enumerate(all_proba): + posterior_arr[itree, ...] = proba.reshape(-1, estimator.n_outputs_) + # determine if there are any nans in the final posterior array, when # averaged over the trees samples = _non_nan_samples(posterior_arr) @@ -830,14 +901,18 @@ class FeatureImportanceForestClassifier(BaseForestHT): test_size : float, default=0.2 Proportion of samples per tree to use for the test set. - permute_per_tree : bool, default=True - Whether to permute the covariate index per tree or per forest. + stratify : bool, default=True + Whether to stratify the samples by class labels. sample_dataset_per_tree : bool, default=False Whether to sample the dataset per tree or per forest. - stratify : bool, default=True - Whether to stratify the samples by class labels. + permute_forest_fraction : float, default=None + The fraction of trees to permute the covariate index for. If None, then + just one permutation is performed. + + train_test_split : bool, default=True + Whether to split the data into train/test before passing to the forest. 
Attributes ---------- @@ -889,18 +964,20 @@ def __init__( self, estimator=None, random_state=None, verbose=0, test_size=0.2, - permute_per_tree=True, - sample_dataset_per_tree=True, stratify=True, + sample_dataset_per_tree=False, + permute_forest_fraction=None, + train_test_split=True, ): super().__init__( estimator=estimator, random_state=random_state, verbose=verbose, test_size=test_size, - permute_per_tree=permute_per_tree, sample_dataset_per_tree=sample_dataset_per_tree, stratify=stratify, + train_test_split=train_test_split, + permute_forest_fraction=permute_forest_fraction, ) def _get_estimator(self): @@ -946,28 +1023,33 @@ def _statistic( # both sampling dataset per tree or permuting per tree requires us to bypass the # sklearn API to fit each tree individually - if self.sample_dataset_per_tree or self.permute_per_tree: - Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")( - delayed(_parallel_build_trees_and_compute_posteriors)( - estimator, + if self.sample_dataset_per_tree or self.permute_forest_fraction: + if self.permute_forest_fraction and covariate_index is not None: + random_states = [tree.random_state for tree in estimator.estimators_] + else: + random_states = [estimator.random_state] * len(estimator.estimators_) + + trees = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose, prefer="threads")( + delayed(_parallel_build_trees_with_sepdata)( + estimator.estimators_[idx], + len(estimator.estimators_), idx, indices_train, - indices_test, X, y, covariate_index, - posterior_arr, - predict_posteriors, - self.permute_per_tree, - self._type_of_target_, + bootstrap=estimator.bootstrap, + max_samples=estimator.max_samples, + random_state=random_states[idx], ) - for idx, (indices_train, indices_test) in enumerate(self.train_test_samples_) + for idx, (indices_train, _) in enumerate(self.train_test_samples_) ) + estimator.estimators_ = trees else: # fitting a forest will only get one unique train/test split indices_train, indices_test = self.train_test_samples_[0] - X_train, X_test = X[indices_train, :], X[indices_test, :] + X_train, _ = X[indices_train, :], X[indices_test, :] y_train = y[indices_train, :] if covariate_index is not None: @@ -985,20 +1067,57 @@ def _statistic( y_train = y_train.ravel() estimator.fit(X_train, y_train) - # construct posterior array for all trees (n_trees, n_samples_test, n_outputs) - for itree, tree in enumerate(estimator.estimators_): - if predict_posteriors: - # XXX: currently assumes n_outputs_ == 1 - posterior_arr[itree, indices_test, ...] = tree.predict_proba(X_test).reshape( - -1, tree.n_classes_ + # set variables to compute metric + samples = indices_test + + # list of tree outputs. Each tree output is (n_samples, n_outputs), or (n_samples,) + if predict_posteriors: + # TODO: probably a more elegant way of doing this + if self.train_test_split: + # accumulate the predictions across all trees + all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( + delayed(_parallel_predict_proba)( + estimator.estimators_[idx].predict_proba, X, indices_test ) - else: - posterior_arr[itree, indices_test, ...] 
= tree.predict(X_test).reshape( - -1, tree.n_outputs_ + for idx, (_, indices_test) in enumerate(self.train_test_samples_) + ) + else: + all_indices = np.arange(self._n_samples_, dtype=int) + + # accumulate the predictions across all trees + all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( + delayed(_parallel_predict_proba)( + estimator.estimators_[idx].predict_proba, X, all_indices + ) + for idx in range(len(estimator.estimators_)) + ) + else: + all_proba = Parallel(n_jobs=estimator.n_jobs, verbose=self.verbose)( + delayed(_parallel_predict_proba)( + estimator.estimators_[idx].predict, X, indices_test + ) + for idx, (_, indices_test) in enumerate(self.train_test_samples_) + ) + for itree, (proba, est_indices) in enumerate(zip(all_proba, self.train_test_samples_)): + _, indices_test = est_indices + + if predict_posteriors: + if self.train_test_split: + posterior_arr[itree, indices_test, ...] = proba.reshape( + -1, estimator.n_classes_ ) + else: + posterior_arr[itree, ...] = proba.reshape(-1, estimator.n_classes_) + else: + posterior_arr[itree, indices_test, ...] = proba.reshape(-1, estimator.n_outputs_) - # set variables to compute metric - samples = indices_test if metric == "auc": # at this point, posterior_final is the predicted posterior for only the positive class # as more than one output is not supported. diff --git a/sktree/stats/permutationforest.py b/sktree/stats/permutationforest.py index 4a27f539f..edaa0878d 100644 --- a/sktree/stats/permutationforest.py +++ b/sktree/stats/permutationforest.py @@ -1,3 +1,5 @@ +from typing import Optional + import numpy as np from numpy.typing import ArrayLike from sklearn.base import MetaEstimatorMixin, clone, is_classifier @@ -62,7 +64,7 @@ def _statistic( estimator: BaseForest, X: ArrayLike, y: ArrayLike, - covariate_index: ArrayLike = None, + covariate_index: Optional[ArrayLike] = None, metric="mse", return_posteriors: bool = False, seed=None, @@ -117,7 +119,7 @@ def statistic( self, X: ArrayLike, y: ArrayLike, - covariate_index: ArrayLike = None, + covariate_index: Optional[ArrayLike] = None, metric="mse", return_posteriors: bool = False, check_input: bool = True, diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py index 676b7a817..69efbe378 100644 --- a/sktree/stats/tests/test_coleman.py +++ b/sktree/stats/tests/test_coleman.py @@ -52,7 +52,7 @@ n_jobs=-1, ), "random_state": seed, - "permute_per_tree": False, + "permute_forest_fraction": None, "sample_dataset_per_tree": False, }, 300, # n_samples @@ -69,7 +69,7 @@ # n_estimators=150, # n_jobs=-1, # ), - # "permute_per_tree": True, + # "permute_forest_fraction": 1. / 150, # "sample_dataset_per_tree": True, # }, # 300, # n_samples @@ -86,7 +86,7 @@ n_jobs=-1, ), # "random_state": seed, - "permute_per_tree": True, + "permute_forest_fraction": 1.0 / 125, "sample_dataset_per_tree": False, }, 300, # n_samples @@ -171,8 +171,6 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) n_estimators=100, n_jobs=-1, ), - # "random_state": seed, - "permute_per_tree": False, "sample_dataset_per_tree": False, }, 600, # n_samples @@ -190,7 +188,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) # n_jobs=-1, # ), # # "random_state": seed, - # "permute_per_tree": True, + # "permute_forest_fraction": 1. 
/ 150, # "sample_dataset_per_tree": True, # }, # 600, # n_samples @@ -206,8 +204,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) n_estimators=100, n_jobs=-1, ), - # "random_state": seed, - "permute_per_tree": True, + "permute_forest_fraction": 0.3, "sample_dataset_per_tree": False, }, 600, # n_samples diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index e71e5e09b..a242193b1 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -33,17 +33,18 @@ @pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree): + n_samples = 50 + n_estimators = 10 est = FeatureImportanceForestClassifier( estimator=RandomForestClassifier( - n_estimators=10, + n_estimators=n_estimators, random_state=seed, ), - permute_per_tree=True, + permute_forest_fraction=1.0 / n_estimators * 5, test_size=0.7, random_state=seed, sample_dataset_per_tree=sample_dataset_per_tree, ) - n_samples = 50 est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mse") assert ( @@ -68,20 +69,49 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree): with pytest.raises(RuntimeError, match="Not all covariate_index"): est.statistic(iris_X[:n_samples], iris_y[:n_samples], [0, 1.0], metric="mi") + with pytest.raises( + RuntimeError, match="permute_forest_fraction must be greater than 1./n_estimators" + ): + est = FeatureImportanceForestClassifier( + estimator=RandomForestClassifier( + n_estimators=n_estimators, + random_state=seed, + ), + permute_forest_fraction=1.0 / n_samples, + test_size=0.7, + random_state=seed, + sample_dataset_per_tree=sample_dataset_per_tree, + ) + est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mse") + + with pytest.raises(RuntimeError, match="permute_forest_fraction must be non-negative."): + est = FeatureImportanceForestClassifier( + estimator=RandomForestClassifier( + n_estimators=n_estimators, + random_state=seed, + ), + permute_forest_fraction=-1.0 / n_estimators * 5, + test_size=0.7, + random_state=seed, + sample_dataset_per_tree=sample_dataset_per_tree, + ) + est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mse") + @pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) def test_featureimportance_forest_stratified(sample_dataset_per_tree): + n_samples = 100 + n_estimators = 10 est = FeatureImportanceForestClassifier( estimator=RandomForestClassifier( - n_estimators=10, + n_estimators=n_estimators, random_state=seed, ), - permute_per_tree=True, + permute_forest_fraction=1.0 / n_estimators * 5, test_size=0.7, random_state=seed, sample_dataset_per_tree=sample_dataset_per_tree, ) - n_samples = 100 est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi") _, indices_test = est.train_test_samples_[0] @@ -102,14 +132,14 @@ def test_featureimportance_forest_stratified(sample_dataset_per_tree): def test_featureimportance_forest_errors(): - permute_per_tree = False sample_dataset_per_tree = True + n_estimators = 10 est = FeatureImportanceForestClassifier( estimator=RandomForestClassifier( - n_estimators=10, + n_estimators=n_estimators, ), test_size=0.5, - permute_per_tree=permute_per_tree, + permute_forest_fraction=None, sample_dataset_per_tree=sample_dataset_per_tree, ) with pytest.raises(RuntimeError, match="The estimator must be fitted"): @@ -139,20 +169,20 @@ def test_featureimportance_forest_errors(): ], ) @pytest.mark.parametrize( - 
"permute_per_tree", + "permute_forest_fraction", [ - True, - False, + None, + 0.5, ], ) @pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) def test_iris_pauc_statistic( - criterion, honest_prior, estimator, permute_per_tree, sample_dataset_per_tree + criterion, honest_prior, estimator, permute_forest_fraction, sample_dataset_per_tree ): limit = 0.1 max_features = "sqrt" n_repeats = 200 - n_estimators = 100 + n_estimators = 25 test_size = 0.2 # Check consistency on dataset iris. @@ -168,14 +198,14 @@ def test_iris_pauc_statistic( ), test_size=test_size, sample_dataset_per_tree=sample_dataset_per_tree, - permute_per_tree=permute_per_tree, + permute_forest_fraction=permute_forest_fraction, ) # now add completely uninformative feature X = np.hstack((iris_X, rng.standard_normal(size=(iris_X.shape[0], 4)))) # test for unimportant feature set clf.reset() - if sample_dataset_per_tree and not permute_per_tree: + if sample_dataset_per_tree and permute_forest_fraction is None: # test in another test return @@ -219,7 +249,7 @@ def test_iris_pauc_statistic( n_estimators=10, ), random_state=seed, - permute_per_tree=False, + permute_forest_fraction=None, sample_dataset_per_tree=False, ), ], @@ -323,19 +353,26 @@ def test_pickle(tmpdir): assert_array_equal(getattr(clf, attr), getattr(clf_pickle, attr)) -@pytest.mark.parametrize("permute_per_tree", [True, False], ids=["permute_per_tree", "no_permute"]) +@pytest.mark.parametrize( + "permute_forest_fraction", + [None, 0.5], + ids=["no_permute", "permute_forest_fraction"], +) @pytest.mark.parametrize( "sample_dataset_per_tree", [True, False], ids=["sample_dataset_per_tree", "no_sample_dataset"] ) -def test_sample_size_consistency_of_estimator_indices_(permute_per_tree, sample_dataset_per_tree): +def test_sample_size_consistency_of_estimator_indices_( + permute_forest_fraction, sample_dataset_per_tree +): """Test that the test-sample indices are what is expected.""" clf = FeatureImportanceForestClassifier( estimator=HonestForestClassifier( n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2 ), test_size=0.5, - permute_per_tree=permute_per_tree, + permute_forest_fraction=permute_forest_fraction, sample_dataset_per_tree=sample_dataset_per_tree, + stratify=False, ) n_samples = 100 @@ -346,10 +383,12 @@ def test_sample_size_consistency_of_estimator_indices_(permute_per_tree, sample_ _, posteriors, samples = clf.statistic( X, y, covariate_index=None, return_posteriors=True, metric="mi" ) - if sample_dataset_per_tree: + + if sample_dataset_per_tree or permute_forest_fraction is not None: # check the non-nans non_nan_idx = _non_nan_samples(posteriors) - assert clf.n_samples_test_ == n_samples, f"{clf.n_samples_test_} != {n_samples}" + if sample_dataset_per_tree: + assert clf.n_samples_test_ == n_samples, f"{clf.n_samples_test_} != {n_samples}" sorted_sample_idx = sorted(np.unique(samples)) sorted_est_samples_idx = sorted( @@ -367,24 +406,28 @@ def test_sample_size_consistency_of_estimator_indices_(permute_per_tree, sample_ f"{set(sorted_est_samples_idx) - set(sorted_sample_idx)}", ) else: - assert_array_equal(samples, sorted(clf.train_test_samples_[0][1])) + assert_array_equal( + samples, + sorted(clf.train_test_samples_[0][1]), + err_msg=f"Samples {set(samples) - set(sorted(clf.train_test_samples_[0][1]))}.", + ) assert len(_non_nan_samples(posteriors)) == len(samples) @pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) @pytest.mark.parametrize("seed", [None, 0]) -def 
test_permute_per_tree_samples_consistency_with_sklearnforest(seed, sample_dataset_per_tree): +def test_sample_per_tree_samples_consistency_with_sklearnforest(seed, sample_dataset_per_tree): n_samples = 100 n_features = 5 X = rng.uniform(size=(n_samples, n_features)) y = rng.integers(0, 2, size=n_samples) # Binary classification - + n_estimators = 10 clf = FeatureImportanceForestClassifier( estimator=HonestForestClassifier( - n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2 + n_estimators=n_estimators, random_state=seed, n_jobs=1, honest_fraction=0.2 ), test_size=0.5, - permute_per_tree=True, + permute_forest_fraction=1.0 / n_estimators, sample_dataset_per_tree=sample_dataset_per_tree, ) other_clf = FeatureImportanceForestClassifier( @@ -392,7 +435,7 @@ def test_permute_per_tree_samples_consistency_with_sklearnforest(seed, sample_da n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.2 ), test_size=0.5, - permute_per_tree=False, + permute_forest_fraction=None, sample_dataset_per_tree=sample_dataset_per_tree, ) @@ -405,8 +448,8 @@ def test_permute_per_tree_samples_consistency_with_sklearnforest(seed, sample_da assert_array_equal(clf.train_test_samples_[idx][0], estimator_train_test_indices[idx][0]) assert_array_equal(clf.train_test_samples_[idx][1], estimator_train_test_indices[idx][1]) - # if the sample_dataset_per_tree, then the indices should be different across all - if sample_dataset_per_tree: + # if the sample_dataset_per_tree, then the indices should be different across all trees + if sample_dataset_per_tree or clf.permute_forest_fraction > 0.0: for indices, other_indices in combinations(clf.train_test_samples_, 2): assert not np.array_equal(indices[0], other_indices[0]) assert not np.array_equal(indices[1], other_indices[1]) @@ -426,7 +469,7 @@ def test_permute_per_tree_samples_consistency_with_sklearnforest(seed, sample_da ) # when seed is passed, the indices should be deterministic - if seed is not None: + if seed is not None and sample_dataset_per_tree: assert_array_equal( clf.train_test_samples_[idx][0], other_clf.train_test_samples_[idx][0] ) @@ -448,7 +491,7 @@ def test_small_dataset_independent(seed): n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.5 ), test_size=0.2, - permute_per_tree=False, + permute_forest_fraction=None, sample_dataset_per_tree=False, ) stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi") @@ -475,14 +518,13 @@ def test_small_dataset_dependent(seed): y = np.vstack( [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))] ) # Binary classification - print(X.shape, y.shape) clf = FeatureImportanceForestClassifier( estimator=HonestForestClassifier( n_estimators=50, random_state=seed, n_jobs=1, honest_fraction=0.5 ), test_size=0.2, - permute_per_tree=False, + permute_forest_fraction=None, sample_dataset_per_tree=False, ) stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi") @@ -494,20 +536,55 @@ def test_small_dataset_dependent(seed): assert pvalue <= 0.05 -# @pytest.mark.monitor_test -# def test_memory_usage(): -# n_samples = 1000 -# n_features = 5000 -# X = rng.uniform(size=(n_samples, n_features)) -# y = rng.integers(0, 2, size=n_samples) # Binary classification +@flaky(max_runs=3) +def test_no_traintest_split(): + n_samples = 500 + n_features = 5 + rng = np.random.default_rng(seed) -# clf = FeatureImportanceForestClassifier( -# estimator=HonestForestClassifier( -# n_estimators=10, random_state=seed, n_jobs=-1, honest_fraction=0.5 -# ), -# test_size=0.2, -# permute_per_tree=False, 
-# sample_dataset_per_tree=False, -) + X = rng.uniform(size=(n_samples // 2, n_features)) + X2 = X * 2 + X = np.vstack([X, X2]) + y = np.vstack( + [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))] + ) # Binary classification + + clf = FeatureImportanceForestClassifier( + estimator=HonestForestClassifier( + n_estimators=50, + max_features=n_features, + random_state=seed, + n_jobs=1, + honest_fraction=0.5, + ), + test_size=0.2, + train_test_split=False, + permute_forest_fraction=None, + sample_dataset_per_tree=False, + ) + stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi") + + # since no train-test split, the training is all the data and the testing is none of the data + assert_array_equal(clf.train_test_samples_[0][0], np.arange(n_samples)) + assert_array_equal(clf.train_test_samples_[0][1], np.array([])) -# stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi") + assert ~np.isnan(pvalue) + assert ~np.isnan(stat) + assert pvalue <= 0.05, f"{pvalue}" + + stat, pvalue = clf.test(X, y, metric="mi") + assert pvalue <= 0.05, f"{pvalue}" + + X = rng.uniform(size=(n_samples, n_features)) + y = rng.integers(0, 2, size=n_samples) # Binary classification + clf.reset() + + stat, pvalue = clf.test(X, y, metric="mi") + assert_almost_equal(stat, 0.0, decimal=1) + assert pvalue > 0.05, f"{pvalue}" + + stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi") + assert ~np.isnan(pvalue) + assert ~np.isnan(stat) + assert pvalue > 0.05, f"{pvalue}" diff --git a/sktree/stats/utils.py b/sktree/stats/utils.py index f21dd00a8..7ff7a9f05 100644 --- a/sktree/stats/utils.py +++ b/sktree/stats/utils.py @@ -1,4 +1,4 @@ -from typing import Tuple +from typing import Optional, Tuple import numpy as np from numpy.typing import ArrayLike @@ -112,7 +112,7 @@ def _compute_null_distribution_perm( est: ForestClassifier, metric: str = "mse", n_repeats: int = 1000, - seed: int = None, + seed: Optional[int] = None, ) -> ArrayLike: """Compute null distribution using permutation method. @@ -173,7 +173,7 @@ def _compute_null_distribution_coleman( y_pred_proba_perm: ArrayLike, metric: str = "mse", n_repeats: int = 1000, - seed: int = None, + seed: Optional[int] = None, ) -> Tuple[ArrayLike, ArrayLike]: """Compute null distribution using Coleman method. 
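The grouping logic behind ``permute_forest_fraction`` (see ``_get_estimators_indices`` in ``sktree/stats/forestht.py`` above) can be summarised in a small standalone sketch. The helper name ``group_permutation_seeds`` below is illustrative and not part of the library, and the sketch omits the branch that reuses a tree's preset ``random_state``:

import numpy as np

def group_permutation_seeds(n_estimators, permute_forest_fraction, rng):
    # trees are grouped into blocks of max(int(fraction * n_trees), 1);
    # every tree in a block shares one seed, i.e. one permutation
    num_trees_per_seed = max(int(permute_forest_fraction * n_estimators), 1)
    seeds = []
    for tree_idx in range(n_estimators):
        if tree_idx % num_trees_per_seed == 0:
            seed = rng.integers(low=0, high=np.iinfo(np.int32).max)
        seeds.append(seed)
    return seeds

rng = np.random.default_rng(0)
# a fraction of 0.5 over 10 trees yields 2 blocks, hence 2 distinct permutations
assert len(set(group_permutation_seeds(10, 0.5, rng))) == 2
# a fraction of 1.0 / n_estimators yields a fresh permutation per tree
assert len(set(group_permutation_seeds(10, 0.1, rng))) == 10

This is why ``_check_input`` rejects fractions below ``1. / n_estimators``: the block size would round to zero trees per permutation.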
diff --git a/sktree/tree/_honest_tree.py b/sktree/tree/_honest_tree.py index 011ce7590..ebd88d99e 100644 --- a/sktree/tree/_honest_tree.py +++ b/sktree/tree/_honest_tree.py @@ -527,6 +527,7 @@ def _fit( nonzero_indices = np.where(_sample_weight > 0)[0] + # TODO: perhaps we want to stratify this split self.structure_indices_ = rng.choice( nonzero_indices, int((1 - self.honest_fraction) * len(nonzero_indices)), diff --git a/sktree/tree/_oblique_tree.pxd b/sktree/tree/_oblique_tree.pxd index db7813946..f85af4ce1 100644 --- a/sktree/tree/_oblique_tree.pxd +++ b/sktree/tree/_oblique_tree.pxd @@ -42,7 +42,7 @@ cdef class ObliqueTree(Tree): ) noexcept nogil cdef void _compute_feature_importances( self, - cnp.float64_t[:] importances, + float64_t[:] importances, Node* node ) noexcept nogil diff --git a/sktree/tree/_oblique_tree.pyx b/sktree/tree/_oblique_tree.pyx index 99a3d6fc0..ecd257862 100644 --- a/sktree/tree/_oblique_tree.pyx +++ b/sktree/tree/_oblique_tree.pyx @@ -217,11 +217,14 @@ cdef class ObliqueTree(Tree): self.proj_vec_weights.resize(capacity) self.proj_vec_indices.resize(capacity) - # value memory is initialised to 0 to enable classifier argmax if capacity > self.capacity: + # value memory is initialised to 0 to enable classifier argmax memset((self.value + self.capacity * self.value_stride), 0, (capacity - self.capacity) * self.value_stride * sizeof(float64_t)) + # node memory is initialised to 0 to ensure deterministic pickle (padding in Node struct) + memset((self.nodes + self.capacity), 0, (capacity - self.capacity) * sizeof(Node)) + # if capacity smaller than node_count, adjust the counter if capacity < self.node_count: self.node_count = capacity @@ -269,10 +272,7 @@ cdef class ObliqueTree(Tree): # get the index of the node cdef intp_t node_id = node - self.nodes - # cdef intp_t n_projections = proj_vec_indices.size() # compute projection of the data based on trained tree - # proj_vec_weights = self.proj_vec_weights[node_id] - # proj_vec_indices = self.proj_vec_indices[node_id] for j in range(0, self.proj_vec_indices[node_id].size()): feature_index = self.proj_vec_indices[node_id][j] weight = self.proj_vec_weights[node_id][j] @@ -286,7 +286,7 @@ cdef class ObliqueTree(Tree): cdef void _compute_feature_importances( self, - cnp.float64_t[:] importances, + float64_t[:] importances, Node* node ) noexcept nogil: """Compute feature importances from a Node in the Tree. 
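The two ``memset`` additions above zero newly grown node memory because the ``Node`` struct contains alignment padding; with plain malloc those padding bytes are uninitialised, so two otherwise identical trees could pickle to different byte strings. A rough illustration of the padding issue, using a hypothetical ``MiniNode`` struct rather than the library's actual ``Node`` layout:

import ctypes

class MiniNode(ctypes.Structure):
    # a 1-byte field followed by an 8-byte field forces alignment padding
    _fields_ = [
        ("left_child", ctypes.c_byte),
        ("threshold", ctypes.c_double),
    ]

# 16 bytes on typical 64-bit platforms: 7 of them are padding
print(ctypes.sizeof(MiniNode))

a, b = MiniNode(1, 0.5), MiniNode(1, 0.5)
# ctypes zero-initialises new objects, padding included, so the raw bytes
# agree -- the added memset gives the malloc'd node arrays the same guarantee
assert bytes(a) == bytes(b)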
diff --git a/sktree/tree/tests/test_all_trees.py b/sktree/tree/tests/test_all_trees.py index a5608fda5..66a9ea307 100644 --- a/sktree/tree/tests/test_all_trees.py +++ b/sktree/tree/tests/test_all_trees.py @@ -1,7 +1,7 @@ import joblib import numpy as np import pytest -from numpy.testing import assert_almost_equal, assert_array_almost_equal, assert_array_equal +from numpy.testing import assert_almost_equal, assert_array_equal from sklearn.base import is_classifier from sklearn.datasets import make_blobs from sklearn.tree._tree import TREE_LEAF @@ -53,9 +53,14 @@ def assert_tree_equal(d, s, message): assert_almost_equal(d.impurity, s.impurity, err_msg=message + ": inequal impurity") - assert_array_almost_equal( - d.value[external], s.value[external], err_msg=message + ": inequal value" - ) + assert_array_equal(d.value[external], s.value[external], err_msg=message + ": inequal value") + + if hasattr(d, "get_projection_matrix"): + assert_array_equal( + d.get_projection_matrix(), + s.get_projection_matrix(), + err_msg=message + ": inequal projection matrix", + ) X_small = np.array( diff --git a/sktree/tree/tests/test_tree.py b/sktree/tree/tests/test_tree.py index 478aa6d5b..94cdfef62 100644 --- a/sktree/tree/tests/test_tree.py +++ b/sktree/tree/tests/test_tree.py @@ -1,3 +1,6 @@ +import pickle +import sys + import numpy as np import pytest from numpy.testing import assert_allclose @@ -20,6 +23,8 @@ UnsupervisedObliqueDecisionTree, ) +from .test_all_trees import assert_tree_equal + CLUSTER_CRITERIONS = ("twomeans", "fastbic") REG_CRITERIONS = ("squared_error", "absolute_error", "friedman_mse", "poisson") CLF_CRITERIONS = ("gini", "entropy") @@ -141,12 +146,6 @@ digits.target = digits.target[perm] -def assert_tree_equal(d, s, message): - assert s.node_count == d.node_count, "{0}: inequal number of node ({1} != {2})".format( - message, s.node_count, d.node_count - ) - - def test_pickle_splitters(): """Test that splitters are picklable.""" import tempfile @@ -526,3 +525,28 @@ def test_balance_property(criterion, Tree): reg = Tree(criterion=criterion) reg.fit(X, y) assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y)) + + +@pytest.mark.parametrize("Tree", ALL_TREES.values()) +def test_deterministic_pickle(Tree): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/27268 + # Uninitialised memory would lead to the two pickle strings being different. 
+ tree1 = Tree(random_state=0).fit(iris.data, iris.target) + tree2 = Tree(random_state=0).fit(iris.data, iris.target) + + pickle1 = pickle.dumps(tree1) + pickle2 = pickle.dumps(tree2) + + pickle1_tree = pickle.loads(pickle1) + pickle2_tree = pickle.loads(pickle2) + assert_tree_equal( + pickle1_tree.tree_, + pickle2_tree.tree_, + "The trees of the original and loaded classifiers are not equal.", + ) + assert sys.getsizeof(pickle1_tree) == sys.getsizeof(pickle2_tree) + + # TODO: byte-for-byte pickle equality does not yet hold for the supervised trees + if Tree not in (list(CLF_TREES.values()) + list(REG_TREES.values())): + assert pickle1 == pickle2, f"Failed with {Tree}" diff --git a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx index 51f6e6202..e72802f66 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx @@ -197,11 +197,14 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): self.proj_vec_weights.resize(capacity) self.proj_vec_indices.resize(capacity) - # value memory is initialised to 0 to enable classifier argmax if capacity > self.capacity: + # value memory is initialised to 0 to enable classifier argmax memset((self.value + self.capacity * self.value_stride), 0, (capacity - self.capacity) * self.value_stride * sizeof(float64_t)) + # node memory is initialised to 0 to ensure deterministic pickle (padding in Node struct) + memset((self.nodes + self.capacity), 0, (capacity - self.capacity) * sizeof(Node)) + # if capacity smaller than node_count, adjust the counter if capacity < self.node_count: self.node_count = capacity
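Taken together, these changes replace the old ``permute_per_tree=True`` idiom with ``permute_forest_fraction``. A minimal usage sketch of the renamed API, assuming the public import paths ``sktree.HonestForestClassifier`` and ``sktree.stats.FeatureImportanceForestClassifier``; the data here are pure noise, so the test should fail to reject:

import numpy as np
from sktree import HonestForestClassifier
from sktree.stats import FeatureImportanceForestClassifier

n_estimators = 100
est = FeatureImportanceForestClassifier(
    estimator=HonestForestClassifier(
        n_estimators=n_estimators, honest_fraction=0.5, random_state=0
    ),
    test_size=0.2,
    # one fresh permutation per tree, replacing the old permute_per_tree=True
    permute_forest_fraction=1.0 / n_estimators,
    sample_dataset_per_tree=False,
)

rng = np.random.default_rng(0)
X = rng.uniform(size=(300, 5))
y = rng.integers(0, 2, size=300)

# X carries no signal about y, so the pvalue should exceed 0.05
stat, pvalue = est.test(X, y, covariate_index=[1, 2], metric="mi", n_repeats=1000)
print(stat, pvalue)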