From 7b9edfedfac4249b9acb84c22bfaf800f1f50892 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 22 Feb 2024 14:26:54 -0500 Subject: [PATCH 01/14] Fix Signed-off-by: Adam Li --- .pre-commit-config.yaml | 1 + pyproject.toml | 11 +- sktree/_lib/sklearn_fork | 2 +- sktree/ensemble/_extensions.py | 35 +--- sktree/ensemble/_honest_forest.py | 268 ++++++-------------------- sktree/tests/test_honest_forest.py | 51 ++++- sktree/tree/_honest_tree.py | 57 ++++-- sktree/tree/_multiview.py | 4 +- sktree/tree/tests/test_honest_tree.py | 33 ++++ 9 files changed, 186 insertions(+), 276 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9b5825e5c..22194c830 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -63,6 +63,7 @@ repos: hooks: - id: toml-sort files: ^pyproject\.toml$ + args: ['-i'] # mypy - repo: https://github.com/pre-commit/mirrors-mypy diff --git a/pyproject.toml b/pyproject.toml index 37837441a..5e613f7c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,9 @@ requires = [ "numpy>=1.25; python_version>='3.9'" ] +[lint.per-file-ignores] +'__init__.py' = ['F401'] + [project] name = "scikit-tree" version = "0.7.0dev0" @@ -203,7 +206,9 @@ add_ignore = 'D100,D104,D105,D107' [tool.pytest.ini_options] minversion = '6.0' addopts = '--durations 20 --junit-xml=junit-results.xml --verbose --ignore=sktree/_lib/ -k "not slowtest"' -filterwarnings = [] +filterwarnings = [ + 'ignore:Using sklearn tree so store_leaf_values cannot be set.*' +] [tool.rstcheck] ignore_directives = [ @@ -261,10 +266,10 @@ extend-exclude = [ 'validation' ] line-length = 88 -ignore = ['E731'] +lint.ignore = ['E731'] [tool.ruff.per-file-ignores] -'__init__.py' = ['F401'] +"__init__.py" = ["F401"] [tool.spin] package = 'sktree' diff --git a/sktree/_lib/sklearn_fork b/sktree/_lib/sklearn_fork index db91ddb6f..d48716a6b 160000 --- a/sktree/_lib/sklearn_fork +++ b/sktree/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit db91ddb6f090ac091588dd844be6d9c60afc341f +Subproject commit d48716a6b2cc6373b9e66bf959f2b43b89f10c5d diff --git a/sktree/ensemble/_extensions.py b/sktree/ensemble/_extensions.py index 99aef5a6c..1933d1415 100644 --- a/sktree/ensemble/_extensions.py +++ b/sktree/ensemble/_extensions.py @@ -1,5 +1,4 @@ import threading -from numbers import Integral, Real import numpy as np from joblib import Parallel, delayed @@ -7,6 +6,7 @@ from sklearn.utils import check_random_state from sklearn.utils.validation import check_is_fitted +from .._lib.sklearn.ensemble._forest import _get_n_samples_bootstrap from ..tree._classes import DTYPE @@ -55,39 +55,6 @@ def _generate_sample_indices(random_state, n_samples, n_samples_bootstrap, boots return sample_indices -def _get_n_samples_bootstrap(n_samples, max_samples): - """ - Get the number of samples in a bootstrap sample. - - XXX: Note this is copied from sklearn. We override the ability - to sample a higher number of bootstrap samples to enable sampling - closer to 80% unique training data points for in-bag computation. - - Parameters - ---------- - n_samples : int - Number of samples in the dataset. - max_samples : int or float - The maximum number of samples to draw from the total available: - - if float, this indicates a fraction of the total; - - if int, this indicates the exact number of samples; - - if None, this indicates the total number of samples. - - Returns - ------- - n_samples_bootstrap : int - The total number of samples to draw for the bootstrap sample. 
- """ - if max_samples is None: - return n_samples - - if isinstance(max_samples, Integral): - return max_samples - - if isinstance(max_samples, Real): - return round(n_samples * max_samples) - - class ForestMixin: """A set of mixin methods to extend forests models API.""" diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py index b9bd9a4bd..7a3b23eb5 100644 --- a/sktree/ensemble/_honest_forest.py +++ b/sktree/ensemble/_honest_forest.py @@ -3,25 +3,19 @@ import threading from numbers import Integral -from warnings import catch_warnings, simplefilter, warn +from warnings import catch_warnings, simplefilter import numpy as np from joblib import Parallel, delayed -from scipy.sparse import issparse -from sklearn.base import _fit_context -from sklearn.ensemble._base import _partition_estimators -from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -from sklearn.exceptions import DataConversionWarning -from sklearn.utils import check_random_state, compute_sample_weight -from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.base import _fit_context, clone +from sklearn.ensemble._base import _partition_estimators, _set_random_states +from sklearn.utils import compute_sample_weight from sklearn.utils._param_validation import Interval, RealNotInt -from sklearn.utils.multiclass import type_of_target -from sklearn.utils.validation import _check_sample_weight, check_is_fitted +from sklearn.utils.validation import check_is_fitted from .._lib.sklearn.ensemble._forest import ForestClassifier -from .._lib.sklearn.tree._tree import DOUBLE, DTYPE from ..tree import HonestTreeClassifier -from ._extensions import ForestClassifierMixin, _generate_sample_indices, _get_n_samples_bootstrap +from ._extensions import ForestClassifierMixin, _generate_sample_indices def _parallel_build_trees( @@ -419,27 +413,30 @@ def __init__( honest_fraction=0.5, tree_estimator=None, stratify=False, + **tree_estimator_params, ): + estimator_params = ( + "criterion", + "splitter", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + "ccp_alpha", + "tree_estimator", + "honest_fraction", + "honest_prior", + "stratify", + ) + super().__init__( estimator=HonestTreeClassifier(), n_estimators=n_estimators, - estimator_params=( - "criterion", - "splitter", - "max_depth", - "min_samples_split", - "min_samples_leaf", - "min_weight_fraction_leaf", - "max_features", - "max_leaf_nodes", - "min_impurity_decrease", - "random_state", - "ccp_alpha", - "tree_estimator", - "honest_fraction", - "honest_prior", - "stratify", - ), + estimator_params=estimator_params, bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, @@ -463,9 +460,10 @@ def __init__( self.honest_prior = honest_prior self.tree_estimator = tree_estimator self.stratify = stratify + self._tree_estimator_params = tree_estimator_params @_fit_context(prefer_skip_nested_validation=True) - def fit(self, X, y, sample_weight=None, classes=None): + def fit(self, X, y, sample_weight=None, classes=None, **fit_params): """ Build a forest of trees from the training set (X, y). @@ -490,202 +488,50 @@ def fit(self, X, y, sample_weight=None, classes=None): classes : array-like of shape (n_classes,), default=None List of all the classes that can possibly appear in the y vector. + **fit_params : dict + Parameters to pass to the underlying base tree estimators. 
+ + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + Returns ------- self : object Fitted estimator. """ - # XXX: This entire function is a copy of what is in scikit-learn - # with the exception of: - # - _get_n_samples_bootstrap is a re-defined function to allow higher - # max_samples - - MAX_INT = np.iinfo(np.int32).max - # Validate or convert input data - if issparse(y): - raise ValueError("sparse multilabel-indicator for y is not supported.") - - X, y = self._validate_data( - X, - y, - multi_output=True, - accept_sparse="csc", - dtype=DTYPE, - force_all_finite=False, - ) - # _compute_missing_values_in_feature_mask checks if X has missing values and - # will raise an error if the underlying tree base estimator can't handle missing - # values. Only the criterion is required to determine if the tree supports - # missing values. - estimator = type(self.estimator)(criterion=self.criterion) - missing_values_in_feature_mask = estimator._compute_missing_values_in_feature_mask( - X, estimator_name=self.__class__.__name__ - ) - - if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X) - - if issparse(X): - # Pre-sort indices to avoid that each individual tree of the - # ensemble sorts the indices. - X.sort_indices() - - y = np.atleast_1d(y) - if y.ndim == 2 and y.shape[1] == 1: - warn( - ( - "A column-vector y was passed when a 1d array was" - " expected. Please change the shape of y to " - "(n_samples,), for example using ravel()." - ), - DataConversionWarning, - stacklevel=2, - ) - - if y.ndim == 1: - # reshape is necessary to preserve the data contiguity against vs - # [:, np.newaxis] that does not. - y = np.reshape(y, (-1, 1)) - - if self.criterion == "poisson": - if np.any(y < 0): - raise ValueError( - "Some value(s) of y are negative which is " - "not allowed for Poisson regression." - ) - if np.sum(y) <= 0: - raise ValueError( - "Sum of y is not strictly positive which " - "is necessary for Poisson regression." 
- ) - - self._n_samples, self.n_outputs_ = y.shape - - y, expanded_class_weight = self._validate_y_class_weight(y, classes=classes) - - if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: - y = np.ascontiguousarray(y, dtype=DOUBLE) - if expanded_class_weight is not None: - if sample_weight is not None: - sample_weight = sample_weight * expanded_class_weight - else: - sample_weight = expanded_class_weight + super().fit(X, y, sample_weight=sample_weight, classes=classes, **fit_params) - # compute the number of samples we want to sample for each tree - # possibly without replacement - n_samples_bootstrap = _get_n_samples_bootstrap( - n_samples=X.shape[0], max_samples=self.max_samples + # Compute honest decision function + self.honest_decision_function_ = self._predict_proba( + X, indices=self.honest_indices_, impute_missing=np.nan ) - self._n_samples_bootstrap = n_samples_bootstrap - - self._validate_estimator() - - if not self.bootstrap and self.oob_score: - raise ValueError("Out of bag estimation only available if bootstrap=True") - - random_state = check_random_state(self.random_state) - - if not self.warm_start or not hasattr(self, "estimators_"): - # Free allocated memory, if any - self.estimators_ = [] - - n_more_estimators = self.n_estimators - len(self.estimators_) - - if self.max_bins is not None: - # `_openmp_effective_n_threads` is used to take cgroups CPU quotes - # into account when determine the maximum number of threads to use. - n_threads = _openmp_effective_n_threads() - - # Bin the data - # For ease of use of the API, the user-facing GBDT classes accept the - # parameter max_bins, which doesn't take into account the bin for - # missing values (which is always allocated). However, since max_bins - # isn't the true maximal number of bins, all other private classes - # (binmapper, histbuilder...) accept n_bins instead, which is the - # actual total number of bins. Everywhere in the code, the - # convention is that n_bins == max_bins + 1 - n_bins = self.max_bins + 1 # + 1 for missing values - self._bin_mapper = _BinMapper( - n_bins=n_bins, - # is_categorical=self.is_categorical_, - known_categories=None, - random_state=random_state, - n_threads=n_threads, - ) - - # XXX: in order for this to work with the underlying tree submodule's Cython - # code, we need to convert this into the original data's DTYPE because - # the Cython code assumes that `DTYPE` is used. - # The proper implementation will be a lot more complicated and should be - # tackled once scikit-learn has finalized their inclusion of missing data - # and categorical support for decision trees - X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) - else: - self._bin_mapper = None - - if n_more_estimators < 0: - raise ValueError( - "n_estimators=%d must be larger or equal to " - "len(estimators_)=%d when warm_start==True" - % (self.n_estimators, len(self.estimators_)) - ) - - elif n_more_estimators == 0: - warn("Warm-start fitting without increasing n_estimators does not " "fit new trees.") - else: - if self.warm_start and len(self.estimators_) > 0: - # We draw from the random state to get the random state we - # would have got if we hadn't used a warm_start. 
- random_state.randint(MAX_INT, size=len(self.estimators_)) + return self - trees = self._construct_trees( - X, - y, - sample_weight, - random_state, - n_samples_bootstrap, - missing_values_in_feature_mask, - classes, - n_more_estimators, - ) + def _make_estimator(self, append=True, random_state=None): + """Make and configure a copy of the `estimator_` attribute. - # Collect newly grown trees - self.estimators_.extend(trees) - - if self.oob_score and (n_more_estimators > 0 or not hasattr(self, "oob_score_")): - y_type = type_of_target(y) - if y_type == "unknown" or ( - self._estimator_type == "classifier" and y_type == "multiclass-multioutput" - ): - # FIXME: we could consider to support multiclass-multioutput if - # we introduce or reuse a constructor parameter (e.g. - # oob_score) allowing our user to pass a callable defining the - # scoring strategy on OOB sample. - raise ValueError( - "The type of target cannot be used to compute OOB " - f"estimates. Got {y_type} while only the following are " - "supported: continuous, continuous-multioutput, binary, " - "multiclass, multilabel-indicator." - ) + Warning: This method should be used to properly instantiate new + sub-estimators. + """ + estimator = clone(self.estimator_) + estimator.set_params(**{p: getattr(self, p) for p in self.estimator_params}) - if callable(self.oob_score): - self._set_oob_score_and_attributes(X, y, scoring_function=self.oob_score) - else: - self._set_oob_score_and_attributes(X, y) + # XXX: This is the only change compared to scikit-learn's make_estimator + # additionally set the tree_estimator_params + estimator._tree_estimator_params = self._tree_estimator_params - # Decapsulate classes_ attributes - if hasattr(self, "classes_") and self.n_outputs_ == 1: - self.n_classes_ = self.n_classes_[0] - self.classes_ = self.classes_[0] + if random_state is not None: + _set_random_states(estimator, random_state) - # Compute honest decision function - self.honest_decision_function_ = self._predict_proba( - X, indices=self.honest_indices_, impute_missing=np.nan - ) + if append: + self.estimators_.append(estimator) - return self + return estimator def _construct_trees( self, diff --git a/sktree/tests/test_honest_forest.py b/sktree/tests/test_honest_forest.py index 2efc4d438..c06fb716c 100644 --- a/sktree/tests/test_honest_forest.py +++ b/sktree/tests/test_honest_forest.py @@ -12,7 +12,11 @@ from sktree.datasets import make_quadratic_classification from sktree.ensemble import HonestForestClassifier from sktree.stats.utils import _mutual_information -from sktree.tree import ObliqueDecisionTreeClassifier, PatchObliqueDecisionTreeClassifier +from sktree.tree import ( + MultiViewDecisionTreeClassifier, + ObliqueDecisionTreeClassifier, + PatchObliqueDecisionTreeClassifier, +) CLF_CRITERIONS = ("gini", "entropy") @@ -155,16 +159,15 @@ def test_max_samples(): uf = uf.fit(X, y) -@pytest.mark.parametrize("bootstrap", [True, False]) @pytest.mark.parametrize("max_samples", [0.75, 1.0]) -def test_honest_forest_has_deterministic_sampling_for_oob_structure_and_leaves( - bootstrap, max_samples -): +def test_honest_forest_has_deterministic_sampling_for_oob_structure_and_leaves(max_samples): """Test that honest forest can produce the oob, structure and leaf-node samples. When bootstrap is True, oob should be exclusive from structure and leaf-node samples. When bootstrap is False, there is no oob. 
""" + bootstrap = True + n_estimators = 5 est = HonestForestClassifier( n_estimators=n_estimators, @@ -325,7 +328,7 @@ def test_honest_forest_with_sklearn_trees(): https://github.com/neurodata/scikit-tree/pull/157.""" # generate the high-dimensional quadratic data - X, y = make_quadratic_classification(1024, 4096, noise=True, seed=0) + X, y = make_quadratic_classification(256, 2048, noise=True, seed=0) y = y.squeeze() print(X.shape, y.shape) print(np.sum(y) / len(y)) @@ -372,7 +375,7 @@ def test_honest_forest_with_sklearn_trees_with_auc(): scores = [] sk_scores = [] for idx in range(10): - X, y = make_quadratic_classification(1024, 4096, noise=True, seed=idx) + X, y = make_quadratic_classification(256, 2048, noise=True, seed=idx) y = y.squeeze() skForest.fit(X, y) @@ -412,7 +415,7 @@ def test_honest_forest_with_sklearn_trees_with_mi(): scores = [] sk_scores = [] for idx in range(10): - X, y = make_quadratic_classification(1024, 4096, noise=True, seed=idx) + X, y = make_quadratic_classification(256, 2048, noise=True, seed=idx) y = y.squeeze() skForest.fit(X, y) @@ -432,3 +435,35 @@ def test_honest_forest_with_sklearn_trees_with_mi(): print(np.mean(scores), np.mean(sk_scores)) print(np.std(scores), np.std(sk_scores)) assert_allclose(np.mean(sk_scores), np.mean(scores), atol=0.005) + + +def test_honest_forest_with_tree_estimator_params(): + X = np.ones((20, 4)) + X[10:] *= -1 + y = [0] * 10 + [1] * 10 + + # test with a parameter that is a repeat of an init parameter + clf = HonestForestClassifier( + tree_estimator=DecisionTreeClassifier(), + random_state=0, + feature_set_ends=[10, 20], + ) + with pytest.raises(ValueError, match=r"Invalid parameter\(s\)"): + clf.fit(X, y) + + # test with a parameter that is not in any init signature + clf = HonestForestClassifier( + tree_estimator=MultiViewDecisionTreeClassifier(), + random_state=0, + blah=0, + ) + with pytest.raises(ValueError, match=r"Invalid parameter\(s\)"): + clf.fit(X, y) + + # passing in a valid argument to the tree_estimator should work + clf = HonestForestClassifier( + tree_estimator=MultiViewDecisionTreeClassifier(), + random_state=0, + feature_set_ends=[10, 20], + ) + clf.fit(X, y) diff --git a/sktree/tree/_honest_tree.py b/sktree/tree/_honest_tree.py index 2a434daea..a0cb34950 100644 --- a/sktree/tree/_honest_tree.py +++ b/sktree/tree/_honest_tree.py @@ -4,7 +4,7 @@ import numpy as np from sklearn.base import ClassifierMixin, MetaEstimatorMixin, _fit_context, clone from sklearn.model_selection import StratifiedShuffleSplit -from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions +from sklearn.utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions from sklearn.utils.multiclass import _check_partial_fit_first_call, check_classification_targets from sklearn.utils.validation import check_is_fitted, check_X_y @@ -18,6 +18,11 @@ class HonestTreeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseDecisionTree Parameters ---------- + tree_estimator : object, default=None + Instantiated tree of type BaseDecisionTree from sktree. + If None, then sklearn's DecisionTreeClassifier with default parameters will + be used. + criterion : {"gini", "entropy"}, default="gini" The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain. @@ -150,12 +155,6 @@ class HonestTreeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseDecisionTree Read more in the :ref:`User Guide `. 
- tree_estimator : object, default=None - Instantiated tree of type BaseDecisionTree from sktree. - If None, then DecisionTreeClassifier with default parameters will - be used. Note that one MUST use trees imported from the `sktree.tree` - API namespace rather than from `sklearn.tree`. - honest_fraction : float, default=0.5 Fraction of training samples used for estimates in the leaves. The remaining samples will be used to learn the tree structure. A larger @@ -272,6 +271,10 @@ class frequency in the voting subsample. _parameter_constraints: dict = { **BaseDecisionTree._parameter_constraints, + "tree_estimator": [ + HasMethods(["fit", "predict", "predict_proba", "apply"]), + None, + ], "honest_fraction": [Interval(RealNotInt, 0.0, 1.0, closed="neither")], "honest_prior": [StrOptions({"empirical", "uniform", "ignore"})], "stratify": ["boolean"], @@ -279,6 +282,7 @@ class frequency in the voting subsample. def __init__( self, + tree_estimator=None, criterion="gini", splitter="best", max_depth=None, @@ -292,10 +296,10 @@ def __init__( class_weight=None, ccp_alpha=0.0, monotonic_cst=None, - tree_estimator=None, honest_fraction=0.5, honest_prior="empirical", stratify=False, + **tree_estimator_params, ): self.tree_estimator = tree_estimator self.criterion = criterion @@ -310,13 +314,15 @@ def __init__( self.random_state = random_state self.min_impurity_decrease = min_impurity_decrease self.ccp_alpha = ccp_alpha + self.monotonic_cst = monotonic_cst + self.honest_fraction = honest_fraction self.honest_prior = honest_prior - self.monotonic_cst = monotonic_cst self.stratify = stratify # XXX: to enable this, we need to also reset the leaf node samples during `_set_leaf_nodes` self.store_leaf_values = False + self._tree_estimator_params = tree_estimator_params @_fit_context(prefer_skip_nested_validation=True) def fit( @@ -521,6 +527,15 @@ def _partition_honest_indices(self, y, sample_weight): return _sample_weight + def _get_estimator(self): + """Resolve which estimator to return (default is DecisionTreeClassifier)""" + if self.tree_estimator is None: + self.estimator_ = DecisionTreeClassifier(random_state=self.random_state) + else: + # XXX: maybe error out if the base tree estimator is already fitted + self.estimator_ = clone(self.tree_estimator) + return self.estimator_ + def _fit( self, X, @@ -564,14 +579,21 @@ def _fit( if check_input: X, y = check_X_y(X, y, multi_output=True) - if self.tree_estimator is None: - self.estimator_ = DecisionTreeClassifier(random_state=self.random_state) - else: - # XXX: maybe error out if the tree_estimator is already fitted - self.estimator_ = clone(self.tree_estimator) + self.estimator_ = self._get_estimator() - # obtain the structure sample weights - sample_weights_structure = self._partition_honest_indices(y, sample_weight) + # check that all of tree_estimator_params are valid + init_params = self.estimator_.__init__.__code__.co_varnames[1:] # exclude 'self' + honest_tree_init_params = self.__init__.__code__.co_varnames[1:] # exclude 'self' + invalid_params = [] + for param in self._tree_estimator_params.keys(): + if param not in init_params or param in honest_tree_init_params: + invalid_params.append(param) + + if invalid_params: + raise ValueError( + f"Invalid parameter(s) for estimator {self.estimator_.__class__.__name__}: " + f'{", ".join(invalid_params)}' + ) self.estimator_.set_params( **dict( @@ -602,6 +624,9 @@ def _fit( warn("Using sklearn tree so store_leaf_values cannot be set.") + # obtain the structure sample weights + sample_weights_structure = 
self._partition_honest_indices(y, sample_weight) + # Learn structure on subsample # XXX: this allows us to use BaseDecisionTree without partial_fit API try: diff --git a/sktree/tree/_multiview.py b/sktree/tree/_multiview.py index 7d9edda73..ff1fa3a9f 100644 --- a/sktree/tree/_multiview.py +++ b/sktree/tree/_multiview.py @@ -573,8 +573,6 @@ def _fit( # - self.max_features_arr contains a possible array-like setting of max_features self._max_features_arr = self.max_features self.max_features = None - self = super()._fit( - X, y, sample_weight, check_input, missing_values_in_feature_mask, classes - ) + super()._fit(X, y, sample_weight, check_input, missing_values_in_feature_mask, classes) self.max_features = self._max_features_arr return self diff --git a/sktree/tree/tests/test_honest_tree.py b/sktree/tree/tests/test_honest_tree.py index b412faff7..d0b673fea 100644 --- a/sktree/tree/tests/test_honest_tree.py +++ b/sktree/tree/tests/test_honest_tree.py @@ -9,6 +9,7 @@ from sktree._lib.sklearn.tree import DecisionTreeClassifier from sktree.tree import ( HonestTreeClassifier, + MultiViewDecisionTreeClassifier, ObliqueDecisionTreeClassifier, PatchObliqueDecisionTreeClassifier, ) @@ -62,6 +63,38 @@ def test_toy_accuracy(): np.testing.assert_array_equal(clf.predict(X), y) +def test_honest_tree_with_tree_estimator_params(): + X = np.ones((20, 4)) + X[10:] *= -1 + y = [0] * 10 + [1] * 10 + + # test with a parameter that is a repeat of an init parameter + clf = HonestTreeClassifier( + tree_estimator=DecisionTreeClassifier(), + random_state=0, + feature_set_ends=[10, 20], + ) + with pytest.raises(ValueError, match=r"Invalid parameter\(s\)"): + clf.fit(X, y) + + # test with a parameter that is not in any init signature + clf = HonestTreeClassifier( + tree_estimator=MultiViewDecisionTreeClassifier(), + random_state=0, + blah=0, + ) + with pytest.raises(ValueError, match=r"Invalid parameter\(s\)"): + clf.fit(X, y) + + # passing in a valid argument to the tree_estimator should work + clf = HonestTreeClassifier( + tree_estimator=MultiViewDecisionTreeClassifier(), + random_state=0, + feature_set_ends=[10, 20], + ) + clf.fit(X, y) + + @pytest.mark.parametrize( "honest_prior, val", [ From 5e75cf080ba7a582a9f6050bcae91bdb634c4dac Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 22 Feb 2024 14:44:41 -0500 Subject: [PATCH 02/14] Add changelog Signed-off-by: Adam Li --- doc/whats_new/v0.7.rst | 4 ++++ sktree/ensemble/_honest_forest.py | 13 +++++++++---- sktree/tree/_honest_tree.py | 8 +++++++- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v0.7.rst b/doc/whats_new/v0.7.rst index 2d474beca..cc389e17e 100644 --- a/doc/whats_new/v0.7.rst +++ b/doc/whats_new/v0.7.rst @@ -27,6 +27,10 @@ Changelog by `Sambit Panda`_ (:pr:`#203`) - |Feature| :func:`sktree.stats.build_coleman_forest` and :func:`sktree.stats.build_permutation_forest` are added to compute p-values given an estimator and permutation-estimator, `Adam Li`_ (:pr:`#222`) +- |API| :class:`sktree.HonestForestClassifier` and :class:`sktree.tree.HonestTreeClassifier` + now overwrite all parameters set by the underlying ``tree_estimator`` and allow you to directly + pass any extra parameters that ``tree_estimator`` has compared to the original + :class:`~sklearn.tree.DecisionTreeClassifier`, `Adam Li`_ (:pr:`#228`) Code and Documentation Contributors ----------------------------------- diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py index 7a3b23eb5..33833d0a9 100644 --- 
a/sktree/ensemble/_honest_forest.py +++ b/sktree/ensemble/_honest_forest.py @@ -246,15 +246,20 @@ class HonestForestClassifier(ForestClassifier, ForestClassifierMixin): fraction creates shallower trees with lower variance estimates. tree_estimator : object, default=None - Type of decision tree classifier to use. By default `None`, which - defaults to `sktree.tree.DecisionTreeClassifier`. Note - that one MUST use trees imported from the `sktree.tree` - API namespace rather than from `sklearn.tree`. + Instantiated tree of type BaseDecisionTree from sktree. + If None, then sklearn's DecisionTreeClassifier with default parameters will + be used. Note that none of the parameters in ``tree_estimator`` need + to be set. The parameters of the ``tree_estimator`` can be set using + the ``tree_estimator_params`` keyword argument. stratify : bool Whether or not to stratify sample when considering structure and leaf indices. By default False. + **tree_estimator_params : dict + Parameters to pass to the underlying base tree estimators. + These must be parameters for ``tree_estimator``. + Attributes ---------- estimator : sktree.tree.HonestTreeClassifier diff --git a/sktree/tree/_honest_tree.py b/sktree/tree/_honest_tree.py index a0cb34950..5927703f5 100644 --- a/sktree/tree/_honest_tree.py +++ b/sktree/tree/_honest_tree.py @@ -21,7 +21,9 @@ class HonestTreeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseDecisionTree tree_estimator : object, default=None Instantiated tree of type BaseDecisionTree from sktree. If None, then sklearn's DecisionTreeClassifier with default parameters will - be used. + be used. Note that none of the parameters in ``tree_estimator`` need + to be set. The parameters of the ``tree_estimator`` can be set using + the ``tree_estimator_params`` keyword argument. criterion : {"gini", "entropy"}, default="gini" The function to measure the quality of a split. Supported criteria are @@ -171,6 +173,10 @@ class frequency in the voting subsample. Whether or not to stratify sample when considering structure and leaf indices. By default False. + **tree_estimator_params : dict + Parameters to pass to the underlying base tree estimators. + These must be parameters for ``tree_estimator``. + Attributes ---------- estimator_ : object From 7ab70cde5b35e4524c19b35905da8a809687bdf1 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 22 Feb 2024 15:08:49 -0500 Subject: [PATCH 03/14] Fix return Signed-off-by: Adam Li --- sktree/tree/_classes.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sktree/tree/_classes.py b/sktree/tree/_classes.py index 4763bb53d..1ce5adb34 100644 --- a/sktree/tree/_classes.py +++ b/sktree/tree/_classes.py @@ -302,6 +302,7 @@ def _build_tree( ) builder.build(self.tree_, X, sample_weight) + return self def predict(self, X, check_input=True): """Assign labels based on clustering the affinity matrix. @@ -573,6 +574,7 @@ def _build_tree( ) builder.build(self.tree_, X, sample_weight) + return self class ObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier): @@ -986,6 +988,8 @@ def _build_tree( self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] + return self + class ObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor): """An oblique decision tree Regressor. 
@@ -1364,6 +1368,7 @@ def _build_tree( ) builder.build(self.tree_, X, y, sample_weight, None) + return self class PatchObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier): @@ -1839,6 +1844,8 @@ def _build_tree( self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] + return self + def _more_tags(self): # XXX: nans should be supportable in SPORF by just using RF-like splits on missing values # However, for MORF it is not supported @@ -2306,6 +2313,8 @@ def _build_tree( builder.build(self.tree_, X, y, sample_weight, None) + return self + def _more_tags(self): # XXX: nans should be supportable in SPORF by just using RF-like splits on missing values # However, for MORF it is not supported @@ -2736,6 +2745,7 @@ def _build_tree( if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] + return self class ExtraObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor): @@ -3125,3 +3135,5 @@ def _build_tree( ) builder.build(self.tree_, X, y, sample_weight) + + return self From 7549c1b18db03cc16b5f074607ec37d7f27136d8 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 22 Feb 2024 18:11:15 -0500 Subject: [PATCH 04/14] Fix unit-testss Signed-off-by: Adam Li --- sktree/ensemble/_extensions.py | 8 ++++---- sktree/ensemble/_honest_forest.py | 9 +++++++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/sktree/ensemble/_extensions.py b/sktree/ensemble/_extensions.py index 1933d1415..edf301f8a 100644 --- a/sktree/ensemble/_extensions.py +++ b/sktree/ensemble/_extensions.py @@ -47,11 +47,11 @@ def _generate_sample_indices(random_state, n_samples, n_samples_bootstrap, boots XXX: this is copied over from scikit-learn and modified to allow sampling with and without replacement given ``bootstrap``. """ - random_instance = check_random_state(random_state) n_sample_idx = np.arange(0, n_samples, dtype=np.int32) - sample_indices = random_instance.choice(n_sample_idx, n_samples_bootstrap, replace=bootstrap) - + sample_indices = np.atleast_1d( + random_instance.choice(n_sample_idx, n_samples_bootstrap, replace=bootstrap) + ) return sample_indices @@ -65,7 +65,7 @@ def oob_samples_(self): Only utilized if ``bootstrap=True``, otherwise, all samples are "in-bag". """ if self.bootstrap is False and ( - self._n_samples_bootstrap == self._n_samples or self._n_samples_bootstrap is None + self._n_samples_bootstrap is None or (self._n_samples_bootstrap == self._n_samples) ): raise RuntimeError( "Cannot extract out-of-bag samples when bootstrap is False and " diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py index 33833d0a9..768cd31a0 100644 --- a/sktree/ensemble/_honest_forest.py +++ b/sktree/ensemble/_honest_forest.py @@ -663,7 +663,9 @@ def oob_samples_(self): Only utilized if ``bootstrap=True``, otherwise, all samples are "in-bag". 
""" - if self.bootstrap is False and self._n_samples_bootstrap == self._n_samples: + if self.bootstrap is False and ( + self._n_samples_bootstrap is None or self._n_samples_bootstrap == self._n_samples + ): raise RuntimeError( "Cannot extract out-of-bag samples when bootstrap is False and " "n_samples == n_samples_bootstrap" @@ -767,13 +769,16 @@ def get_leaf_node_samples(self, X): def _get_estimators_indices(self): # Get drawn indices along both sample and feature axes for tree in self.estimators_: - if not self.bootstrap and self._n_samples_bootstrap == self._n_samples: + if not self.bootstrap and ( + self._n_samples_bootstrap is None or (self._n_samples_bootstrap == self._n_samples) + ): yield np.arange(self._n_samples, dtype=np.int32) else: # tree.random_state is actually an immutable integer seed rather # than a mutable RandomState instance, so it's safe to use it # repeatedly when calling this property. seed = tree.random_state + # Operations accessing random_state must be performed identically # to those in `_parallel_build_trees()` yield _generate_sample_indices( From d196a6caf284bd491c20740c0620d81a79f2b1e6 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Feb 2024 09:25:24 -0500 Subject: [PATCH 05/14] Change wrt main Signed-off-by: Adam Li --- sktree/ensemble/_extensions.py | 4 +--- sktree/ensemble/_honest_forest.py | 36 ++++++++++++++--------------- sktree/stats/forestht.py | 2 +- sktree/stats/permuteforest.py | 2 ++ sktree/stats/tests/test_forestht.py | 24 ++++++++++++++----- 5 files changed, 39 insertions(+), 29 deletions(-) diff --git a/sktree/ensemble/_extensions.py b/sktree/ensemble/_extensions.py index edf301f8a..6e8219763 100644 --- a/sktree/ensemble/_extensions.py +++ b/sktree/ensemble/_extensions.py @@ -49,9 +49,7 @@ def _generate_sample_indices(random_state, n_samples, n_samples_bootstrap, boots """ random_instance = check_random_state(random_state) n_sample_idx = np.arange(0, n_samples, dtype=np.int32) - sample_indices = np.atleast_1d( - random_instance.choice(n_sample_idx, n_samples_bootstrap, replace=bootstrap) - ) + sample_indices = random_instance.choice(n_sample_idx, n_samples_bootstrap, replace=bootstrap) return sample_indices diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py index 768cd31a0..4ae2691c7 100644 --- a/sktree/ensemble/_honest_forest.py +++ b/sktree/ensemble/_honest_forest.py @@ -420,28 +420,26 @@ def __init__( stratify=False, **tree_estimator_params, ): - estimator_params = ( - "criterion", - "splitter", - "max_depth", - "min_samples_split", - "min_samples_leaf", - "min_weight_fraction_leaf", - "max_features", - "max_leaf_nodes", - "min_impurity_decrease", - "random_state", - "ccp_alpha", - "tree_estimator", - "honest_fraction", - "honest_prior", - "stratify", - ) - super().__init__( estimator=HonestTreeClassifier(), n_estimators=n_estimators, - estimator_params=estimator_params, + estimator_params=( + "criterion", + "splitter", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + "ccp_alpha", + "tree_estimator", + "honest_fraction", + "honest_prior", + "stratify", + ), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index 9b9aef692..eaf616928 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -1338,7 +1338,7 @@ def build_coleman_forest( forest_result = ForestTestResult(observe_test_stat, 
permute_stat, observe_stat, pvalue) if return_posteriors: - return forest_result, orig_forest_proba, perm_forest_proba + return forest_result, orig_forest_proba, perm_forest_proba, est, perm_est else: return forest_result diff --git a/sktree/stats/permuteforest.py b/sktree/stats/permuteforest.py index 8c2dfe7b1..42f318c06 100644 --- a/sktree/stats/permuteforest.py +++ b/sktree/stats/permuteforest.py @@ -297,6 +297,7 @@ def __init__( tree_estimator=None, stratify=False, permute_per_tree=False, + **tree_estimator_params, ): super().__init__( n_estimators, @@ -322,6 +323,7 @@ def __init__( honest_fraction, tree_estimator, stratify, + **tree_estimator_params, ) self.permute_per_tree = permute_per_tree diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index ca9d7a1de..ed9d9e780 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -785,9 +785,9 @@ def test_build_coleman_forest(): Test the function under alternative and null hypothesis for a very simple dataset. """ - n_estimators = 10 + n_estimators = 40 n_samples = 20 - n_features = 3 + n_features = 5 rng = np.random.default_rng(seed) _X = rng.uniform(size=(n_samples, n_features)) @@ -799,19 +799,31 @@ def test_build_coleman_forest(): ) # Binary classification clf = HonestForestClassifier( - n_estimators=n_estimators, random_state=seed, n_jobs=-1, honest_fraction=0.5, bootstrap=True + n_estimators=n_estimators, + random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=True, + max_samples=1.6, ) perm_clf = PermutationHonestForestClassifier( - n_estimators=n_estimators, random_state=seed, n_jobs=-1, honest_fraction=0.5, bootstrap=True + n_estimators=n_estimators, + random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=True, + max_samples=1.6, ) with pytest.raises( RuntimeError, match="Permutation forest must be a PermutationHonestForestClassifier" ): build_coleman_forest(clf, clf, X, y) - forest_result, orig_forest_proba, perm_forest_proba = build_coleman_forest( - clf, perm_clf, X, y, metric="s@98" + forest_result, orig_forest_proba, perm_forest_proba, clf, perm_clf = build_coleman_forest( + clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed ) + assert clf._n_samples_bootstrap == round(20 * 1.6) + assert forest_result.pvalue <= 0.05, f"{forest_result.pvalue}" assert forest_result.observe_stat > 0.1, f"{forest_result.observe_stat}" assert_array_equal(orig_forest_proba.shape, perm_forest_proba.shape) From 97155f80fe16fdc77ef597a0661d5fc9b3550d76 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Feb 2024 10:42:31 -0500 Subject: [PATCH 06/14] Fix unit tests Signed-off-by: Adam Li --- sktree/_lib/sklearn_fork | 2 +- sktree/ensemble/_honest_forest.py | 45 ------------------------------- sktree/stats/permuteforest.py | 2 +- 3 files changed, 2 insertions(+), 47 deletions(-) diff --git a/sktree/_lib/sklearn_fork b/sktree/_lib/sklearn_fork index d48716a6b..94517a38a 160000 --- a/sktree/_lib/sklearn_fork +++ b/sktree/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit d48716a6b2cc6373b9e66bf959f2b43b89f10c5d +Subproject commit 94517a38a6354ee02ef715d1077c8ec6d1713d3b diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py index 4ae2691c7..ec146bda2 100644 --- a/sktree/ensemble/_honest_forest.py +++ b/sktree/ensemble/_honest_forest.py @@ -536,51 +536,6 @@ def _make_estimator(self, append=True, random_state=None): return estimator - def _construct_trees( - self, - X, - y, - sample_weight, - random_state, - 
n_samples_bootstrap, - missing_values_in_feature_mask, - classes, - n_more_estimators, - ): - trees = [ - self._make_estimator(append=False, random_state=random_state) - for i in range(n_more_estimators) - ] - - # Parallel loop: we prefer the threading backend as the Cython code - # for fitting the trees is internally releasing the Python GIL - # making threading more efficient than multiprocessing in - # that case. However, for joblib 0.12+ we respect any - # parallel_backend contexts set at a higher level, - # since correctness does not rely on using threads. - trees = Parallel( - n_jobs=self.n_jobs, - verbose=self.verbose, - prefer="threads", - )( - delayed(_parallel_build_trees)( - t, - self.bootstrap, - X, - y, - sample_weight, - i, - len(trees), - verbose=self.verbose, - class_weight=self.class_weight, - n_samples_bootstrap=n_samples_bootstrap, - missing_values_in_feature_mask=missing_values_in_feature_mask, - classes=classes, - ) - for i, t in enumerate(trees) - ) - return trees - def predict_proba(self, X): """ Predict class probabilities for X. diff --git a/sktree/stats/permuteforest.py b/sktree/stats/permuteforest.py index 42f318c06..fc9aa19a8 100644 --- a/sktree/stats/permuteforest.py +++ b/sktree/stats/permuteforest.py @@ -470,4 +470,4 @@ def _construct_trees( for i, t in enumerate(trees) ) - return trees + self.estimators_.extend(trees) From 7d377c1dfe5313eb66f71dba29d283146948d5d3 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Feb 2024 10:50:38 -0500 Subject: [PATCH 07/14] Get ci pass Signed-off-by: Adam Li --- sktree/stats/permuteforest.py | 4 ++-- sktree/stats/tests/test_forestht.py | 14 +++++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/sktree/stats/permuteforest.py b/sktree/stats/permuteforest.py index fc9aa19a8..460361295 100644 --- a/sktree/stats/permuteforest.py +++ b/sktree/stats/permuteforest.py @@ -442,8 +442,8 @@ def _construct_trees( ) ) else: - perm_idx = random_state.choice( - self._n_samples, size=(self._n_samples, 1), replace=False + perm_idx = np.array( + random_state.choice(self._n_samples, size=(self._n_samples, 1), replace=False) ) X[:, self.covariate_index_] = X[perm_idx, self.covariate_index_] self.permutation_indices_ = perm_idx diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index ed9d9e780..18ea7ec34 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -786,7 +786,7 @@ def test_build_coleman_forest(): Test the function under alternative and null hypothesis for a very simple dataset. 
""" n_estimators = 40 - n_samples = 20 + n_samples = 30 n_features = 5 rng = np.random.default_rng(seed) @@ -819,17 +819,21 @@ def test_build_coleman_forest(): ): build_coleman_forest(clf, clf, X, y) - forest_result, orig_forest_proba, perm_forest_proba, clf, perm_clf = build_coleman_forest( - clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed + forest_result, orig_forest_proba, perm_forest_proba, clf_fitted, perm_clf_fitted = ( + build_coleman_forest(clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed) ) - assert clf._n_samples_bootstrap == round(20 * 1.6) + assert clf_fitted._n_samples_bootstrap == round(n_samples * 1.6) + assert perm_clf_fitted._n_samples_bootstrap == round(n_samples * 1.6) + assert_array_equal(perm_clf_fitted.permutation_indices_.shape, (n_samples, 1)) assert forest_result.pvalue <= 0.05, f"{forest_result.pvalue}" assert forest_result.observe_stat > 0.1, f"{forest_result.observe_stat}" assert_array_equal(orig_forest_proba.shape, perm_forest_proba.shape) X = np.vstack([_X, _X]) - forest_result, _, _ = build_coleman_forest(clf, perm_clf, X, y, metric="s@98") + forest_result, _, _, clf_fitted, perm_clf_fitted = build_coleman_forest( + clf, perm_clf, X, y, metric="s@98" + ) assert forest_result.pvalue > 0.05, f"{forest_result.pvalue}" assert forest_result.observe_stat < 0.05, f"{forest_result.observe_stat}" From 87b2182cfa46e97c6eb0de0b363247f8b445e869 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Feb 2024 10:55:07 -0500 Subject: [PATCH 08/14] Merging Signed-off-by: Adam Li --- sktree/ensemble/_honest_forest.py | 6 +++ sktree/stats/tests/test_forestht.py | 64 +++++++++++++++++++++++++++++ sktree/tree/_honest_tree.py | 9 ++-- 3 files changed, 73 insertions(+), 6 deletions(-) diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py index ec146bda2..eb1dab87a 100644 --- a/sktree/ensemble/_honest_forest.py +++ b/sktree/ensemble/_honest_forest.py @@ -536,6 +536,12 @@ def _make_estimator(self, append=True, random_state=None): return estimator + def _inherit_estimator_attributes(self): + """Initialize necessary attributes from the provided tree estimator""" + for attr_name in dir(self.estimator_[0]): + if not attr_name.startswith("_") and attr_name.endswith("_"): + setattr(self, attr_name, getattr(self.estimator_, attr_name)) + def predict_proba(self, X): """ Predict class probabilities for X. diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 18ea7ec34..5a0cfff42 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -838,6 +838,70 @@ def test_build_coleman_forest(): assert forest_result.observe_stat < 0.05, f"{forest_result.observe_stat}" +def test_build_coleman_forest_multiview(): + """Simple test for building a Coleman forest. + + Test the function under alternative and null hypothesis for a very simple dataset. 
+ """ + n_estimators = 40 + n_samples = 30 + n_features = 5 + rng = np.random.default_rng(seed) + + _X = rng.uniform(size=(n_samples, n_features)) + _X = rng.uniform(size=(n_samples // 2, n_features)) + X2 = _X + 3 + X = np.vstack([_X, X2]) + y = np.vstack( + [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))] + ) # Binary classification + + clf = HonestForestClassifier( + n_estimators=n_estimators, + random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=True, + max_samples=1.6, + max_features=[1, 2], + tree_estimator=MultiViewDecisionTreeClassifier(), + feature_set_ends=[2, 5], + ) + perm_clf = PermutationHonestForestClassifier( + n_estimators=n_estimators, + random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=True, + max_samples=1.6, + max_features=[1, 2], + tree_estimator=MultiViewDecisionTreeClassifier(), + feature_set_ends=[2, 5], + ) + with pytest.raises( + RuntimeError, match="Permutation forest must be a PermutationHonestForestClassifier" + ): + build_coleman_forest(clf, clf, X, y) + + forest_result, orig_forest_proba, perm_forest_proba, clf_fitted, perm_clf_fitted = ( + build_coleman_forest(clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed) + ) + assert clf_fitted._n_samples_bootstrap == round(n_samples * 1.6) + assert perm_clf_fitted._n_samples_bootstrap == round(n_samples * 1.6) + assert_array_equal(perm_clf_fitted.permutation_indices_.shape, (n_samples, 1)) + + assert forest_result.pvalue <= 0.05, f"{forest_result.pvalue}" + assert forest_result.observe_stat > 0.1, f"{forest_result.observe_stat}" + assert_array_equal(orig_forest_proba.shape, perm_forest_proba.shape) + + X = np.vstack([_X, _X]) + forest_result, _, _, clf_fitted, perm_clf_fitted = build_coleman_forest( + clf, perm_clf, X, y, metric="s@98" + ) + assert forest_result.pvalue > 0.05, f"{forest_result.pvalue}" + assert forest_result.observe_stat < 0.05, f"{forest_result.observe_stat}" + + def test_build_permutation_forest(): """Simple test for building a permutation forest.""" n_estimators = 30 diff --git a/sktree/tree/_honest_tree.py b/sktree/tree/_honest_tree.py index 5927703f5..7ccaf1f28 100644 --- a/sktree/tree/_honest_tree.py +++ b/sktree/tree/_honest_tree.py @@ -730,12 +730,9 @@ def _set_leaf_nodes(self, leaf_ids, y): def _inherit_estimator_attributes(self): """Initialize necessary attributes from the provided tree estimator""" - self.classes_ = self.estimator_.classes_ - self.max_features_ = self.estimator_.max_features_ - self.n_classes_ = self.estimator_.n_classes_ - self.n_features_in_ = self.estimator_.n_features_in_ - self.n_outputs_ = self.estimator_.n_outputs_ - self.tree_ = self.estimator_.tree_ + for attr_name in dir(self.estimator_): + if not attr_name.startswith("_") and attr_name.endswith("_"): + setattr(self, attr_name, getattr(self.estimator_, attr_name)) def _empty_leaf_correction(self, proba, pos=0): """Leaves with empty posteriors are assigned values. 
From 9530594eb720e9dd290ef9d3f020f8a09fcff820 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Feb 2024 14:30:28 -0500 Subject: [PATCH 09/14] Merge Signed-off-by: Adam Li --- pyproject.toml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8424d8a42..536364240 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,9 +15,6 @@ requires = [ "numpy>=1.25; python_version>='3.9'" ] -[lint.per-file-ignores] -'__init__.py' = ['F401'] - [project] name = "scikit-tree" version = "0.7.0dev0" @@ -266,7 +263,12 @@ extend-exclude = [ 'validation' ] line-length = 88 -lint.ignore = ['E731'] + +[tool.ruff.lint] +ignore = ['E731'] + +[tool.ruff.lint.per-file-ignores] +'__init__.py' = ['F401'] [tool.spin] package = 'sktree' From 8d68c2a6164392f24dc3e2c10cff8f556fce66ac Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Feb 2024 14:53:34 -0500 Subject: [PATCH 10/14] Try again Signed-off-by: Adam Li --- sktree/ensemble/_honest_forest.py | 37 +++++++++----------------- sktree/tests/test_honest_forest.py | 38 ++++++++++++++++++++------- sktree/tree/_classes.py | 22 ++++++++++++++++ sktree/tree/_honest_tree.py | 20 +++++++------- sktree/tree/_multiview.py | 6 ++++- sktree/tree/tests/test_honest_tree.py | 34 +++++++++++++++--------- 6 files changed, 99 insertions(+), 58 deletions(-) diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py index 525e875b3..f08189a62 100644 --- a/sktree/ensemble/_honest_forest.py +++ b/sktree/ensemble/_honest_forest.py @@ -277,9 +277,6 @@ class labels (multi-output problem). The number of classes (single output problem), or a list containing the number of classes for each output (multi-output problem). - n_features_ : int - The number of features when ``fit`` is performed. - n_features_in_ : int Number of features seen during :term:`fit`. @@ -508,6 +505,9 @@ def fit(self, X, y, sample_weight=None, classes=None, **fit_params): super().fit(X, y, sample_weight=sample_weight, classes=classes, **fit_params) + # Inherit attributes from the tree estimator + self._inherit_estimator_attributes() + # Compute honest decision function self.honest_decision_function_ = self._predict_proba( X, indices=self.honest_indices_, impute_missing=np.nan @@ -538,9 +538,15 @@ def _make_estimator(self, append=True, random_state=None): def _inherit_estimator_attributes(self): """Initialize necessary attributes from the provided tree estimator""" - for attr_name in dir(self.estimator_[0]): - if not attr_name.startswith("_") and attr_name.endswith("_"): - setattr(self, attr_name, getattr(self.estimator_, attr_name)) + if hasattr(self.estimators_[0], "_inheritable_fitted_attribute"): + for attr in self.estimators_[0]._inheritable_fitted_attribute: + setattr(self, attr, getattr(self.estimators_[0], attr)) + + self.classes_ = self.estimators_[0].classes_ + self.max_features_ = self.estimators_[0].max_features_ + self.n_classes_ = self.estimators_[0].n_classes_ + self.n_features_in_ = self.estimators_[0].n_features_in_ + self.n_outputs_ = self.estimators_[0].n_outputs_ def predict_proba(self, X): """ @@ -644,25 +650,6 @@ def oob_samples_(self): def _more_tags(self): return {"multioutput": False} - def apply(self, X): - """ - Apply trees in the forest to X, return leaf indices. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The input samples. Internally, its dtype will be converted to - ``dtype=np.float32``. 
If a sparse matrix is provided, it will be - converted into a sparse ``csr_matrix``. - - Returns - ------- - X_leaves : ndarray of shape (n_samples, n_estimators) - For each datapoint x in X and for each tree in the forest, - return the index of the leaf x ends up in. - """ - return self.estimator_.apply(X) - def decision_path(self, X): """ Return the decision path in the forest. diff --git a/sktree/tests/test_honest_forest.py b/sktree/tests/test_honest_forest.py index c06fb716c..8aa694838 100644 --- a/sktree/tests/test_honest_forest.py +++ b/sktree/tests/test_honest_forest.py @@ -437,23 +437,30 @@ def test_honest_forest_with_sklearn_trees_with_mi(): assert_allclose(np.mean(sk_scores), np.mean(scores), atol=0.005) -def test_honest_forest_with_tree_estimator_params(): +@pytest.mark.parametrize( + "tree, tree_kwargs", + [ + (MultiViewDecisionTreeClassifier(), {"feature_set_ends": [10, 20]}), + (ObliqueDecisionTreeClassifier(), {"feature_combinations": 2}), + (PatchObliqueDecisionTreeClassifier(), {"max_patch_dims": 5}), + ], +) +def test_honest_forest_with_tree_estimator_params(tree, tree_kwargs): + """Test that honest forest inherits all the fitted parameters of the tree estimator.""" X = np.ones((20, 4)) X[10:] *= -1 y = [0] * 10 + [1] * 10 # test with a parameter that is a repeat of an init parameter clf = HonestForestClassifier( - tree_estimator=DecisionTreeClassifier(), - random_state=0, - feature_set_ends=[10, 20], + tree_estimator=DecisionTreeClassifier(), random_state=0, **tree_kwargs ) with pytest.raises(ValueError, match=r"Invalid parameter\(s\)"): clf.fit(X, y) # test with a parameter that is not in any init signature clf = HonestForestClassifier( - tree_estimator=MultiViewDecisionTreeClassifier(), + tree_estimator=tree, random_state=0, blah=0, ) @@ -461,9 +468,20 @@ def test_honest_forest_with_tree_estimator_params(): clf.fit(X, y) # passing in a valid argument to the tree_estimator should work - clf = HonestForestClassifier( - tree_estimator=MultiViewDecisionTreeClassifier(), - random_state=0, - feature_set_ends=[10, 20], - ) + clf = HonestForestClassifier(tree_estimator=tree, random_state=0, **tree_kwargs) clf.fit(X, y) + checked_attrs = [ + "classes_", + "n_classes_", + "n_features_in_", + "n_outputs_", + ] + checked_attrs + getattr(clf.estimator_, "_inheritable_fitted_attribute", []) + for attr_name in checked_attrs: + if not attr_name.startswith("_") and attr_name.endswith("_"): + if isinstance(getattr(clf, attr_name), np.ndarray): + np.testing.assert_array_equal( + getattr(clf, attr_name), getattr(clf.estimators_[0], attr_name) + ) + else: + assert getattr(clf, attr_name) == getattr(clf.estimators_[0], attr_name) diff --git a/sktree/tree/_classes.py b/sktree/tree/_classes.py index 1ce5adb34..67b534a9d 100644 --- a/sktree/tree/_classes.py +++ b/sktree/tree/_classes.py @@ -990,6 +990,12 @@ def _build_tree( return self + @property + def _inheritable_fitted_attribute(self): + return { + "feature_combinations_", + } + class ObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor): """An oblique decision tree Regressor. @@ -1852,6 +1858,16 @@ def _more_tags(self): allow_nan = False return {"multilabel": True, "allow_nan": allow_nan} + @property + def _inheritable_fitted_attribute(self): + return { + "feature_combinations_", + "min_patch_dims_", + "max_patch_dims_", + "dim_contiguous_", + "data_dims_", + } + class PatchObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor): """A oblique decision tree regressor that operates over patches of data. 
@@ -2747,6 +2763,12 @@ def _build_tree( self.classes_ = self.classes_[0] return self + @property + def _inheritable_fitted_attribute(self): + return { + "feature_combinations_", + } + class ExtraObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor): """An oblique decision tree Regressor. diff --git a/sktree/tree/_honest_tree.py b/sktree/tree/_honest_tree.py index 5f181dfd7..86d0a46d4 100644 --- a/sktree/tree/_honest_tree.py +++ b/sktree/tree/_honest_tree.py @@ -456,7 +456,7 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): check_input=check_input, classes=classes, ) - # self._inherit_estimator_attributes() + self._inherit_estimator_attributes() # update the number of classes, unsplit if y.ndim == 1: @@ -652,7 +652,7 @@ def _fit( check_input=check_input, missing_values_in_feature_mask=missing_values_in_feature_mask, ) - # self._inherit_estimator_attributes() + self._inherit_estimator_attributes() # fit the leaves on the non-structure indices not_honest_mask = np.ones(len(y), dtype=bool) @@ -730,17 +730,19 @@ def _set_leaf_nodes(self, leaf_ids, y): def _inherit_estimator_attributes(self): """Initialize necessary attributes from the provided tree estimator""" - # basic_inherited_attrs = [ - # "tree_", - # "n_classes_", - # "classes_", - # "n_outputs_", - # ] - if hasattr(self.estimator_, "_inheritable_fitted_attribute"): for attr in self.estimator_._inheritable_fitted_attribute: setattr(self, attr, getattr(self.estimator_, attr)) + self.classes_ = self.estimator_.classes_ + self.max_features_ = self.estimator_.max_features_ + self.n_classes_ = self.estimator_.n_classes_ + self.n_features_in_ = self.estimator_.n_features_in_ + self.n_outputs_ = self.estimator_.n_outputs_ + self.tree_ = self.estimator_.tree_ + self.builder_ = self.estimator_.builder_ + self.min_samples_split_ = self.estimator_.min_samples_split_ + def _empty_leaf_correction(self, proba, pos=0): """Leaves with empty posteriors are assigned values. diff --git a/sktree/tree/_multiview.py b/sktree/tree/_multiview.py index 0d50a96a6..f408f1e72 100644 --- a/sktree/tree/_multiview.py +++ b/sktree/tree/_multiview.py @@ -476,7 +476,6 @@ def _build_tree( max_features = 0 self.max_features_ = max_features - print(self.max_features_, self.max_features_per_set_) if not isinstance(self.splitter, ObliqueSplitter): splitter = SPLITTERS[self.splitter]( @@ -579,10 +578,15 @@ def _fit( @property def _inheritable_fitted_attribute(self): + """Define additional attributes to pass onto a parent meta tree-estimator. + + Used for passing parameters to HonestTreeClassifier. 
+ """ return { "max_features_", "feature_combinations_", "feature_set_ends_", "n_feature_sets_", + "n_features_in_set_", "max_features_per_set_", } diff --git a/sktree/tree/tests/test_honest_tree.py b/sktree/tree/tests/test_honest_tree.py index e84172a45..907c386f1 100644 --- a/sktree/tree/tests/test_honest_tree.py +++ b/sktree/tree/tests/test_honest_tree.py @@ -63,23 +63,30 @@ def test_toy_accuracy(): np.testing.assert_array_equal(clf.predict(X), y) -def test_honest_tree_with_tree_estimator_params(): +@pytest.mark.parametrize( + "tree, tree_kwargs", + [ + (MultiViewDecisionTreeClassifier(), {"feature_set_ends": [10, 20]}), + (ObliqueDecisionTreeClassifier(), {"feature_combinations": 2}), + (PatchObliqueDecisionTreeClassifier(), {"max_patch_dims": 5}), + ], +) +def test_honest_tree_with_tree_estimator_params(tree, tree_kwargs): + """Test that honest tree inherits all the fitted parameters of the tree estimator.""" X = np.ones((20, 4)) X[10:] *= -1 y = [0] * 10 + [1] * 10 # test with a parameter that is a repeat of an init parameter clf = HonestTreeClassifier( - tree_estimator=DecisionTreeClassifier(), - random_state=0, - feature_set_ends=[10, 20], + tree_estimator=DecisionTreeClassifier(), random_state=0, **tree_kwargs ) with pytest.raises(ValueError, match=r"Invalid parameter\(s\)"): clf.fit(X, y) # test with a parameter that is not in any init signature clf = HonestTreeClassifier( - tree_estimator=MultiViewDecisionTreeClassifier(), + tree_estimator=tree, random_state=0, blah=0, ) @@ -87,15 +94,16 @@ def test_honest_tree_with_tree_estimator_params(): clf.fit(X, y) # passing in a valid argument to the tree_estimator should work - clf = HonestTreeClassifier( - tree_estimator=MultiViewDecisionTreeClassifier(), - random_state=0, - feature_set_ends=[10, 20], - ) + clf = HonestTreeClassifier(tree_estimator=tree, random_state=0, **tree_kwargs) clf.fit(X, y) - # for attr_name in dir(clf.estimators_[0]): - # if not attr_name.startswith("_") and attr_name.endswith("_"): - # assert hasattr(clf, attr_name, getattr(clf.estimators_[0], attr_name)) + for attr_name in dir(clf.estimator_): + if not attr_name.startswith("_") and attr_name.endswith("_"): + if isinstance(getattr(clf, attr_name), np.ndarray): + np.testing.assert_array_equal( + getattr(clf, attr_name), getattr(clf.estimator_, attr_name) + ) + else: + assert getattr(clf, attr_name) == getattr(clf.estimator_, attr_name) @pytest.mark.parametrize( From d3c1d556474f07f5707aefd5cccca3efd51e6846 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Feb 2024 16:04:47 -0500 Subject: [PATCH 11/14] Fix Signed-off-by: Adam Li --- sktree/tree/_honest_tree.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sktree/tree/_honest_tree.py b/sktree/tree/_honest_tree.py index 86d0a46d4..a35e84f99 100644 --- a/sktree/tree/_honest_tree.py +++ b/sktree/tree/_honest_tree.py @@ -740,8 +740,10 @@ def _inherit_estimator_attributes(self): self.n_features_in_ = self.estimator_.n_features_in_ self.n_outputs_ = self.estimator_.n_outputs_ self.tree_ = self.estimator_.tree_ - self.builder_ = self.estimator_.builder_ - self.min_samples_split_ = self.estimator_.min_samples_split_ + + # XXX: scikit-learn trees do not store their builder, or min_samples_split_ + self.builder_ = getattr(self.estimator_, "builder_", None) + self.min_samples_split_ = getattr(self.estimator_, "min_samples_split_", None) def _empty_leaf_correction(self, proba, pos=0): """Leaves with empty posteriors are assigned values. 
From 7482c28d8946d4e6284a87d4197ed53b3d9652b8 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Feb 2024 16:30:10 -0500 Subject: [PATCH 12/14] Fix Signed-off-by: Adam Li --- sktree/ensemble/_honest_forest.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py index f08189a62..24d9cecae 100644 --- a/sktree/ensemble/_honest_forest.py +++ b/sktree/ensemble/_honest_forest.py @@ -542,12 +542,6 @@ def _inherit_estimator_attributes(self): for attr in self.estimators_[0]._inheritable_fitted_attribute: setattr(self, attr, getattr(self.estimators_[0], attr)) - self.classes_ = self.estimators_[0].classes_ - self.max_features_ = self.estimators_[0].max_features_ - self.n_classes_ = self.estimators_[0].n_classes_ - self.n_features_in_ = self.estimators_[0].n_features_in_ - self.n_outputs_ = self.estimators_[0].n_outputs_ - def predict_proba(self, X): """ Predict class probabilities for X. From e177338c52c2bc2e8f0d720dcd97c478c7f81e62 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Feb 2024 17:05:09 -0500 Subject: [PATCH 13/14] Try new test Signed-off-by: Adam Li --- sktree/stats/tests/test_forestht.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 5a0cfff42..091eff99a 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -863,7 +863,7 @@ def test_build_coleman_forest_multiview(): honest_fraction=0.5, bootstrap=True, max_samples=1.6, - max_features=[1, 2], + max_features=[1, 1], tree_estimator=MultiViewDecisionTreeClassifier(), feature_set_ends=[2, 5], ) @@ -874,7 +874,7 @@ def test_build_coleman_forest_multiview(): honest_fraction=0.5, bootstrap=True, max_samples=1.6, - max_features=[1, 2], + max_features=[1, 1], tree_estimator=MultiViewDecisionTreeClassifier(), feature_set_ends=[2, 5], ) @@ -899,7 +899,6 @@ def test_build_coleman_forest_multiview(): clf, perm_clf, X, y, metric="s@98" ) assert forest_result.pvalue > 0.05, f"{forest_result.pvalue}" - assert forest_result.observe_stat < 0.05, f"{forest_result.observe_stat}" def test_build_permutation_forest(): From 97dcee4c8bc743c8642da1f70b8bba982f0c80a8 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Feb 2024 22:46:35 -0500 Subject: [PATCH 14/14] Ensure traits are passed thru Signed-off-by: Adam Li --- sktree/ensemble/_honest_forest.py | 4 ++-- sktree/tests/test_honest_forest.py | 2 +- sktree/tree/_classes.py | 12 ++++++------ sktree/tree/_multiview.py | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py index 24d9cecae..b1647b9ca 100644 --- a/sktree/ensemble/_honest_forest.py +++ b/sktree/ensemble/_honest_forest.py @@ -538,8 +538,8 @@ def _make_estimator(self, append=True, random_state=None): def _inherit_estimator_attributes(self): """Initialize necessary attributes from the provided tree estimator""" - if hasattr(self.estimators_[0], "_inheritable_fitted_attribute"): - for attr in self.estimators_[0]._inheritable_fitted_attribute: + if hasattr(self.tree_estimator, "_inheritable_fitted_attribute"): + for attr in self.tree_estimator._inheritable_fitted_attribute: setattr(self, attr, getattr(self.estimators_[0], attr)) def predict_proba(self, X): diff --git a/sktree/tests/test_honest_forest.py b/sktree/tests/test_honest_forest.py index 8aa694838..3ad9c065a 100644 --- a/sktree/tests/test_honest_forest.py +++ 
b/sktree/tests/test_honest_forest.py
@@ -476,7 +476,7 @@ def test_honest_forest_with_tree_estimator_params(tree, tree_kwargs):
         "n_features_in_",
         "n_outputs_",
     ]
-    checked_attrs += getattr(clf.estimator_, "_inheritable_fitted_attribute", [])
+    checked_attrs += getattr(tree, "_inheritable_fitted_attribute", [])
     for attr_name in checked_attrs:
         if not attr_name.startswith("_") and attr_name.endswith("_"):
             if isinstance(getattr(clf, attr_name), np.ndarray):
diff --git a/sktree/tree/_classes.py b/sktree/tree/_classes.py
index 67b534a9d..1b0e95cfa 100644
--- a/sktree/tree/_classes.py
+++ b/sktree/tree/_classes.py
@@ -992,9 +992,9 @@ def _build_tree(
 
     @property
     def _inheritable_fitted_attribute(self):
-        return {
+        return [
             "feature_combinations_",
-        }
+        ]
 
 
 class ObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
@@ -1860,13 +1860,13 @@ def _more_tags(self):
 
     @property
     def _inheritable_fitted_attribute(self):
-        return {
+        return [
             "feature_combinations_",
             "min_patch_dims_",
             "max_patch_dims_",
             "dim_contiguous_",
             "data_dims_",
-        }
+        ]
 
 
 class PatchObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
@@ -2765,9 +2765,9 @@ def _build_tree(
 
     @property
     def _inheritable_fitted_attribute(self):
-        return {
+        return [
             "feature_combinations_",
-        }
+        ]
 
 
 class ExtraObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
diff --git a/sktree/tree/_multiview.py b/sktree/tree/_multiview.py
index f408f1e72..52b70c0df 100644
--- a/sktree/tree/_multiview.py
+++ b/sktree/tree/_multiview.py
@@ -582,11 +582,11 @@ def _inheritable_fitted_attribute(self):
 
         Used for passing parameters to HonestTreeClassifier.
         """
-        return {
+        return [
             "max_features_",
             "feature_combinations_",
             "feature_set_ends_",
             "n_feature_sets_",
             "n_features_in_set_",
             "max_features_per_set_",
-        }
+        ]
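
With the series applied, the attribute inheritance can be exercised end to
end. A minimal sketch, assuming the import paths match the package's public
API as used by the test modules above (the toy two-view data and
n_estimators are illustrative):

    import numpy as np
    from sktree import HonestForestClassifier
    from sktree.tree import MultiViewDecisionTreeClassifier

    # Two feature sets: columns 0-9 form view one, columns 10-19 view two.
    rng = np.random.default_rng(0)
    X = rng.standard_normal((100, 20))
    y = (X[:, 0] > 0).astype(int)

    clf = HonestForestClassifier(
        n_estimators=10,
        random_state=0,
        tree_estimator=MultiViewDecisionTreeClassifier(),
        feature_set_ends=[10, 20],  # forwarded to the trees via **tree_estimator_params
    )
    clf.fit(X, y)

    # The forest now mirrors the fitted attributes of its trees, including
    # the multiview-specific ones named in _inheritable_fitted_attribute.
    np.testing.assert_array_equal(
        clf.feature_set_ends_, clf.estimators_[0].feature_set_ends_
    )
    assert clf.n_feature_sets_ == clf.estimators_[0].n_feature_sets_
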