From 48ca3835652e7d12ea21c4b6256ced9c7c599f5d Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Feb 2024 23:59:06 -0500 Subject: [PATCH] MAINT honest API with a multi view (#231) * Ensure HonestForest and HonestTree inherit fitted attributes from any decision tree we can use in scikit-tree --------- Signed-off-by: Adam Li --- pyproject.toml | 11 +++-- sktree/ensemble/_honest_forest.py | 31 ++++--------- sktree/stats/tests/test_forestht.py | 63 +++++++++++++++++++++++++++ sktree/tests/test_honest_forest.py | 38 +++++++++++----- sktree/tree/_classes.py | 22 ++++++++++ sktree/tree/_honest_tree.py | 8 ++++ sktree/tree/_multiview.py | 16 ++++++- sktree/tree/tests/test_honest_tree.py | 31 ++++++++----- 8 files changed, 171 insertions(+), 49 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5e613f7c1..536364240 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,9 +15,6 @@ requires = [ "numpy>=1.25; python_version>='3.9'" ] -[lint.per-file-ignores] -'__init__.py' = ['F401'] - [project] name = "scikit-tree" version = "0.7.0dev0" @@ -266,10 +263,12 @@ extend-exclude = [ 'validation' ] line-length = 88 -lint.ignore = ['E731'] -[tool.ruff.per-file-ignores] -"__init__.py" = ["F401"] +[tool.ruff.lint] +ignore = ['E731'] + +[tool.ruff.lint.per-file-ignores] +'__init__.py' = ['F401'] [tool.spin] package = 'sktree' diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py index f263bcd90..b1647b9ca 100644 --- a/sktree/ensemble/_honest_forest.py +++ b/sktree/ensemble/_honest_forest.py @@ -277,9 +277,6 @@ class labels (multi-output problem). The number of classes (single output problem), or a list containing the number of classes for each output (multi-output problem). - n_features_ : int - The number of features when ``fit`` is performed. - n_features_in_ : int Number of features seen during :term:`fit`. 
@@ -508,6 +505,9 @@ def fit(self, X, y, sample_weight=None, classes=None, **fit_params): super().fit(X, y, sample_weight=sample_weight, classes=classes, **fit_params) + # Inherit attributes from the tree estimator + self._inherit_estimator_attributes() + # Compute honest decision function self.honest_decision_function_ = self._predict_proba( X, indices=self.honest_indices_, impute_missing=np.nan @@ -536,6 +536,12 @@ def _make_estimator(self, append=True, random_state=None): return estimator + def _inherit_estimator_attributes(self): + """Initialize necessary attributes from the provided tree estimator""" + if hasattr(self.tree_estimator, "_inheritable_fitted_attribute"): + for attr in self.tree_estimator._inheritable_fitted_attribute: + setattr(self, attr, getattr(self.estimators_[0], attr)) + def predict_proba(self, X): """ Predict class probabilities for X. @@ -638,25 +644,6 @@ def oob_samples_(self): def _more_tags(self): return {"multioutput": False} - def apply(self, X): - """ - Apply trees in the forest to X, return leaf indices. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The input samples. Internally, its dtype will be converted to - ``dtype=np.float32``. If a sparse matrix is provided, it will be - converted into a sparse ``csr_matrix``. - - Returns - ------- - X_leaves : ndarray of shape (n_samples, n_estimators) - For each datapoint x in X and for each tree in the forest, - return the index of the leaf x ends up in. - """ - return self.estimator_.apply(X) - def decision_path(self, X): """ Return the decision path in the forest. 
diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 18ea7ec34..091eff99a 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -838,6 +838,69 @@ def test_build_coleman_forest(): assert forest_result.observe_stat < 0.05, f"{forest_result.observe_stat}" +def test_build_coleman_forest_multiview(): + """Simple test for building a Coleman forest. + + Test the function under alternative and null hypothesis for a very simple dataset. + """ + n_estimators = 40 + n_samples = 30 + n_features = 5 + rng = np.random.default_rng(seed) + + _X = rng.uniform(size=(n_samples, n_features)) + _X = rng.uniform(size=(n_samples // 2, n_features)) + X2 = _X + 3 + X = np.vstack([_X, X2]) + y = np.vstack( + [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))] + ) # Binary classification + + clf = HonestForestClassifier( + n_estimators=n_estimators, + random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=True, + max_samples=1.6, + max_features=[1, 1], + tree_estimator=MultiViewDecisionTreeClassifier(), + feature_set_ends=[2, 5], + ) + perm_clf = PermutationHonestForestClassifier( + n_estimators=n_estimators, + random_state=seed, + n_jobs=-1, + honest_fraction=0.5, + bootstrap=True, + max_samples=1.6, + max_features=[1, 1], + tree_estimator=MultiViewDecisionTreeClassifier(), + feature_set_ends=[2, 5], + ) + with pytest.raises( + RuntimeError, match="Permutation forest must be a PermutationHonestForestClassifier" + ): + build_coleman_forest(clf, clf, X, y) + + forest_result, orig_forest_proba, perm_forest_proba, clf_fitted, perm_clf_fitted = ( + build_coleman_forest(clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed) + ) + assert clf_fitted._n_samples_bootstrap == round(n_samples * 1.6) + assert perm_clf_fitted._n_samples_bootstrap == round(n_samples * 1.6) + assert_array_equal(perm_clf_fitted.permutation_indices_.shape, (n_samples, 1)) + + assert forest_result.pvalue <= 0.05, 
f"{forest_result.pvalue}" + assert forest_result.observe_stat > 0.1, f"{forest_result.observe_stat}" + assert_array_equal(orig_forest_proba.shape, perm_forest_proba.shape) + + X = np.vstack([_X, _X]) + forest_result, _, _, clf_fitted, perm_clf_fitted = build_coleman_forest( + clf, perm_clf, X, y, metric="s@98" + ) + assert forest_result.pvalue > 0.05, f"{forest_result.pvalue}" + + def test_build_permutation_forest(): """Simple test for building a permutation forest.""" n_estimators = 30 diff --git a/sktree/tests/test_honest_forest.py b/sktree/tests/test_honest_forest.py index c06fb716c..3ad9c065a 100644 --- a/sktree/tests/test_honest_forest.py +++ b/sktree/tests/test_honest_forest.py @@ -437,23 +437,30 @@ def test_honest_forest_with_sklearn_trees_with_mi(): assert_allclose(np.mean(sk_scores), np.mean(scores), atol=0.005) -def test_honest_forest_with_tree_estimator_params(): +@pytest.mark.parametrize( + "tree, tree_kwargs", + [ + (MultiViewDecisionTreeClassifier(), {"feature_set_ends": [10, 20]}), + (ObliqueDecisionTreeClassifier(), {"feature_combinations": 2}), + (PatchObliqueDecisionTreeClassifier(), {"max_patch_dims": 5}), + ], +) +def test_honest_forest_with_tree_estimator_params(tree, tree_kwargs): + """Test that honest forest inherits all the fitted parameters of the tree estimator.""" X = np.ones((20, 4)) X[10:] *= -1 y = [0] * 10 + [1] * 10 # test with a parameter that is a repeat of an init parameter clf = HonestForestClassifier( - tree_estimator=DecisionTreeClassifier(), - random_state=0, - feature_set_ends=[10, 20], + tree_estimator=DecisionTreeClassifier(), random_state=0, **tree_kwargs ) with pytest.raises(ValueError, match=r"Invalid parameter\(s\)"): clf.fit(X, y) # test with a parameter that is not in any init signature clf = HonestForestClassifier( - tree_estimator=MultiViewDecisionTreeClassifier(), + tree_estimator=tree, random_state=0, blah=0, ) @@ -461,9 +468,20 @@ def test_honest_forest_with_tree_estimator_params(): clf.fit(X, y) # passing in a 
valid argument to the tree_estimator should work - clf = HonestForestClassifier( - tree_estimator=MultiViewDecisionTreeClassifier(), - random_state=0, - feature_set_ends=[10, 20], - ) + clf = HonestForestClassifier(tree_estimator=tree, random_state=0, **tree_kwargs) clf.fit(X, y) + checked_attrs = [ + "classes_", + "n_classes_", + "n_features_in_", + "n_outputs_", + ] + checked_attrs += getattr(tree, "_inheritable_fitted_attribute", []) + for attr_name in checked_attrs: + if not attr_name.startswith("_") and attr_name.endswith("_"): + if isinstance(getattr(clf, attr_name), np.ndarray): + np.testing.assert_array_equal( + getattr(clf, attr_name), getattr(clf.estimators_[0], attr_name) + ) + else: + assert getattr(clf, attr_name) == getattr(clf.estimators_[0], attr_name) diff --git a/sktree/tree/_classes.py b/sktree/tree/_classes.py index 1ce5adb34..1b0e95cfa 100644 --- a/sktree/tree/_classes.py +++ b/sktree/tree/_classes.py @@ -990,6 +990,12 @@ def _build_tree( return self + @property + def _inheritable_fitted_attribute(self): + return [ + "feature_combinations_", + ] + class ObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor): """An oblique decision tree Regressor. @@ -1852,6 +1858,16 @@ def _more_tags(self): allow_nan = False return {"multilabel": True, "allow_nan": allow_nan} + @property + def _inheritable_fitted_attribute(self): + return [ + "feature_combinations_", + "min_patch_dims_", + "max_patch_dims_", + "dim_contiguous_", + "data_dims_", + ] + class PatchObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor): """A oblique decision tree regressor that operates over patches of data. @@ -2747,6 +2763,12 @@ def _build_tree( self.classes_ = self.classes_[0] return self + @property + def _inheritable_fitted_attribute(self): + return [ + "feature_combinations_", + ] + class ExtraObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor): """An oblique decision tree Regressor. 
diff --git a/sktree/tree/_honest_tree.py b/sktree/tree/_honest_tree.py index 5927703f5..a35e84f99 100644 --- a/sktree/tree/_honest_tree.py +++ b/sktree/tree/_honest_tree.py @@ -730,6 +730,10 @@ def _set_leaf_nodes(self, leaf_ids, y): def _inherit_estimator_attributes(self): """Initialize necessary attributes from the provided tree estimator""" + if hasattr(self.estimator_, "_inheritable_fitted_attribute"): + for attr in self.estimator_._inheritable_fitted_attribute: + setattr(self, attr, getattr(self.estimator_, attr)) + self.classes_ = self.estimator_.classes_ self.max_features_ = self.estimator_.max_features_ self.n_classes_ = self.estimator_.n_classes_ @@ -737,6 +741,10 @@ def _inherit_estimator_attributes(self): self.n_outputs_ = self.estimator_.n_outputs_ self.tree_ = self.estimator_.tree_ + # XXX: scikit-learn trees do not store their builder, or min_samples_split_ + self.builder_ = getattr(self.estimator_, "builder_", None) + self.min_samples_split_ = getattr(self.estimator_, "min_samples_split_", None) + def _empty_leaf_correction(self, proba, pos=0): """Leaves with empty posteriors are assigned values. diff --git a/sktree/tree/_multiview.py b/sktree/tree/_multiview.py index ff1fa3a9f..52b70c0df 100644 --- a/sktree/tree/_multiview.py +++ b/sktree/tree/_multiview.py @@ -476,7 +476,6 @@ def _build_tree( max_features = 0 self.max_features_ = max_features - print(self.max_features_, self.max_features_per_set_) if not isinstance(self.splitter, ObliqueSplitter): splitter = SPLITTERS[self.splitter]( @@ -576,3 +575,18 @@ def _fit( super()._fit(X, y, sample_weight, check_input, missing_values_in_feature_mask, classes) self.max_features = self._max_features_arr return self + + @property + def _inheritable_fitted_attribute(self): + """Define additional attributes to pass onto a parent meta tree-estimator. + + Used for passing parameters to HonestTreeClassifier. 
+ """ + return [ + "max_features_", + "feature_combinations_", + "feature_set_ends_", + "n_feature_sets_", + "n_features_in_set_", + "max_features_per_set_", + ] diff --git a/sktree/tree/tests/test_honest_tree.py b/sktree/tree/tests/test_honest_tree.py index d0b673fea..907c386f1 100644 --- a/sktree/tree/tests/test_honest_tree.py +++ b/sktree/tree/tests/test_honest_tree.py @@ -63,23 +63,30 @@ def test_toy_accuracy(): np.testing.assert_array_equal(clf.predict(X), y) -def test_honest_tree_with_tree_estimator_params(): +@pytest.mark.parametrize( + "tree, tree_kwargs", + [ + (MultiViewDecisionTreeClassifier(), {"feature_set_ends": [10, 20]}), + (ObliqueDecisionTreeClassifier(), {"feature_combinations": 2}), + (PatchObliqueDecisionTreeClassifier(), {"max_patch_dims": 5}), + ], +) +def test_honest_tree_with_tree_estimator_params(tree, tree_kwargs): + """Test that honest tree inherits all the fitted parameters of the tree estimator.""" X = np.ones((20, 4)) X[10:] *= -1 y = [0] * 10 + [1] * 10 # test with a parameter that is a repeat of an init parameter clf = HonestTreeClassifier( - tree_estimator=DecisionTreeClassifier(), - random_state=0, - feature_set_ends=[10, 20], + tree_estimator=DecisionTreeClassifier(), random_state=0, **tree_kwargs ) with pytest.raises(ValueError, match=r"Invalid parameter\(s\)"): clf.fit(X, y) # test with a parameter that is not in any init signature clf = HonestTreeClassifier( - tree_estimator=MultiViewDecisionTreeClassifier(), + tree_estimator=tree, random_state=0, blah=0, ) @@ -87,12 +94,16 @@ def test_honest_tree_with_tree_estimator_params(): clf.fit(X, y) # passing in a valid argument to the tree_estimator should work - clf = HonestTreeClassifier( - tree_estimator=MultiViewDecisionTreeClassifier(), - random_state=0, - feature_set_ends=[10, 20], - ) + clf = HonestTreeClassifier(tree_estimator=tree, random_state=0, **tree_kwargs) clf.fit(X, y) + for attr_name in dir(clf.estimator_): + if not attr_name.startswith("_") and 
attr_name.endswith("_"): + if isinstance(getattr(clf, attr_name), np.ndarray): + np.testing.assert_array_equal( + getattr(clf, attr_name), getattr(clf.estimator_, attr_name) + ) + else: + assert getattr(clf, attr_name) == getattr(clf.estimator_, attr_name) @pytest.mark.parametrize(