neurodata · adam2392 · Nov 14, 2023 · Nov 2, 2023 · Nov 2, 2023 · Nov 2, 2023
diff --git a/.spin/cmds.py b/.spin/cmds.py
@@ -41,7 +41,7 @@ def coverage(ctx, slowtest):
 def setup_submodule(forcesubmodule=False):
     """Build scikit-tree using submodules.
 
-    git submodule set-branch -b submodulev2 sktree/_lib/sklearn
+    git submodule set-branch -b submodulev3 sktree/_lib/sklearn
 
     git submodule update --recursive --remote
 
@@ -137,7 +137,7 @@ def setup_submodule(forcesubmodule=False):
 def build(ctx, meson_args, jobs=None, clean=False, forcesubmodule=False, verbose=False):
     """Build scikit-tree using submodules.
 
-    git submodule update --recursive --remote
+        git submodule update --recursive --remote
 
     To update submodule wrt latest commits:
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -3,13 +3,13 @@
 Thanks for considering contributing! Please read this document to learn the various ways you can contribute to this project and how to go about doing it.
 
 **Submodule dependency on a fork of scikit-learn**
-Due to the current state of scikit-learn's internal Cython code for trees, we have to instead leverage a maintained fork of scikit-learn at https://github.com/neurodata/scikit-learn, where specifically, the `submodulev2` branch is used to build and install this repo. We keep that fork well-maintained and up-to-date with respect to the main sklearn repo. The only difference is the refactoring of the `tree/` submodule. This fork is used internally under the namespace ``sktree._lib.sklearn``. It is necessary to use this fork for anything related to:
+Due to the current state of scikit-learn's internal Cython code for trees, we have to instead leverage a maintained fork of scikit-learn at <https://github.com/neurodata/scikit-learn>, where specifically, the `submodulev3` branch is used to build and install this repo. We keep that fork well-maintained and up-to-date with respect to the main sklearn repo. The only difference is the refactoring of the `tree/` submodule. This fork is used internally under the namespace ``sktree._lib.sklearn``. It is necessary to use this fork for anything related to:
 
 - `RandomForest*`
 - `ExtraTrees*`
 - or any importable items from the `tree/` submodule, whether it is a Cython or Python object
 
-If you are developing for scikit-tree, we will always depend on the most up-to-date commit of `https://github.com/neurodata/scikit-learn/submodulev2` as a submodule within scikit-tee. This branch is consistently maintained for changes upstream that occur in the scikit-learn tree submodule. This ensures that our fork maintains consistency and robustness due to bug fixes and improvements upstream
+If you are developing for scikit-tree, we will always depend on the most up-to-date commit of `https://github.com/neurodata/scikit-learn/submodulev3` as a submodule within scikit-tee. This branch is consistently maintained for changes upstream that occur in the scikit-learn tree submodule. This ensures that our fork maintains consistency and robustness due to bug fixes and improvements upstream
 
 ## Bug reports and feature requests
 

diff --git a/sktree/_lib/sklearn_fork b/sktree/_lib/sklearn_fork
diff --git a/sktree/datasets/__init__.py b/sktree/datasets/__init__.py
@@ -0,0 +1 @@
+from .hyppo import make_quadratic_classification
diff --git a/sktree/datasets/hyppo.py b/sktree/datasets/hyppo.py
@@ -0,0 +1,45 @@
+import numpy as np
+
+
+def make_quadratic_classification(n_samples: int, n_features: int, noise=False, seed=None):
+    """Simulate classification data from a quadratic model.
+
+    This is a form of the simulation used in :footcite:`panda2018learning`.
+
+    Parameters
+    ----------
+    n_samples : int
+        The number of samples to generate.
+    n_features : int
+        The number of dimensions in the dataset.
+    noise : bool, optional
+        Whether or not to add noise, by default False.
+    seed : int, optional
+        Random seed, by default None.
+
+    Returns
+    -------
+    x : array-like, shape (n_samples, n_features)
+        Data array.
+    v : array-like, shape (n_samples,)
+        Target array of 1's and 0's.
+
+    References
+    ----------
+    .. footbibliography::
+    """
+    rng = np.random.default_rng(seed)
+
+    x = rng.standard_normal(size=(n_samples, n_features))
+    coeffs = np.array([np.exp(-0.0325 * (i + 24)) for i in range(n_features)])
+    eps = rng.standard_normal(size=(n_samples, n_features))
+
+    x_coeffs = x * coeffs
+    y = x_coeffs**2 + noise * eps
+
+    # generate the classification labels
+    n1 = x.shape[0]
+    n2 = y.shape[0]
+    v = np.vstack([np.zeros((n1, 1)), np.ones((n2, 1))])
+    x = np.vstack((x, y))
+    return x, v
diff --git a/sktree/datasets/meson.build b/sktree/datasets/meson.build
@@ -0,0 +1,10 @@
+python_sources = [
+  '__init__.py',
+  'hyppo.py',
+]
+
+py3.install_sources(
+  python_sources,
+  pure: false,
+  subdir: 'sktree/datasets'
+)
diff --git a/sktree/experimental/mutual_info.py b/sktree/experimental/mutual_info.py
@@ -147,7 +147,7 @@ def mutual_info_ksg(
     algorithm="kd_tree",
     n_jobs: int = -1,
     transform: str = "rank",
-    random_seed: int = None,
+    random_seed: Optional[int] = None,
 ):
     """Compute the generalized (conditional) mutual information KSG estimate.
 

diff --git a/sktree/meson.build b/sktree/meson.build
@@ -86,3 +86,4 @@ subdir('experimental')
 subdir('stats')
 subdir('tests')
 subdir('tree')
+subdir('datasets')
diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
@@ -1,4 +1,4 @@
-from typing import Callable, Tuple, Union
+from typing import Callable, Optional, Tuple, Union
 
 import numpy as np
 from joblib import Parallel, delayed
@@ -43,7 +43,7 @@
     predict_posteriors: bool,
     permute_per_tree: bool,
     type_of_target,
-    sample_weight: ArrayLike = None,
+    sample_weight: Optional[ArrayLike] = None,
     class_weight=None,
     missing_values_in_feature_mask=None,
     classes=None,
@@ -144,7 +144,7 @@
 
     @property
     def n_estimators(self):
-        return self.estimator_.n_estimators
+        return self._n_estimators
 
     def reset(self):
         class_attributes = dir(type(self))
@@ -174,7 +174,8 @@
             if self._seeds is None:
                 self._seeds = []
 
-                for tree in self.estimator_.estimators_:
+                for itree in range(self.estimator_.n_estimators):
+                    tree = self.estimator_.estimators_[itree]
                     if tree.random_state is None:
                         self._seeds.append(rng.integers(low=0, high=np.iinfo(np.int32).max))
                     else:
@@ -216,7 +217,7 @@
                 random_state=self._seeds,
             )
 
-            for _ in self.estimator_.estimators_:
+            for _ in range(self.estimator_.n_estimators):
                 yield indices_train, indices_test
 
     @property
@@ -255,7 +256,7 @@
     ):
         raise NotImplementedError("Subclasses should implement this!")
 
-    def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: ArrayLike = None):
+    def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: Optional[ArrayLike] = None):
         X, y = check_X_y(X, y, ensure_2d=True, copy=True, multi_output=True)
         if y.ndim != 2:
             y = y.reshape(-1, 1)
@@ -295,7 +296,7 @@
         self,
         X: ArrayLike,
         y: ArrayLike,
-        covariate_index: ArrayLike = None,
+        covariate_index: Optional[ArrayLike] = None,
         metric="mi",
         return_posteriors: bool = False,
         check_input: bool = True,
@@ -352,6 +353,27 @@
             self.permuted_estimator_ = self._get_estimator()
             estimator = self.permuted_estimator_
 
+            if not hasattr(self, "estimator_") or self.estimator_ is None:
+                self.estimator_ = self._get_estimator()
+
+                # Ensure that the estimator_ is fitted at least
+                if not _is_fitted(self.estimator_) and is_classifier(self.estimator_):
+                    _unique_y = []
+                    for axis in range(y.shape[1]):
+                        _unique_y.append(np.unique(y[:, axis]))
+                    unique_y = np.hstack(_unique_y)
+                    if unique_y.ndim > 1 and unique_y.shape[1] == 1:
+                        unique_y = unique_y.ravel()
+                    X_dummy = np.zeros((unique_y.shape[0], X.shape[1]))
+                    self.estimator_.fit(X_dummy, unique_y)
+                elif not _is_fitted(estimator):
+                    if y.ndim > 1 and y.shape[1] == 1:
+                        self.estimator_.fit(X[:2], y[:2].ravel())
+                    else:
+                        self.estimator_.fit(X[:2], y[:2])
+
+        self._n_estimators = estimator.n_estimators
+
         # Store a cache of the y variable
         if is_classifier(self._get_estimator()):
             self._y = y.copy()
@@ -392,7 +414,7 @@
             )
         self._metric = metric
 
-        if not is_classifier(self.estimator_) and metric not in REGRESSOR_METRICS:
+        if not is_classifier(estimator) and metric not in REGRESSOR_METRICS:
             raise RuntimeError(
                 f'Metric must be either "mse" or "mae" if using Regression, got {metric}'
             )
@@ -414,7 +436,7 @@
         self,
         X,
         y,
-        covariate_index: ArrayLike = None,
+        covariate_index: Optional[ArrayLike] = None,
         metric: str = "mi",
         n_repeats: int = 1000,
         return_posteriors: bool = True,
@@ -660,7 +682,7 @@
         self,
         X: ArrayLike,
         y: ArrayLike,
-        covariate_index: ArrayLike = None,
+        covariate_index: Optional[ArrayLike] = None,
         metric="mse",
         return_posteriors: bool = False,
         check_input: bool = True,

diff --git a/sktree/stats/permutationforest.py b/sktree/stats/permutationforest.py
@@ -1,3 +1,5 @@
+from typing import Optional
+
 import numpy as np
 from numpy.typing import ArrayLike
 from sklearn.base import MetaEstimatorMixin, clone, is_classifier
@@ -62,7 +64,7 @@ def _statistic(
         estimator: BaseForest,
         X: ArrayLike,
         y: ArrayLike,
-        covariate_index: ArrayLike = None,
+        covariate_index: Optional[ArrayLike] = None,
         metric="mse",
         return_posteriors: bool = False,
         seed=None,
@@ -117,7 +119,7 @@ def statistic(
         self,
         X: ArrayLike,
         y: ArrayLike,
-        covariate_index: ArrayLike = None,
+        covariate_index: Optional[ArrayLike] = None,
         metric="mse",
         return_posteriors: bool = False,
         check_input: bool = True,

diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py
@@ -32,7 +32,6 @@
             {
                 "estimator": RandomForestRegressor(
                     max_features="sqrt",
-                    random_state=seed,
                     n_estimators=75,
                     n_jobs=-1,
                 ),
@@ -47,7 +46,6 @@
             {
                 "estimator": RandomForestRegressor(
                     max_features="sqrt",
-                    # random_state=seed,
                     n_estimators=125,
                     n_jobs=-1,
                 ),
@@ -81,11 +79,10 @@
             {
                 "estimator": RandomForestRegressor(
                     max_features="sqrt",
-                    # random_state=seed,
                     n_estimators=125,
                     n_jobs=-1,
                 ),
-                # "random_state": seed,
+                "random_state": seed,
                 "permute_per_tree": True,
                 "sample_dataset_per_tree": False,
             },
@@ -151,7 +148,6 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size)
             {
                 "estimator": RandomForestClassifier(
                     max_features="sqrt",
-                    random_state=seed,
                     n_estimators=50,
                     n_jobs=-1,
                 ),
@@ -167,11 +163,10 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size)
             {
                 "estimator": RandomForestClassifier(
                     max_features="sqrt",
-                    # random_state=seed,
                     n_estimators=100,
                     n_jobs=-1,
                 ),
-                # "random_state": seed,
+                "random_state": seed,
                 "permute_per_tree": False,
                 "sample_dataset_per_tree": False,
             },
@@ -202,11 +197,10 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size)
             {
                 "estimator": RandomForestClassifier(
                     max_features="sqrt",
-                    # random_state=seed,
                     n_estimators=100,
                     n_jobs=-1,
                 ),
-                # "random_state": seed,
+                "random_state": seed,
                 "permute_per_tree": True,
                 "sample_dataset_per_tree": False,
             },

diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py
@@ -60,11 +60,19 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree):
     with pytest.raises(RuntimeError, match="Metric must be"):
         est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi")
 
+    # covariate index should work with mse
+    est.reset()
+    est.statistic(iris_X[:n_samples], iris_y[:n_samples], covariate_index=[1], metric="mse")
+    with pytest.raises(RuntimeError, match="Metric must be"):
+        est.statistic(iris_X[:n_samples], iris_y[:n_samples], covariate_index=[1], metric="mi")
+
     # covariate index must be an iterable
+    est.reset()
     with pytest.raises(RuntimeError, match="covariate_index must be an iterable"):
         est.statistic(iris_X[:n_samples], iris_y[:n_samples], 0, metric="mi")
 
     # covariate index must be an iterable of ints
+    est.reset()
     with pytest.raises(RuntimeError, match="Not all covariate_index"):
         est.statistic(iris_X[:n_samples], iris_y[:n_samples], [0, 1.0], metric="mi")
 

diff --git a/sktree/stats/utils.py b/sktree/stats/utils.py
@@ -1,4 +1,4 @@
-from typing import Tuple
+from typing import Optional, Tuple
 
 import numpy as np
 from numpy.typing import ArrayLike
@@ -112,7 +112,7 @@ def _compute_null_distribution_perm(
     est: ForestClassifier,
     metric: str = "mse",
     n_repeats: int = 1000,
-    seed: int = None,
+    seed: Optional[int] = None,
 ) -> ArrayLike:
     """Compute null distribution using permutation method.
 
@@ -173,7 +173,7 @@ def _compute_null_distribution_coleman(
     y_pred_proba_perm: ArrayLike,
     metric: str = "mse",
     n_repeats: int = 1000,
-    seed: int = None,
+    seed: Optional[int] = None,
 ) -> Tuple[ArrayLike, ArrayLike]:
     """Compute null distribution using Coleman method.