neurodata · adam2392 · Oct 19, 2023 · Oct 12, 2023 · Oct 12, 2023 · Oct 16, 2023
diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
@@ -161,7 +161,26 @@ def reset(self):
         self._is_fitted = False
         self._seeds = None
 
-    def _get_estimators_indices(self, sample_separate=False):
+    def _get_estimators_indices(self, stratifier, sample_separate=False):
+
+        # Validate the stratifier against the previously fitted dataset.
+        # If stratifier is None (the regressor case), the split is not stratified.
+        if stratifier is not None:
+            if self._n_samples_ is not None and stratifier.shape[0] != self._n_samples_:
+                raise RuntimeError(
+                    f"stratifier must have {self._n_samples_} samples, got {stratifier.shape[0]}. "
+                    f"If running on a new dataset, call the 'reset' method."
+                )
+
+            if (
+                self._type_of_target_ is not None
+                and type_of_target(stratifier) != self._type_of_target_
+            ):
+                raise RuntimeError(
+                    f"stratifier must have type {self._type_of_target_}, got {type_of_target(stratifier)}. "
+                    f"If running on a new dataset, call the 'reset' method."
+                )
+
         indices = np.arange(self._n_samples_, dtype=int)
 
         # Get drawn indices along both sample and feature axes
@@ -191,7 +210,11 @@ def _get_estimators_indices(self, sample_separate=False):
                 # Operations accessing random_state must be performed identically
                 # to those in `_parallel_build_trees()`
                 indices_train, indices_test = train_test_split(
-                    indices, test_size=self.test_size, shuffle=True, random_state=seed
+                    indices,
+                    test_size=self.test_size,
+                    shuffle=True,
+                    stratify=stratifier,
+                    random_state=seed,
                 )
 
                 yield indices_train, indices_test
@@ -206,13 +229,14 @@ def _get_estimators_indices(self, sample_separate=False):
             indices_train, indices_test = train_test_split(
                 indices,
                 test_size=self.test_size,
+                stratify=stratifier,
                 random_state=self._seeds,
             )
+
             for _ in self.estimator_.estimators_:
                 yield indices_train, indices_test
 
-    @property
-    def train_test_samples_(self):
+    def train_test_samples_(self, stratifier):
         """
         The subset of drawn samples for each base estimator.
 
@@ -229,7 +253,7 @@ def train_test_samples_(self):
 
         return [
             (indices_train, indices_test)
-            for indices_train, indices_test in self._get_estimators_indices()
+            for indices_train, indices_test in self._get_estimators_indices(stratifier=stratifier)
         ]
 
     def _statistic(
@@ -462,10 +486,10 @@ def test(
             observe_posteriors = self.observe_posteriors_
             observe_stat = self.observe_stat_
 
-        # next permute the data
         if covariate_index is None:
             covariate_index = np.arange(X.shape[1], dtype=int)
 
+        # next permute the data
         permute_stat, permute_posteriors, permute_samples = self.statistic(
             X,
             y,
@@ -493,7 +517,7 @@ def test(
             # If not sampling a new dataset per tree, then we may either be
             # permuting the covariate index per tree or per forest. If not permuting
             # there is only one train and test split, so we can just use that
-            _, indices_test = self.train_test_samples_[0]
+            _, indices_test = self.train_test_samples_(stratifier=y)[0]
             indices_test = observe_samples
             y_test = y[indices_test, :]
             y_pred_proba_normal = observe_posteriors[:, indices_test, :]
@@ -725,12 +749,12 @@ def _statistic(
                     self._type_of_target_,
                 )
                 for idx, (indices_train, indices_test) in enumerate(
-                    self._get_estimators_indices(sample_separate=True)
+                    self._get_estimators_indices(y, sample_separate=True)
                 )
             )
         else:
             # fitting a forest will only get one unique train/test split
-            indices_train, indices_test = self.train_test_samples_[0]
+            indices_train, indices_test = self.train_test_samples_(stratifier=None)[0]
 
             X_train, X_test = X[indices_train, :], X[indices_test, :]
             y_train, y_test = y[indices_train, :], y[indices_test, :]
@@ -946,12 +970,12 @@ def _statistic(
                     self._type_of_target_,
                 )
                 for idx, (indices_train, indices_test) in enumerate(
-                    self._get_estimators_indices(sample_separate=True)
+                    self._get_estimators_indices(y, sample_separate=True)
                 )
             )
         else:
             # fitting a forest will only get one unique train/test split
-            indices_train, indices_test = self.train_test_samples_[0]
+            indices_train, indices_test = self.train_test_samples_(stratifier=y)[0]
 
             X_train, X_test = X[indices_train, :], X[indices_test, :]
             y_train = y[indices_train, :]