MAINT Clean up Cython files (#321)

* Clean up Cython files in oblique and morf splitter
* Migrate `self._validate_data` to `validate_data` in scikit-learn developer API
* Update spin to v0.12+
* Update the C standard from c99 to c11 (`c_std`; `cpp_std` remains c++14)

---------

Signed-off-by: Adam Li <adam2392@gmail.com>
neurodata · Sep 9, 2024 · 7e9dc22 · 7e9dc22
1 parent ea67d06
commit 7e9dc22
Show file tree

Hide file tree

Showing 19 changed files with 224 additions and 109 deletions.
diff --git a/.spin/cmds.py b/.spin/cmds.py
@@ -5,6 +5,7 @@
 import click
 from spin import util
 from spin.cmds import meson
+from spin.cmds.meson import build_dir_option
 
 
 def get_git_revision_hash(submodule) -> str:
@@ -145,14 +146,18 @@ def setup_submodule(forcesubmodule=False):
 @click.option(
     "--forcesubmodule", is_flag=True, help="Force submodule pull.", envvar="FORCE_SUBMODULE"
 )
+@build_dir_option
 @click.pass_context
 def build(
     ctx,
+    *,
     meson_args,
     jobs=None,
     clean=False,
     verbose=False,
     gcov=False,
+    quiet=False,
+    build_dir=None,
     forcesubmodule=False,
 ):
     """Build treeple using submodules.

diff --git a/build_requirements.txt b/build_requirements.txt
@@ -8,5 +8,5 @@ click
 rich-click
 doit
 pydevtool
-spin
+spin>=0.12
 build
diff --git a/meson.build b/meson.build
@@ -8,7 +8,7 @@ project(
   license: 'PolyForm Noncommercial 1.0.0',
   meson_version: '>= 1.1.0',
   default_options: [
-    'c_std=c99',
+    'c_std=c11',
     'cpp_std=c++14',
   ],
 )

diff --git a/pyproject.toml b/pyproject.toml
@@ -68,7 +68,7 @@ build = [
   'twine',
   'meson',
   'meson-python',
-  'spin',
+  'spin>=0.12',
   'doit',
   'scikit-learn>=1.5.0',
   'Cython>=3.0.10',

diff --git a/treeple/__init__.py b/treeple/__init__.py
@@ -22,6 +22,7 @@
 # https://github.com/ContinuumIO/anaconda-issues/issues/11294
 os.environ.setdefault("KMP_INIT_AT_FORK", "FALSE")
 
+
 try:
     # This variable is injected in the __builtins__ by the build
     # process. It is used to enable importing subpackages of sklearn when
@@ -64,7 +65,8 @@
         msg = """Error importing treeple: you cannot import treeple while
         being in treeple source directory; please exit the treeple source
         tree first and relaunch your Python interpreter."""
-        raise ImportError(msg) from e
+        raise Exception(e)
+        # raise ImportError(msg) from e
 
     __all__ = [
         "_lib",

diff --git a/treeple/_lib/meson.build b/treeple/_lib/meson.build
@@ -94,3 +94,22 @@ foreach ext: extensions
     subdir: 'treeple/_lib/sklearn/utils/',
   )
 endforeach
+
+
+# python_sources = [
+#   '__init__.py',
+# ]
+
+# py.install_sources(
+#   python_sources,
+#   subdir: 'treeple/_lib'   # Folder relative to site-packages to install to
+# )
+
+# tempita = files('./sklearn/_build_utils/tempita.py')
+
+# # Copy all the .py files to the install dir, rather than using
+# # py.install_sources and needing to list them explicitly one by one
+# # install_subdir('sklearn', install_dir: py.get_install_dir())
+# install_subdir('sklearn', install_dir: join_paths(py.get_install_dir(), 'treeple/_lib'))
+
+# subdir('sklearn')
diff --git a/treeple/_lib/sklearn_fork b/treeple/_lib/sklearn_fork
diff --git a/treeple/ensemble/_honest_forest.py b/treeple/ensemble/_honest_forest.py
@@ -720,8 +720,12 @@ def oob_samples_(self):
             oob_samples.append(_oob_samples)
         return oob_samples
 
-    def _more_tags(self):
-        return {"multioutput": False}
+    def __sklearn_tags__(self):
+        # XXX: nans should be supportable in HRF
+        tags = super().__sklearn_tags__()
+        tags.classifier_tags.multi_output = False
+        tags.input_tags.allow_nan = False
+        return tags
 
     def decision_path(self, X):
         """

diff --git a/treeple/ensemble/_unsupervised_forest.py b/treeple/ensemble/_unsupervised_forest.py
@@ -21,7 +21,12 @@
 )
 from sklearn.metrics import calinski_harabasz_score
 from sklearn.utils.parallel import Parallel, delayed
-from sklearn.utils.validation import _check_sample_weight, check_is_fitted, check_random_state
+from sklearn.utils.validation import (
+    _check_sample_weight,
+    check_is_fitted,
+    check_random_state,
+    validate_data,
+)
 
 from .._lib.sklearn.ensemble._forest import BaseForest
 from .._lib.sklearn.tree._tree import DTYPE
@@ -85,10 +90,9 @@ def fit(self, X, y=None, sample_weight=None):
         self : object
             Returns the instance itself.
         """
-        self._validate_params()
-
         # Validate or convert input data
-        X = self._validate_data(
+        X = validate_data(
+            self,
             X,
             dtype=DTYPE,  # accept_sparse="csc",
         )

diff --git a/treeple/meson.build b/treeple/meson.build
@@ -103,6 +103,7 @@ scikit_learn_cython_args = [
   '-X language_level=3', '-X boundscheck=' + boundscheck, '-X wraparound=False',
   '-X initializedcheck=False', '-X nonecheck=False', '-X cdivision=True',
   '-X profile=False',
+  '-X embedsignature=True',
   # Needed for cython imports across subpackages, e.g. cluster pyx that
   # cimports metrics pxd
   '--include-dir', meson.global_build_root(),

diff --git a/treeple/neighbors.py b/treeple/neighbors.py
@@ -5,8 +5,9 @@
 from sklearn.base import BaseEstimator, MetaEstimatorMixin
 from sklearn.exceptions import NotFittedError
 from sklearn.neighbors import NearestNeighbors
-from sklearn.utils.validation import check_is_fitted
+from sklearn.utils.validation import check_is_fitted, validate_data
 
+from treeple.tree import DecisionTreeClassifier
 from treeple.tree._neighbors import _compute_distance_matrix, compute_forest_similarity_matrix
 
 
@@ -31,13 +32,19 @@ class NearestNeighborsMetaEstimator(BaseEstimator, MetaEstimatorMixin):
         The number of parallel jobs to run for neighbors, by default None.
     """
 
-    def __init__(self, estimator, n_neighbors=5, radius=1.0, algorithm="auto", n_jobs=None):
+    def __init__(self, estimator=None, n_neighbors=5, radius=1.0, algorithm="auto", n_jobs=None):
         self.estimator = estimator
         self.n_neighbors = n_neighbors
         self.algorithm = algorithm
         self.radius = radius
         self.n_jobs = n_jobs
 
+    def get_estimator(self):  # returns the estimator that fit() should use
+        if self.estimator is None:  # nothing supplied: fall back to a default tree
+            return DecisionTreeClassifier(random_state=0)
+        else:  # shallow-copy the user's estimator so fit() never mutates the original
+            return copy(self.estimator)
+
     def fit(self, X, y=None):
         """Fit the nearest neighbors estimator from the training dataset.
 
@@ -56,9 +63,9 @@ def fit(self, X, y=None):
         self : object
             Fitted estimator.
         """
-        X, y = self._validate_data(X, y, accept_sparse="csc")
+        X, y = validate_data(self, X, y, accept_sparse="csc")
 
-        self.estimator_ = copy(self.estimator)
+        self.estimator_ = self.get_estimator()
         try:
             check_is_fitted(self.estimator_)
         except NotFittedError:

diff --git a/treeple/tree/_classes.py b/treeple/tree/_classes.py
@@ -8,7 +8,7 @@
 from sklearn.cluster import AgglomerativeClustering
 from sklearn.utils import check_random_state
 from sklearn.utils._param_validation import Interval
-from sklearn.utils.validation import check_is_fitted
+from sklearn.utils.validation import check_is_fitted, validate_data
 
 from .._lib.sklearn.tree import (
     BaseDecisionTree,
@@ -216,7 +216,7 @@ def fit(self, X, y=None, sample_weight=None, check_input=True):
         if check_input:
             # TODO: allow X to be sparse
             check_X_params = dict(dtype=DTYPE)  # , accept_sparse="csc"
-            X = self._validate_data(X, validate_separately=(check_X_params))
+            X = validate_data(self, X, validate_separately=(check_X_params))
             if issparse(X):
                 X.sort_indices()
 
@@ -378,6 +378,13 @@ def _assign_labels(self, affinity_matrix):
         predict_labels = cluster.fit_predict(affinity_matrix)
         return predict_labels
 
+    def __sklearn_tags__(self):
+        # XXX: nans should be supportable in SPORF by just using RF-like splits on missing values
+        # However, for MORF it is not supported
+        tags = super().__sklearn_tags__()
+        tags.input_tags.allow_nan = False
+        return tags
+
 
 class UnsupervisedObliqueDecisionTree(UnsupervisedDecisionTree):
     """Unsupervised oblique decision tree.
@@ -577,6 +584,13 @@ def _build_tree(
         builder.build(self.tree_, X, sample_weight)
         return self
 
+    def __sklearn_tags__(self):
+        # XXX: nans should be supportable in SPORF by just using RF-like splits on missing values
+        # However, for MORF it is not supported
+        tags = super().__sklearn_tags__()
+        tags.input_tags.allow_nan = False
+        return tags
+
 
 class ObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier):
     """An oblique decision tree classifier.
@@ -820,7 +834,7 @@ class ObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier):
 
     tree_type = "oblique"
 
-    _parameter_constraints = {
+    _parameter_constraints: dict = {
         **DecisionTreeClassifier._parameter_constraints,
         "feature_combinations": [
             Interval(Real, 1.0, None, closed="left"),
@@ -1070,6 +1084,13 @@ def _update_tree(self, X, y, sample_weight):
         self._prune_tree()
         return self
 
+    def __sklearn_tags__(self):
+        # XXX: nans should be supportable in SPORF by just using RF-like splits on missing values
+        # However, for MORF it is not supported
+        tags = super().__sklearn_tags__()
+        tags.input_tags.allow_nan = False
+        return tags
+
 
 class ObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
     """An oblique decision tree Regressor.
@@ -1283,7 +1304,7 @@ class ObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
 
     tree_type = "oblique"
 
-    _parameter_constraints = {
+    _parameter_constraints: dict = {
         **DecisionTreeRegressor._parameter_constraints,
         "feature_combinations": [
             Interval(Real, 1.0, None, closed="left"),
@@ -1450,6 +1471,13 @@ def _build_tree(
         builder.build(self.tree_, X, y, sample_weight, None)
         return self
 
+    def __sklearn_tags__(self):
+        # XXX: nans should be supportable in SPORF by just using RF-like splits on missing values
+        # However, for MORF it is not supported
+        tags = super().__sklearn_tags__()
+        tags.input_tags.allow_nan = False
+        return tags
+
 
 class PatchObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier):
     """A oblique decision tree classifier that operates over patches of data.
@@ -1684,7 +1712,7 @@ class PatchObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier)
     """
 
     tree_type = "oblique"
-    _parameter_constraints = {
+    _parameter_constraints: dict = {
         **DecisionTreeClassifier._parameter_constraints,
         "min_patch_dims": ["array-like", None],
         "max_patch_dims": ["array-like", None],
@@ -1798,8 +1826,8 @@ def _build_tree(
         self.feature_combinations_ = 1
 
         if self.feature_weight is not None:
-            self.feature_weight = self._validate_data(
-                self.feature_weight, ensure_2d=True, dtype=DTYPE
+            self.feature_weight = validate_data(
+                self, self.feature_weight, ensure_2d=True, dtype=DTYPE
             )
             if self.feature_weight.shape != X.shape:
                 raise ValueError(
@@ -1927,11 +1955,13 @@ def _build_tree(
 
         return self
 
-    def _more_tags(self):
+    def __sklearn_tags__(self):
         # XXX: nans should be supportable in SPORF by just using RF-like splits on missing values
         # However, for MORF it is not supported
-        allow_nan = False
-        return {"multilabel": True, "allow_nan": allow_nan}
+        tags = super().__sklearn_tags__()
+        tags.classifier_tags.multi_label = True
+        tags.input_tags.allow_nan = False
+        return tags
 
     @property
     def _inheritable_fitted_attribute(self):
@@ -2166,7 +2196,7 @@ class PatchObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
     """
 
     tree_type = "oblique"
-    _parameter_constraints = {
+    _parameter_constraints: dict = {
         **DecisionTreeRegressor._parameter_constraints,
         "min_patch_dims": ["array-like", None],
         "max_patch_dims": ["array-like", None],
@@ -2277,8 +2307,8 @@ def _build_tree(
         self.feature_combinations_ = 1
 
         if self.feature_weight is not None:
-            self.feature_weight = self._validate_data(
-                self.feature_weight, ensure_2d=True, dtype=DTYPE
+            self.feature_weight = validate_data(
+                self, self.feature_weight, ensure_2d=True, dtype=DTYPE
             )
             if self.feature_weight.shape != X.shape:
                 raise ValueError(
@@ -2407,11 +2437,13 @@ def _build_tree(
 
         return self
 
-    def _more_tags(self):
+    def __sklearn_tags__(self):
         # XXX: nans should be supportable in SPORF by just using RF-like splits on missing values
         # However, for MORF it is not supported
-        allow_nan = False
-        return {"multilabel": True, "allow_nan": allow_nan}
+        tags = super().__sklearn_tags__()
+        tags.regressor_tags.multi_label = True
+        tags.input_tags.allow_nan = False
+        return tags
 
 
 class ExtraObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier):
@@ -2669,7 +2701,7 @@ class ExtraObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier)
 
     tree_type = "oblique"
 
-    _parameter_constraints = {
+    _parameter_constraints: dict = {
         **DecisionTreeClassifier._parameter_constraints,
         "feature_combinations": [
             Interval(Real, 1.0, None, closed="left"),
@@ -2846,6 +2878,13 @@ def _inheritable_fitted_attribute(self):
             "feature_combinations_",
         ]
 
+    def __sklearn_tags__(self):
+        # XXX: nans should be supportable in SPORF by just using RF-like splits on missing values
+        # However, for MORF it is not supported
+        tags = super().__sklearn_tags__()
+        tags.input_tags.allow_nan = False
+        return tags
+
 
 class ExtraObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
     """An oblique decision tree Regressor.
@@ -3069,7 +3108,7 @@ class ExtraObliqueDecisionTreeRegressor(SimMatrixMixin, DecisionTreeRegressor):
         -0.26552594, -0.00642017, -0.07108117, -0.40726765, -0.40315294])
     """
 
-    _parameter_constraints = {
+    _parameter_constraints: dict = {
         **DecisionTreeRegressor._parameter_constraints,
         "feature_combinations": [
             Interval(Real, 1.0, None, closed="left"),
@@ -3237,3 +3276,10 @@ def _build_tree(
         builder.build(self.tree_, X, y, sample_weight)
 
         return self
+
+    def __sklearn_tags__(self):
+        # XXX: nans should be supportable in SPORF by just using RF-like splits on missing values
+        # However, for MORF it is not supported
+        tags = super().__sklearn_tags__()
+        tags.input_tags.allow_nan = False
+        return tags