From 2de069e23e897dcfb88860dbea900b4fd0424bf7 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 13 Mar 2024 16:18:54 -0400 Subject: [PATCH] MAINT Adding meson compiler directives (#242) * Adding meson compiler directives and C++ compiler optimizations --------- Signed-off-by: Adam Li --- doc/whats_new/v0.8.rst | 5 + meson.build | 36 ++++--- sktree/__init__.py | 1 + sktree/_build_utils/cythoner.py | 40 -------- sktree/_lib/meson.build | 60 ++++++----- sktree/_lib/sklearn_fork | 2 +- sktree/datasets/meson.build | 2 +- sktree/datasets/tests/meson.build | 2 +- sktree/ensemble/meson.build | 2 +- sktree/experimental/meson.build | 2 +- sktree/experimental/tests/meson.build | 2 +- sktree/meson.build | 99 +++++++++++++------ sktree/stats/meson.build | 2 +- sktree/stats/tests/meson.build | 2 +- sktree/tests/meson.build | 2 +- sktree/tree/_oblique_splitter.pxd | 8 +- sktree/tree/_oblique_splitter.pyx | 4 +- sktree/tree/_oblique_tree.pxd | 2 +- sktree/tree/manifold/meson.build | 25 +++-- sktree/tree/meson.build | 41 +++++--- sktree/tree/tests/meson.build | 2 +- .../unsupervised/_unsup_oblique_splitter.pxd | 28 +++--- .../unsupervised/_unsup_oblique_splitter.pyx | 1 - .../tree/unsupervised/_unsup_oblique_tree.pxd | 2 +- sktree/tree/unsupervised/_unsup_splitter.pxd | 1 - sktree/tree/unsupervised/_unsup_splitter.pyx | 7 +- sktree/tree/unsupervised/_unsup_tree.pyx | 14 +-- sktree/tree/unsupervised/meson.build | 41 +++++--- 28 files changed, 247 insertions(+), 188 deletions(-) delete mode 100644 sktree/_build_utils/cythoner.py diff --git a/doc/whats_new/v0.8.rst b/doc/whats_new/v0.8.rst index 2637a0cf8..0da140f70 100644 --- a/doc/whats_new/v0.8.rst +++ b/doc/whats_new/v0.8.rst @@ -14,6 +14,11 @@ Changelog --------- - |Fix| Trunk simulators now correctly generate random values with a fixed seed, by `Sambit Panda`_ (:pr:`#236`) +- |Efficiency| All scikit-tree estimators are now at least 2X faster than they were + in previous versions. 
This was due to adding compiler directives to turn on + optimizations '-O3' when compiling the C++ generated code from Cython. In addition, + we explicitly turned off bounds-checking and related runtime checks in the Cython code, + which otherwise degrade runtime performance, by `Adam Li`_ (:pr:`#242`) Code and Documentation Contributors ----------------------------------- diff --git a/meson.build b/meson.build index 420025437..822d2413d 100644 --- a/meson.build +++ b/meson.build @@ -1,12 +1,12 @@ project( 'scikit-tree', - 'c', 'cpp', + 'c', 'cpp', 'cython', # Note that the git commit hash cannot be added dynamically here # That only happens when importing from a git repository. # See `sktree/__init__.py` version: '0.8.0dev0', license: 'BSD-3', - meson_version: '>= 0.64.0', + meson_version: '>= 1.1.0', default_options: [ 'buildtype=debugoptimized', 'c_std=c99', @@ -29,6 +29,21 @@ elif cc.get_id() == 'msvc' endif endif +# Suppress warning for deprecated Numpy API. +# Replace with numpy_nodepr_api after Cython 3.0 is out +# '-Wno-maybe-uninitialized' +# numpy_nodepr_api = '-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION' + +# (Suppress warning messages emitted by #warning directives). +_global_c_args = cc.get_supported_arguments( + '-Wno-unused-but-set-variable', + '-Wno-unused-function', + '-Wno-conversion', + '-Wno-misleading-indentation', + '-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION', +) +add_project_arguments(_global_c_args, language : 'c') + # We need -lm for all C code (assuming it uses math functions, which is safe to # assume for scikit-tree). For C++ it isn't needed, because libstdc++/libc++ is # guaranteed to depend on it. For Fortran code, Meson already adds `-lm`. 
@@ -50,22 +65,13 @@ r = run_command('mv', 'sktree/_lib/sklearn_fork/sklearn', 'sktree/_lib/sklearn', # Setup Python: # https://mesonbuild.com/Python-module.html -py3_mod = import('python') - -# NOTE: with Meson >=0.64.0 we can add `pure: false` here and remove that line -# everywhere else, see https://github.com/mesonbuild/meson/pull/10783. -py3 = py3_mod.find_installation( - 'python3', - pure: false # Will be installed next to binaries -) -# py3.install_env('venv') +py = import('python').find_installation(pure: false) # print some debugging output -message(py3.full_path()) -message(py3.get_install_dir()) -if py3.language_version().version_compare('<3.9') +message(py.full_path()) +message(py.get_install_dir()) +if py.language_version().version_compare('<3.9') error('At least Python 3.9 is required.') endif -py3_dep = py3.dependency() subdir('sktree') diff --git a/sktree/__init__.py b/sktree/__init__.py index 9be0aff3b..793895f86 100644 --- a/sktree/__init__.py +++ b/sktree/__init__.py @@ -4,6 +4,7 @@ import os import sys +print("using current cython branch.") __version__ = "0.8.0dev0" logger = logging.getLogger(__name__) diff --git a/sktree/_build_utils/cythoner.py b/sktree/_build_utils/cythoner.py deleted file mode 100644 index ff3481064..000000000 --- a/sktree/_build_utils/cythoner.py +++ /dev/null @@ -1,40 +0,0 @@ -#!python -""" Scipy variant of Cython command - -Cython, as applied to single pyx file. -Expects two arguments, infile and outfile. -Other options passed through to cython command line parser. 
-""" - -import os -import os.path as op -import subprocess as sbp -import sys - - -def main(): - in_fname, out_fname = (op.abspath(p) for p in sys.argv[1:3]) - - print("\n\ninside cythoner: ") - print("input file: ", in_fname) - print("output file: ", out_fname) - print(os.getcwd(), "\n\n") - - sbp.run( - [ - "cython", - "-3", - "--fast-fail", - "--output-file", - out_fname, - "--include-dir", - f"{os.getcwd()}", - ] - + sys.argv[3:] - + [in_fname], - check=True, - ) - - -if __name__ == "__main__": - main() diff --git a/sktree/_lib/meson.build b/sktree/_lib/meson.build index 10dade643..9e8c01786 100644 --- a/sktree/_lib/meson.build +++ b/sktree/_lib/meson.build @@ -4,20 +4,31 @@ if not fs.exists('sklearn') endif # install tree/ submodule -extensions = [ - '_tree', - '_utils', - '_criterion', - '_splitter', -] +tree_extension_metadata = { + '_tree': + {'sources': ['./sklearn/tree/' + '_tree.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, + '_splitter': + {'sources': ['./sklearn/tree/' + '_splitter.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, + '_criterion': + {'sources': ['./sklearn/tree/' + '_criterion.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, + '_utils': + {'sources': ['./sklearn/tree/' + '_utils.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, +} -foreach ext: extensions - py3.extension_module(ext, - cython_gen_cpp.process('./sklearn/tree/' + ext + '.pyx'), - c_args: cython_c_args, - include_directories: [incdir_numpy,], - install: true, + +foreach ext_name, ext_dict : tree_extension_metadata + py.extension_module( + ext_name, + ext_dict.get('sources'), + dependencies: [np_dep], + override_options : ext_dict.get('override_options', []), + cython_args: cython_c_args, subdir: 'sktree/_lib/sklearn/tree/', + install: true ) endforeach @@ -28,7 +39,7 @@ python_sources = [ './sklearn/tree/_reingold_tilford.py', ] -py3.install_sources( +py.install_sources( 
python_sources, subdir: 'sktree/_lib/sklearn/tree' # Folder relative to site-packages to install to ) @@ -38,7 +49,7 @@ python_sources = [ '_forest.py', ] foreach py_source: python_sources - py3.install_sources( + py.install_sources( './sklearn/ensemble/' + py_source, subdir: 'sktree/_lib/sklearn/ensemble' ) @@ -51,10 +62,13 @@ extensions = [ ] foreach ext: extensions - py3.extension_module(ext, - cython_gen_cpp.process('./sklearn/neighbors/' + ext + '.pyx'), - c_args: cython_c_args, - include_directories: [incdir_numpy,], + py.extension_module( + ext, + ['./sklearn/neighbors/' + ext + '.pyx'], + c_args: c_args, + dependencies: [np_dep], + cython_args: cython_c_args, + override_options : ['optimization=3', 'cython_language=cpp'], install: true, subdir: 'sktree/_lib/sklearn/neighbors/', ) @@ -67,10 +81,12 @@ extensions = [ ] foreach ext: extensions - py3.extension_module(ext, - cython_gen_cpp.process('./sklearn/utils/' + ext + '.pyx'), - c_args: cython_c_args, - include_directories: [incdir_numpy,], + py.extension_module(ext, + ['./sklearn/utils/' + ext + '.pyx'], + c_args: c_args, + dependencies: [np_dep], + cython_args: cython_c_args, + override_options : ['optimization=3', 'cython_language=cpp'], install: true, subdir: 'sktree/_lib/sklearn/utils/', ) diff --git a/sktree/_lib/sklearn_fork b/sktree/_lib/sklearn_fork index db5b137c1..b61ae3d54 160000 --- a/sktree/_lib/sklearn_fork +++ b/sktree/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit db5b137c1c1d2cb90aed2354dcb5b933e1df803b +Subproject commit b61ae3d546ba4199dc3badf4bd89971d2d75e9df diff --git a/sktree/datasets/meson.build b/sktree/datasets/meson.build index 8bd2882ff..4a5539827 100644 --- a/sktree/datasets/meson.build +++ b/sktree/datasets/meson.build @@ -4,7 +4,7 @@ python_sources = [ 'hyppo.py', ] -py3.install_sources( +py.install_sources( python_sources, pure: false, subdir: 'sktree/datasets' diff --git a/sktree/datasets/tests/meson.build b/sktree/datasets/tests/meson.build index 79b1833da..d450c3488 
100644 --- a/sktree/datasets/tests/meson.build +++ b/sktree/datasets/tests/meson.build @@ -4,7 +4,7 @@ python_sources = [ 'test_multiview.py', ] -py3.install_sources( +py.install_sources( python_sources, pure: false, subdir: 'sktree/datasets/tests' diff --git a/sktree/ensemble/meson.build b/sktree/ensemble/meson.build index c5801c904..0e0cdf092 100644 --- a/sktree/ensemble/meson.build +++ b/sktree/ensemble/meson.build @@ -8,7 +8,7 @@ python_sources = [ '_extensions.py', ] -py3.install_sources( +py.install_sources( python_sources, pure: false, subdir: 'sktree/ensemble' diff --git a/sktree/experimental/meson.build b/sktree/experimental/meson.build index 2d1dde609..4dbbc4acc 100644 --- a/sktree/experimental/meson.build +++ b/sktree/experimental/meson.build @@ -6,7 +6,7 @@ python_sources = [ 'monte_carlo.py', ] -py3.install_sources( +py.install_sources( python_sources, pure: false, subdir: 'sktree/experimental' diff --git a/sktree/experimental/tests/meson.build b/sktree/experimental/tests/meson.build index c3fdd07c4..e3bbc474e 100644 --- a/sktree/experimental/tests/meson.build +++ b/sktree/experimental/tests/meson.build @@ -6,7 +6,7 @@ python_sources = [ 'test_monte_carlo.py', ] -py3.install_sources( +py.install_sources( python_sources, pure: false, subdir: 'sktree/experimental/tests' diff --git a/sktree/meson.build b/sktree/meson.build index aa46fe60f..7da2e8a17 100644 --- a/sktree/meson.build +++ b/sktree/meson.build @@ -2,6 +2,7 @@ is_windows = host_machine.system() == 'windows' is_mingw = is_windows and cc.get_id() == 'gcc' +c_args = [] cython_c_args = [] if is_windows # For mingw-w64, link statically against the UCRT. @@ -25,19 +26,61 @@ if is_windows # Silence warnings emitted by PyOS_snprintf for (%zd), see # https://github.com/rgommers/scipy/issues/118. 
# Use as c_args for extensions containing Cython code - cython_c_args += ['-Wno-format-extra-args', '-Wno-format'] + c_args += ['-Wno-format-extra-args', '-Wno-format'] endif endif +openmp_dep = dependency('OpenMP', language: 'c', required: false) + +if not openmp_dep.found() + warning( +''' + *********** + * WARNING * + *********** + +It seems that scikit-tree cannot be built with OpenMP. + +- Make sure you have followed the installation instructions: + + https://scikit-learn.org/dev/developers/advanced_installation.html + +- If your compiler supports OpenMP but you still see this + message, please submit a bug report at: + + https://github.com/scikit-tree/scikit-tree/issues + +- The build will continue with OpenMP-based parallelism + disabled. Note however that some estimators will run in + sequential mode instead of leveraging thread-based + parallelism. + + *** +''') +endif + # NumPy include directory - needed in all submodules -incdir_numpy = run_command(py3, - [ - '-c', - 'import os; os.chdir(".."); import numpy; print(numpy.get_include())' - ], - check: true -).stdout().strip() -# inc_np = include_directories(incdir_numpy) +incdir_numpy = meson.get_external_property('numpy-include-dir', 'not-given') +if incdir_numpy == 'not-given' + incdir_numpy = run_command(py, + [ + '-c', + ''' +import os +import numpy as np +try: + incdir = os.path.relpath(np.get_include()) +except Exception: + incdir = np.get_include() +print(incdir) +''' + ], + check: true + ).stdout().strip() +endif + +inc_np = include_directories(incdir_numpy) +np_dep = declare_dependency(include_directories: inc_np) cc = meson.get_compiler('c') @@ -45,11 +88,25 @@ cc = meson.get_compiler('c') # NPY_API_VERSION in order not to break compilation for released versions # when NumPy introduces a new deprecation. 
Use in a meson.build file:: # -# py3.extension_module('_name', +# py.extension_module('_name', # 'source_fname', # numpy_nodepr_api) numpy_nodepr_api = '-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION' -cython_c_args += numpy_nodepr_api + +# XXX: ENABLE WHEN DEBUGGING +boundscheck = 'False' + +scikit_learn_cython_args = [ + '-X language_level=3', '-X boundscheck=' + boundscheck, '-X wraparound=False', + '-X initializedcheck=False', '-X nonecheck=False', '-X cdivision=True', + '-X profile=False', + # Needed for cython imports across subpackages, e.g. cluster pyx that + # cimports metrics pxd + '--include-dir', meson.global_build_root(), +] +cython_c_args += scikit_learn_cython_args + +c_args += numpy_nodepr_api python_sources = [ '__init__.py', @@ -57,29 +114,11 @@ python_sources = [ 'conftest.py', ] -py3.install_sources( +py.install_sources( python_sources, subdir: 'sktree' ) -cython_cli = find_program('_build_utils/cythoner.py') - -cython_gen = generator(cython_cli, - arguments : ['@INPUT@', '@OUTPUT@'], - output : '@BASENAME@.c') - -cython_gen_cpp = generator(cython_cli, - arguments : ['@INPUT@', '@OUTPUT@', '--cplus'], - output : '@BASENAME@.cpp') - -c_undefined_ok = ['-Wno-maybe-uninitialized'] - -# Suppress warning for deprecated Numpy API. -# (Suppress warning messages emitted by #warning directives). 
-# Replace with numpy_nodepr_api after Cython 3.0 is out -cython_c_args += ['-Wno-cpp'] -cython_cpp_args = cython_c_args - subdir('_lib') subdir('ensemble') subdir('experimental') diff --git a/sktree/stats/meson.build b/sktree/stats/meson.build index 5234538ec..b99353c80 100644 --- a/sktree/stats/meson.build +++ b/sktree/stats/meson.build @@ -7,7 +7,7 @@ python_sources = [ 'permuteforest.py', ] -py3.install_sources( +py.install_sources( python_sources, pure: false, subdir: 'sktree/stats' diff --git a/sktree/stats/tests/meson.build b/sktree/stats/tests/meson.build index 27a38d8b7..e4be1f212 100644 --- a/sktree/stats/tests/meson.build +++ b/sktree/stats/tests/meson.build @@ -6,7 +6,7 @@ python_sources = [ 'test_permuteforest.py', ] -py3.install_sources( +py.install_sources( python_sources, pure: false, subdir: 'sktree/stats/tests' diff --git a/sktree/tests/meson.build b/sktree/tests/meson.build index b3b8ce8eb..d9baa3418 100644 --- a/sktree/tests/meson.build +++ b/sktree/tests/meson.build @@ -9,7 +9,7 @@ python_sources = [ 'test_extensions.py', ] -py3.install_sources( +py.install_sources( python_sources, pure: false, subdir: 'sktree/tests' diff --git a/sktree/tree/_oblique_splitter.pxd b/sktree/tree/_oblique_splitter.pxd index 15feaea22..3f17b8c6b 100644 --- a/sktree/tree/_oblique_splitter.pxd +++ b/sktree/tree/_oblique_splitter.pxd @@ -30,6 +30,11 @@ cdef struct ObliqueSplitRecord: float64_t improvement # Impurity improvement given parent node. float64_t impurity_left # Impurity of the left split. float64_t impurity_right # Impurity of the right split. + float64_t lower_bound # Lower bound on value of both children for monotonicity + float64_t upper_bound # Upper bound on value of both children for monotonicity + unsigned char missing_go_to_left # Controls if missing values go to the left node. 
+ intp_t n_missing # Number of missing values for the feature being split on + intp_t n_constant_features # Number of constant features in the split vector[float32_t]* proj_vec_weights # weights of the vector (max_features,) vector[intp_t]* proj_vec_indices # indices of the features (max_features,) @@ -80,7 +85,6 @@ cdef class BaseObliqueSplitter(Splitter): self, float64_t impurity, # Impurity of the node SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound, ) except -1 nogil @@ -116,7 +120,6 @@ cdef class BestObliqueSplitter(ObliqueSplitter): self, float64_t impurity, # Impurity of the node SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound, ) except -1 nogil @@ -139,7 +142,6 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): self, float64_t impurity, # Impurity of the node SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound, ) except -1 nogil diff --git a/sktree/tree/_oblique_splitter.pyx b/sktree/tree/_oblique_splitter.pyx index aee43b1ce..23b6e722d 100644 --- a/sktree/tree/_oblique_splitter.pyx +++ b/sktree/tree/_oblique_splitter.pyx @@ -292,7 +292,6 @@ cdef class BestObliqueSplitter(ObliqueSplitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound, ) except -1 nogil: @@ -431,6 +430,7 @@ cdef class BestObliqueSplitter(ObliqueSplitter): deref(oblique_split).improvement = best_split.improvement deref(oblique_split).impurity_left = best_split.impurity_left deref(oblique_split).impurity_right = best_split.impurity_right + deref(oblique_split).n_constant_features = 0 return 0 cdef class RandomObliqueSplitter(ObliqueSplitter): @@ -498,7 +498,6 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound, ) except -1 nogil: @@ -656,6 +655,7 @@ cdef 
class RandomObliqueSplitter(ObliqueSplitter): deref(oblique_split).improvement = best_split.improvement deref(oblique_split).impurity_left = best_split.impurity_left deref(oblique_split).impurity_right = best_split.impurity_right + # deref(oblique_split).n_constant_features = 0 return 0 diff --git a/sktree/tree/_oblique_tree.pxd b/sktree/tree/_oblique_tree.pxd index 50c7949bf..b94531813 100644 --- a/sktree/tree/_oblique_tree.pxd +++ b/sktree/tree/_oblique_tree.pxd @@ -33,7 +33,7 @@ cdef class ObliqueTree(Tree): SplitRecord* split_node, Node *node, intp_t node_id - ) nogil except -1 + ) except -1 nogil cdef float32_t _compute_feature( self, const float32_t[:, :] X_ndarray, diff --git a/sktree/tree/manifold/meson.build b/sktree/tree/manifold/meson.build index 4e1ae7b85..91ed92a3d 100644 --- a/sktree/tree/manifold/meson.build +++ b/sktree/tree/manifold/meson.build @@ -1,18 +1,23 @@ -extensions = [ - '_morf_splitter', -] +tree_extension_metadata = { + '_morf_splitter': + {'sources': ['_morf_splitter.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, +} -foreach ext: extensions - py3.extension_module(ext, - cython_gen_cpp.process(ext + '.pyx'), - c_args: cython_c_args, - include_directories: [incdir_numpy], - install: true, +foreach ext_name, ext_dict : tree_extension_metadata + py.extension_module( + ext_name, + ext_dict.get('sources'), + dependencies: [np_dep], + override_options : ext_dict.get('override_options', []), + c_args: c_args, + cython_args: cython_c_args, subdir: 'sktree/tree/manifold', + install: true, ) endforeach -py3.install_sources( +py.install_sources( subdir: 'sktree/tree/manifold' # Folder relative to site-packages to install to ) diff --git a/sktree/tree/meson.build b/sktree/tree/meson.build index cc13bc1b4..9737016af 100644 --- a/sktree/tree/meson.build +++ b/sktree/tree/meson.build @@ -1,18 +1,31 @@ -extensions = [ - '_sklearn_splitter', - '_oblique_splitter', - '_oblique_tree', - '_utils', - '_marginal', -] 
+tree_extension_metadata = { + '_sklearn_splitter': + {'sources': ['_sklearn_splitter.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, + '_oblique_splitter': + {'sources': ['_oblique_splitter.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, + '_oblique_tree': + {'sources': ['_oblique_tree.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, + '_utils': + {'sources': ['_utils.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, + '_marginal': + {'sources': ['_marginal.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, +} -foreach ext: extensions - py3.extension_module(ext, - cython_gen_cpp.process(ext + '.pyx'), - c_args: cython_c_args, - include_directories: [incdir_numpy], - install: true, +foreach ext_name, ext_dict : tree_extension_metadata + py.extension_module( + ext_name, + ext_dict.get('sources'), + dependencies: [np_dep], + override_options : ext_dict.get('override_options', []), + c_args: c_args, + cython_args: cython_c_args, subdir: 'sktree/tree', + install: true, ) endforeach @@ -25,7 +38,7 @@ python_sources = [ '_marginalize.py', ] -py3.install_sources( +py.install_sources( python_sources, subdir: 'sktree/tree' # Folder relative to site-packages to install to ) diff --git a/sktree/tree/tests/meson.build b/sktree/tree/tests/meson.build index c88eeda7c..44717eb2a 100644 --- a/sktree/tree/tests/meson.build +++ b/sktree/tree/tests/meson.build @@ -9,7 +9,7 @@ python_sources = [ 'test_multiview.py', ] -py3.install_sources( +py.install_sources( python_sources, pure: false, subdir: 'sktree/tree/tests' diff --git a/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd b/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd index 1e65a49b1..d2b47fde7 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd +++ b/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd @@ -6,22 +6,23 @@ from libcpp.vector cimport vector from 
..._lib.sklearn.tree._splitter cimport SplitRecord from ..._lib.sklearn.tree._utils cimport UINT32_t from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t +from .._oblique_splitter cimport ObliqueSplitRecord from ._unsup_splitter cimport UnsupervisedSplitter +# cdef struct ObliqueSplitRecord: +# # Data to track sample split +# intp_t feature # Which feature to split on. +# intp_t pos # Split samples array at the given position, +# # # i.e. count of samples below threshold for feature. +# # # pos is >= end if the node is a leaf. +# float64_t threshold # Threshold to split at. +# float64_t improvement # Impurity improvement given parent node. +# float64_t impurity_left # Impurity of the left split. +# float64_t impurity_right # Impurity of the right split. +# intp_t n_constant_features # Number of constant features in the split. -cdef struct ObliqueSplitRecord: - # Data to track sample split - intp_t feature # Which feature to split on. - intp_t pos # Split samples array at the given position, - # # i.e. count of samples below threshold for feature. - # # pos is >= end if the node is a leaf. - float64_t threshold # Threshold to split at. - float64_t improvement # Impurity improvement given parent node. - float64_t impurity_left # Impurity of the left split. - float64_t impurity_right # Impurity of the right split. 
- - vector[float32_t]* proj_vec_weights # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) +# vector[float32_t]* proj_vec_weights # weights of the vector (max_features,) +# vector[intp_t]* proj_vec_indices # indices of the features (max_features,) cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): @@ -56,7 +57,6 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): self, float64_t impurity, # Impurity of the node SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil diff --git a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx index 53b2bbd43..3770b11c8 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx @@ -246,7 +246,6 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound, ) except -1 nogil: diff --git a/sktree/tree/unsupervised/_unsup_oblique_tree.pxd b/sktree/tree/unsupervised/_unsup_oblique_tree.pxd index 5292551b9..93b50600a 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_tree.pxd +++ b/sktree/tree/unsupervised/_unsup_oblique_tree.pxd @@ -32,7 +32,7 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): SplitRecord* split_node, Node *node, intp_t node_id, - ) nogil except -1 + ) except -1 nogil cdef float32_t _compute_feature( self, const float32_t[:, :] X_ndarray, diff --git a/sktree/tree/unsupervised/_unsup_splitter.pxd b/sktree/tree/unsupervised/_unsup_splitter.pxd index 6a172dc4e..48bfe9a54 100644 --- a/sktree/tree/unsupervised/_unsup_splitter.pxd +++ b/sktree/tree/unsupervised/_unsup_splitter.pxd @@ -41,7 +41,6 @@ cdef class UnsupervisedSplitter(BaseSplitter): self, float64_t impurity, # Impurity of the node SplitRecord* split, - 
intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil diff --git a/sktree/tree/unsupervised/_unsup_splitter.pyx b/sktree/tree/unsupervised/_unsup_splitter.pyx index ca5fd0349..dafb950e1 100644 --- a/sktree/tree/unsupervised/_unsup_splitter.pyx +++ b/sktree/tree/unsupervised/_unsup_splitter.pyx @@ -178,7 +178,6 @@ cdef class BestUnsupervisedSplitter(UnsupervisedSplitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil: @@ -225,7 +224,8 @@ cdef class BestUnsupervisedSplitter(UnsupervisedSplitter): cdef intp_t n_found_constants = 0 # Number of features known to be constant and drawn without replacement cdef intp_t n_drawn_constants = 0 - cdef intp_t n_known_constants = n_constant_features[0] + cdef intp_t n_known_constants = split.n_constant_features + # n_constant_features[0] # n_total_constants = n_known_constants + n_found_constants cdef intp_t n_total_constants = n_known_constants cdef intp_t partition_end @@ -381,5 +381,6 @@ cdef class BestUnsupervisedSplitter(UnsupervisedSplitter): # Return values split[0] = best_split - n_constant_features[0] = n_total_constants + split.n_constant_features = n_total_constants + # n_constant_features[0] = n_total_constants return 0 diff --git a/sktree/tree/unsupervised/_unsup_tree.pyx b/sktree/tree/unsupervised/_unsup_tree.pyx index 1236afcfc..46e9dc596 100644 --- a/sktree/tree/unsupervised/_unsup_tree.pyx +++ b/sktree/tree/unsupervised/_unsup_tree.pyx @@ -307,7 +307,8 @@ cdef class UnsupervisedBestFirstTreeBuilder(UnsupervisedTreeBuilder): cdef intp_t node_id cdef intp_t n_node_samples - cdef intp_t n_constant_features = 0 + # cdef intp_t n_constant_features = 0 + split_ptr.n_constant_features = 0 cdef float64_t min_impurity_decrease = self.min_impurity_decrease cdef float64_t weighted_n_node_samples cdef bint is_leaf @@ -326,7 +327,7 @@ cdef class 
UnsupervisedBestFirstTreeBuilder(UnsupervisedTreeBuilder): ) if not is_leaf: - splitter.node_split(impurity, split_ptr, &n_constant_features, 0., 0.) + splitter.node_split(impurity, split_ptr, 0., 0.) # assign local copy of SplitRecord to assign # pos, improvement, and impurity scores @@ -443,7 +444,6 @@ cdef class UnsupervisedDepthFirstTreeBuilder(UnsupervisedTreeBuilder): cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) cdef float64_t impurity = INFINITY - cdef intp_t n_constant_features cdef bint is_leaf cdef bint first = 1 cdef intp_t max_depth_seen = -1 @@ -473,7 +473,7 @@ cdef class UnsupervisedDepthFirstTreeBuilder(UnsupervisedTreeBuilder): parent = stack_record.parent is_left = stack_record.is_left impurity = stack_record.impurity - n_constant_features = stack_record.n_constant_features + split_ptr.n_constant_features = stack_record.n_constant_features n_node_samples = end - start splitter.node_reset(start, end, &weighted_n_node_samples) @@ -492,7 +492,7 @@ cdef class UnsupervisedDepthFirstTreeBuilder(UnsupervisedTreeBuilder): is_leaf = is_leaf or impurity <= EPSILON if not is_leaf: - splitter.node_split(impurity, split_ptr, &n_constant_features, 0., 0.) + splitter.node_split(impurity, split_ptr, 0., 0.) 
# assign local copy of SplitRecord to assign # pos, improvement, and impurity scores @@ -525,7 +525,7 @@ cdef class UnsupervisedDepthFirstTreeBuilder(UnsupervisedTreeBuilder): "parent": node_id, "is_left": 0, "impurity": split.impurity_right, - "n_constant_features": n_constant_features}) + "n_constant_features": split.n_constant_features}) # Push left child on stack builder_stack.push({ @@ -535,7 +535,7 @@ cdef class UnsupervisedDepthFirstTreeBuilder(UnsupervisedTreeBuilder): "parent": node_id, "is_left": 1, "impurity": split.impurity_left, - "n_constant_features": n_constant_features}) + "n_constant_features": split.n_constant_features}) if depth > max_depth_seen: max_depth_seen = depth diff --git a/sktree/tree/unsupervised/meson.build b/sktree/tree/unsupervised/meson.build index bf87d72df..0d9534e9d 100644 --- a/sktree/tree/unsupervised/meson.build +++ b/sktree/tree/unsupervised/meson.build @@ -1,22 +1,35 @@ -extensions = [ - '_unsup_criterion', - '_unsup_splitter', - '_unsup_tree', - '_unsup_oblique_splitter', - '_unsup_oblique_tree', -] +tree_extension_metadata = { + '_unsup_criterion': + {'sources': ['_unsup_criterion.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, + '_unsup_splitter': + {'sources': ['_unsup_splitter.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, + '_unsup_tree': + {'sources': ['_unsup_tree.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, + '_unsup_oblique_splitter': + {'sources': ['_unsup_oblique_splitter.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, + '_unsup_oblique_tree': + {'sources': ['_unsup_oblique_tree.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, +} -foreach ext: extensions - py3.extension_module(ext, - cython_gen_cpp.process(ext + '.pyx'), - c_args: cython_c_args, - include_directories: [incdir_numpy], - install: true, +foreach ext_name, ext_dict : tree_extension_metadata + py.extension_module( + 
ext_name, + ext_dict.get('sources'), + dependencies: [np_dep], + override_options : ext_dict.get('override_options', []), + c_args: c_args, + cython_args: cython_c_args, subdir: 'sktree/tree/unsupervised', + install: true, ) endforeach -py3.install_sources( +py.install_sources( subdir: 'sktree/tree/unsupervised' # Folder relative to site-packages to install to )