From 030a064b2ca92946f8aa3be2fa5c2a51022badf2 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 13 Nov 2023 12:40:42 -0500 Subject: [PATCH] [ENH] Add dataset generators (#169) * Add datasets generating functions --------- Signed-off-by: Adam Li --- CONTRIBUTING.md | 39 ++- DEVELOPING.md | 50 ++-- README.md | 14 +- doc/references.bib | 31 +++ doc/whats_new/v0.4.rst | 1 + pyproject.toml | 1 + sktree/_lib/sklearn_fork | 2 +- sktree/datasets/__init__.py | 2 + sktree/datasets/hyppo.py | 45 +++ sktree/datasets/meson.build | 13 + sktree/datasets/multiview.py | 356 ++++++++++++++++++++++++ sktree/datasets/tests/__init__.py | 0 sktree/datasets/tests/meson.build | 11 + sktree/datasets/tests/test_hyppo.py | 10 + sktree/datasets/tests/test_multiview.py | 123 ++++++++ sktree/meson.build | 1 + sktree/stats/tests/test_coleman.py | 2 +- 17 files changed, 661 insertions(+), 40 deletions(-) create mode 100644 sktree/datasets/__init__.py create mode 100644 sktree/datasets/hyppo.py create mode 100644 sktree/datasets/meson.build create mode 100644 sktree/datasets/multiview.py create mode 100644 sktree/datasets/tests/__init__.py create mode 100644 sktree/datasets/tests/meson.build create mode 100644 sktree/datasets/tests/test_hyppo.py create mode 100644 sktree/datasets/tests/test_multiview.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 247bc2854..a1c96032b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -3,13 +3,13 @@ Thanks for considering contributing! Please read this document to learn the various ways you can contribute to this project and how to go about doing it. **Submodule dependency on a fork of scikit-learn** -Due to the current state of scikit-learn's internal Cython code for trees, we have to instead leverage a maintained fork of scikit-learn at https://github.com/neurodata/scikit-learn, where specifically, the `submodulev2` branch is used to build and install this repo. We keep that fork well-maintained and up-to-date with respect to the main sklearn repo. The only difference is the refactoring of the `tree/` submodule. This fork is used internally under the namespace ``sktree._lib.sklearn``. It is necessary to use this fork for anything related to: +Due to the current state of scikit-learn's internal Cython code for trees, we have to instead leverage a maintained fork of scikit-learn at , where specifically, the `submodulev3` branch is used to build and install this repo. We keep that fork well-maintained and up-to-date with respect to the main sklearn repo. The only difference is the refactoring of the `tree/` submodule. This fork is used internally under the namespace ``sktree._lib.sklearn``. It is necessary to use this fork for anything related to: - `RandomForest*` - `ExtraTrees*` - or any importable items from the `tree/` submodule, whether it is a Cython or Python object -If you are developing for scikit-tree, we will always depend on the most up-to-date commit of `https://github.com/neurodata/scikit-learn/submodulev2` as a submodule within scikit-tee. This branch is consistently maintained for changes upstream that occur in the scikit-learn tree submodule. This ensures that our fork maintains consistency and robustness due to bug fixes and improvements upstream +If you are developing for scikit-tree, we will always depend on the most up-to-date commit of `https://github.com/neurodata/scikit-learn/submodulev3` as a submodule within scikit-tee. This branch is consistently maintained for changes upstream that occur in the scikit-learn tree submodule. 
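For a concrete sketch of how the fork is consumed (assuming the standard git submodule workflow; the fork lives under `sktree/_lib/sklearn_fork` in this repository), a fresh checkout initializes it with:

    git clone https://github.com/neurodata/scikit-tree.git
    cd scikit-tree
    git submodule update --init --recursive

The checked-out `submodulev3` branch is then built together with scikit-tree and kept in sync with upstream scikit-learn.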
This ensures that our fork maintains consistency and robustness due to bug fixes and improvements upstream ## Bug reports and feature requests @@ -27,16 +27,16 @@ code sample or an executable test case demonstrating the expected behavior. We use GitHub issues to track feature requests. Before you create an feature request: -* Make sure you have a clear idea of the enhancement you would like. If you have a vague idea, consider discussing +- Make sure you have a clear idea of the enhancement you would like. If you have a vague idea, consider discussing it first on a GitHub issue. -* Check the documentation to make sure your feature does not already exist. -* Do [a quick search](https://github.com/neurodata/scikit-tree/issues) to see whether your feature has already been suggested. +- Check the documentation to make sure your feature does not already exist. +- Do [a quick search](https://github.com/neurodata/scikit-tree/issues) to see whether your feature has already been suggested. When creating your request, please: -* Provide a clear title and description. -* Explain why the enhancement would be useful. It may be helpful to highlight the feature in other libraries. -* Include code examples to demonstrate how the enhancement would be used. +- Provide a clear title and description. +- Explain why the enhancement would be useful. It may be helpful to highlight the feature in other libraries. +- Include code examples to demonstrate how the enhancement would be used. ## Making a pull request @@ -52,7 +52,7 @@ When you're ready to contribute code to address an open issue, please follow the git clone https://github.com/USERNAME/scikit-tree.git - or + or git clone git@github.com:USERNAME/scikit-tree.git @@ -142,6 +142,7 @@ When you're ready to contribute code to address an open issue, please follow the ### Installing locally with Meson + Meson is a modern build system with a lot of nice features, which is why we use it for our build system to compile the Cython/C++ code. However, there are some intricacies that might be new to a pure Python developer. @@ -151,7 +152,7 @@ In general, the steps to build scikit-tree are: - build and install scikit-tree locally using `spin` Example would be: - + pip uninstall scikit-learn # install the fork of scikit-learn @@ -172,13 +173,13 @@ The most common errors come from the following: The CI files for github actions shows how to build and install for each OS. - ### Writing docstrings We use [Sphinx](https://www.sphinx-doc.org/en/master/index.html) to build our API docs, which automatically parses all docstrings of public classes and methods. All docstrings should adhere to the [Numpy styling convention](https://www.sphinx-doc.org/en/master/usage/extensions/example_numpy.html). ### Testing Changes Locally With Poetry + With poetry installed, we have included a few convenience functions to check your code. These checks must pass and will be checked by the PR's continuous integration services. You can install the various different developer dependencies with poetry: poetry install --with style, docs, test @@ -217,6 +218,22 @@ If you need to add new, or remove old dependencies, then you need to modify the To update the lock file. +## Developing a new Tree model + +Here, we define some high-level procedures for how to best approach implementing a new decision-tree model that is not supported yet in scikit-tree. + +1. First-pass on implementation: + + Implement a Cython splitter class and expose it in Python afterwards. 
Follow the framework for PatchObliqueSplitter and ObliqueSplitter and their respective decision-tree models: PatchObliqueDecisionTreeClassifier and ObliqueDecisionTreeClassifier. + +2. Second-pass on implementation: + + This involves extending relevant API beyond just the Splitter in Cython. This requires maintaining some degree of backwards-compatibility. Extend the existing API for Tree, TreeBuilder, Criterion, or ObliqueSplitter to enable whatever functionality you desire. + +3. Third-pass on implementation: + + This is the most complex implementation and should in theory be rarely used. This involves both designing a change in the scikit-learn fork submodule as well as relevant changes in scikit-tree itself. Extend the scikit-learn fork API. This requires maintaining some degree of backwards-compatability and testing the proposed changes wrt whatever changes you then make in scikit-tree. + --- The Project abides by the Organization's [code of conduct](https://github.com/py-why/governance/blob/main/CODE-OF-CONDUCT.md) and [trademark policy](https://github.com/py-why/governance/blob/main/TRADEMARKS.md). diff --git a/DEVELOPING.md b/DEVELOPING.md index 71f4caad6..10ae6e656 100644 --- a/DEVELOPING.md +++ b/DEVELOPING.md @@ -6,11 +6,11 @@ - [Development Tasks](#development-tasks) - [Basic Verification](#basic-verification) - [Docsite](#docsite) - - [Details](#details) - - [Coding Style](#coding-style) - - [Lint](#lint) - - [Type checking](#type-checking) - - [Unit tests](#unit-tests) + - [Details](#details) + - [Coding Style](#coding-style) + - [Lint](#lint) + - [Type checking](#type-checking) + - [Unit tests](#unit-tests) - [Advanced Updating submodules](#advanced-updating-submodules) - [Cython and C++](#cython-and-c) - [Making a Release](#making-a-release) @@ -19,16 +19,16 @@ # Requirements -* Python 3.9+ -* numpy>=1.25 -* scipy>=1.11 -* scikit-learn>=1.3.1 +- Python 3.9+ +- numpy>=1.25 +- scipy>=1.11 +- scikit-learn>=1.3.1 For the other requirements, inspect the ``pyproject.toml`` file. # Setting up your development environment -We recommend using miniconda, as python virtual environments may not setup properly compilers necessary for our compiled code. For detailed information on setting up and managing conda environments, see https://conda.io/docs/test-drive.html. +We recommend using miniconda, as python virtual environments may not setup properly compilers necessary for our compiled code. For detailed information on setting up and managing conda environments, see . @@ -38,7 +38,7 @@ We recommend using miniconda, as python virtual environments may not setup prope **Make sure you specify a Python version if your system defaults to anything less than Python 3.9.** **Any commands should ALWAYS be after you have activated your conda environment.** -Next, install necessary build dependencies. For more information, see https://scikit-learn.org/stable/developers/advanced_installation.html. +Next, install necessary build dependencies. For more information, see . conda install -c conda-forge joblib threadpoolctl pytest compilers llvm-openmp @@ -77,7 +77,7 @@ For other commands, see Note at this stage, you will be unable to run Python commands directly. For example, ``pytest ./sktree`` will not work. -However, after installing and building the project from source using meson, you can leverage editable installs to make testing code changes much faster. 
For more information on meson-python's progress supporting editable installs in a better fashion, see https://meson-python.readthedocs.io/en/latest/how-to-guides/editable-installs.html. +However, after installing and building the project from source using meson, you can leverage editable installs to make testing code changes much faster. For more information on meson-python's progress supporting editable installs in a better fashion, see . pip install --no-build-isolation --editable . @@ -88,6 +88,7 @@ However, after installing and building the project from source using meson, you the unit-tests should run. # Development Tasks + There are a series of top-level tasks available through Poetry. If you are updated the dependencies, please run `poetry update` to update the lock file. These can each be run via `poetry run poe ` @@ -99,16 +100,18 @@ To do so, first install poetry and poethepoet. Now, you are ready to run quick commands to format the codebase, lint the codebase and type-check the codebase. ### Basic Verification + * **format** - runs the suite of formatting tools applying tools to make code compliant -* **format_check** - runs the suite of formatting tools checking for compliance -* **lint** - runs the suite of linting tools -* **type_check** - performs static typechecking of the codebase using mypy -* **unit_test** - executes fast unit tests -* **verify** - executes the basic PR verification suite, which includes all the tasks listed above +- **format_check** - runs the suite of formatting tools checking for compliance +- **lint** - runs the suite of linting tools +- **type_check** - performs static typechecking of the codebase using mypy +- **unit_test** - executes fast unit tests +- **verify** - executes the basic PR verification suite, which includes all the tasks listed above ### Docsite + * **build_docs** - build the API documentation site -* **build_docs_noplot** - build the API documentation site without running explicitly any of the examples, for faster local checks of any documentation updates. +- **build_docs_noplot** - build the API documentation site without running explicitly any of the examples, for faster local checks of any documentation updates. ## Details @@ -144,8 +147,8 @@ In order for any code to be added to the repository, we require unit tests to pa # (Advanced) Updating submodules -Scikit-tree relies on a submodule of a forked-version of scikit-learn for certain Python and Cython code that extends the ``DecisionTree*`` models. Usually, if a developer is making changes, they should go over to the ``submodulev3`` branch on ``https://github.com/neurodata/scikit-learn`` and -submit a PR to make changes to the submodule. +Scikit-tree relies on a submodule of a forked-version of scikit-learn for certain Python and Cython code that extends the ``DecisionTree*`` models. Usually, if a developer is making changes, they should go over to the ``submodulev3`` branch on ``https://github.com/neurodata/scikit-learn`` and +submit a PR to make changes to the submodule. This should **ALWAYS** be supported by some use-case in scikit-tree. We want the minimal amount of code-change in our forked version of scikit-learn to make it very easy to merge in upstream changes, bug fixes and features for tree-based code. @@ -160,6 +163,7 @@ Now, you can re-build the project using the latest submodule changes. 
spin build --clean # Cython and C++ + The general design of scikit-tree follows that of the tree-models inside scikit-learn, where tree-based models are inherently Cythonized, or written with C++. Then the actual forest (e.g. RandomForest, or ExtraForest) is just a Python API wrapper that creates an ensemble of the trees. In order to develop new tree models, generally Cython and C++ code will need to be written in order to optimize the tree building process, otherwise fitting a single forest model would take very long. @@ -170,7 +174,7 @@ Scikit-tree is in-line with scikit-learn and thus relies on each new version rel 1. Download wheels from GH Actions and put all wheels into a ``dist/`` folder -https://github.com/neurodata/scikit-tree/actions/workflows/build_wheels.yml will have all the wheels for common OSes built for each Python version. + will have all the wheels for common OSes built for each Python version. 2. Upload wheels to test PyPi @@ -186,10 +190,10 @@ Verify that installations work as expected on your machine. twine upload dist/* ``` -or if you have two-factor authentication enabled: https://pypi.org/help/#apitoken +or if you have two-factor authentication enabled: twine upload dist/* --repository scikit-tree 4. Update version number on ``meson.build`` and ``pyproject.toml`` to the relevant version. -See https://github.com/neurodata/scikit-tree/pull/160 as an example. \ No newline at end of file +See https://github.com/neurodata/scikit-tree/pull/160 as an example. diff --git a/README.md b/README.md index ac815d9ec..158575227 100644 --- a/README.md +++ b/README.md @@ -17,14 +17,16 @@ Tree-models have withstood the test of time, and are consistently used for moder Documentation ============= -See here for the documentation for our dev version: https://docs.neurodata.io/scikit-tree/dev/index.html +See here for the documentation for our dev version: Why oblique trees and why trees beyond those in scikit-learn? ============================================================= + In 2001, Leo Breiman proposed two types of Random Forests. One was known as ``Forest-RI``, which is the axis-aligned traditional random forest. One was known as ``Forest-RC``, which is the random oblique linear combinations random forest. This leveraged random combinations of features to perform splits. [MORF](1) builds upon ``Forest-RC`` by proposing additional functions to combine features. Other modern tree variants such as Canonical Correlation Forests (CCF), Extended Isolation Forests, Quantile Forests, or unsupervised random forests are also important at solving real-world problems using robust decision tree models. Installation ============ + Our installation will try to follow scikit-learn installation as close as possible, as we contain Cython code subclassed, or inspired by the scikit-learn tree submodule. Dependencies @@ -37,18 +39,20 @@ We minimally require: * scipy * scikit-learn >= 1.3 -Installation with Pip (https://pypi.org/project/scikit-tree/) +Installation with Pip () ------------------------------------------------------------- + Installing with pip on a conda environment is the recommended route. 
pip install scikit-tree Building locally with Meson (For developers) -------------------------------------------- + Make sure you have the necessary packages installed # install build dependencies - pip install numpy scipy meson ninja meson-python Cython scikit-learn scikit-learn-tree + pip install -r build_requirements.txt # you may need these optional dependencies to build scikit-learn locally conda install -c conda-forge joblib threadpoolctl pytest compilers llvm-openmp @@ -102,11 +106,13 @@ After building locally, you can use editable installs (warning: this only regist Development =========== + We welcome contributions for modern tree-based algorithms. We use Cython to achieve fast C/C++ speeds, while abiding by a scikit-learn compatible (tested) API. Moreover, our Cython internals are easily extensible because they follow the internal Cython API of scikit-learn as well. -Due to the current state of scikit-learn's internal Cython code for trees, we have to instead leverage a fork of scikit-learn at https://github.com/neurodata/scikit-learn when +Due to the current state of scikit-learn's internal Cython code for trees, we have to instead leverage a fork of scikit-learn at when extending the decision tree model API of scikit-learn. Specifically, we extend the Python and Cython API of the tree submodule in scikit-learn in our submodule, so we can introduce the tree models housed in this package. Thus these extend the functionality of decision-tree based models in a way that is not possible yet in scikit-learn itself. As one example, we introduce an abstract API to allow users to implement their own oblique splits. Our plan in the future is to benchmark these functionalities and introduce them upstream to scikit-learn where applicable and inclusion criterion are met. References ========== + [1]: [`Li, Adam, et al. "Manifold Oblique Random Forests: Towards Closing the Gap on Convolutional Deep Networks" SIAM Journal on Mathematics of Data Science, 5(1), 77-96, 2023`](https://doi.org/10.1137/21M1449117) diff --git a/doc/references.bib b/doc/references.bib index 8671a338b..0653d7a39 100644 --- a/doc/references.bib +++ b/doc/references.bib @@ -11,6 +11,21 @@ @article{breiman2001random publisher = {Springer} } +@article{choi2017selectingpca, + issn = {00905364}, + url = {http://www.jstor.org/stable/26362952}, + abstract = {Principal component analysis (PCA) is a well-known tool in multivariate statistics. One significant challenge in using PCA is the choice of the number of principal components. In order to address this challenge, we propose distribution-based methods with exact type 1 error controls for hypothesis testing and construction of confidence intervals for signals in a noisy matrix with finite samples. Assuming Gaussian noise, we derive exact type 1 error controls based on the conditional distribution of the singular values of a Gaussian matrix by utilizing a post-selection inference framework, and extending the approach of [Taylor, Loftus and Tibshirani (2013)] in a PCA setting. 
In simulation studies, we find that our proposed methods compare well to existing approaches.}, + author = {Yunjin Choi and Jonathan Taylor and Robert Tibshirani}, + journal = {The Annals of Statistics}, + number = {6}, + pages = {2590--2617}, + publisher = {Institute of Mathematical Statistics}, + title = {SELECTING THE NUMBER OF PRINCIPAL COMPONENTS: ESTIMATION OF THE TRUE RANK OF A NOISY MATRIX}, + urldate = {2023-10-26}, + volume = {45}, + year = {2017} +} + @article{coleman2022scalable, title = {Scalable and efficient hypothesis testing with random forests}, author = {Coleman, Tim and Peng, Wei and Mentch, Lucas}, @@ -122,4 +137,20 @@ @inproceedings{terzi2006efficient pages = {316--327}, year = {2006}, organization = {SIAM} +} + +@misc{perry2009crossvalidation, + title = {Cross-Validation for Unsupervised Learning}, + author = {Patrick O. Perry}, + year = {2009}, + eprint = {0909.3052}, + archiveprefix = {arXiv}, + primaryclass = {stat.ME} +} + +@article{panda2018learning, + title = {Learning Interpretable Characteristic Kernels via Decision Forests}, + author = {Panda, Sambit and Shen, Cencheng and Vogelstein, Joshua T}, + journal = {arXiv preprint arXiv:1812.00029}, + year = {2018} } \ No newline at end of file diff --git a/doc/whats_new/v0.4.rst b/doc/whats_new/v0.4.rst index 1a4de76c7..239a1d824 100644 --- a/doc/whats_new/v0.4.rst +++ b/doc/whats_new/v0.4.rst @@ -14,6 +14,7 @@ Changelog --------- - |API| ``FeatureImportanceForest*`` now has a hyperparameter to control the number of permutations is done per forest ``permute_per_forest_fraction``, by `Adam Li`_ (:pr:`145`) +- |Enhancement| Add dataset generators for regression and classification and hypothesis testing, by `Adam Li`_ (:pr:`169`) Code and Documentation Contributors ----------------------------------- diff --git a/pyproject.toml b/pyproject.toml index d7d68a751..e03c45e1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -209,6 +209,7 @@ extend-exclude = ''' | sktree/_lib | .asv | env + | build-install ) ''' diff --git a/sktree/_lib/sklearn_fork b/sktree/_lib/sklearn_fork index 1adb20907..3d377ad78 160000 --- a/sktree/_lib/sklearn_fork +++ b/sktree/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit 1adb209077f12adac8f760196ae5260abab0cbdd +Subproject commit 3d377ad78db936b17ee24e59d29222f259d92412 diff --git a/sktree/datasets/__init__.py b/sktree/datasets/__init__.py new file mode 100644 index 000000000..74c770d6a --- /dev/null +++ b/sktree/datasets/__init__.py @@ -0,0 +1,2 @@ +from .hyppo import make_quadratic_classification +from .multiview import make_gaussian_mixture, make_joint_factor_model diff --git a/sktree/datasets/hyppo.py b/sktree/datasets/hyppo.py new file mode 100644 index 000000000..31b4ca8d1 --- /dev/null +++ b/sktree/datasets/hyppo.py @@ -0,0 +1,45 @@ +import numpy as np + + +def make_quadratic_classification(n_samples: int, n_features: int, noise=False, seed=None): + """Simulate classification data from a quadratic model. + + This is a form of the simulation used in :footcite:`panda2018learning`. + + Parameters + ---------- + n_samples : int + The number of samples to generate. + n_features : int + The number of dimensions in the dataset. + noise : bool, optional + Whether or not to add noise, by default False. + seed : int, optional + Random seed, by default None. + + Returns + ------- + x : array-like, shape (2 * n_samples, n_features) + Data array. + v : array-like, shape (2* n_samples,) + Target array of 1's and 0's. + + References + ---------- + .. 
footbibliography:: + """ + rng = np.random.default_rng(seed) + + x = rng.standard_normal(size=(n_samples, n_features)) + coeffs = np.array([np.exp(-0.0325 * (i + 24)) for i in range(n_features)]) + eps = rng.standard_normal(size=(n_samples, n_features)) + + x_coeffs = x * coeffs + y = x_coeffs**2 + noise * eps + + # generate the classification labels + n1 = x.shape[0] + n2 = y.shape[0] + v = np.vstack([np.zeros((n1, 1)), np.ones((n2, 1))]) + x = np.vstack((x, y)) + return x, v diff --git a/sktree/datasets/meson.build b/sktree/datasets/meson.build new file mode 100644 index 000000000..ccdd18521 --- /dev/null +++ b/sktree/datasets/meson.build @@ -0,0 +1,13 @@ +python_sources = [ + '__init__.py', + 'multiview.py', + 'hyppo.py', +] + +py3.install_sources( + python_sources, + pure: false, + subdir: 'sktree/datasets' +) + +subdir('tests') \ No newline at end of file diff --git a/sktree/datasets/multiview.py b/sktree/datasets/multiview.py new file mode 100644 index 000000000..bb92049ec --- /dev/null +++ b/sktree/datasets/multiview.py @@ -0,0 +1,356 @@ +# Original source: https://github.com/mvlearn/mvlearn +# License: MIT + +import numpy as np +from scipy.stats import ortho_group +from sklearn.utils import check_random_state + + +def make_gaussian_mixture( + centers, + covariances, + n_samples=100, + transform="linear", + noise=None, + noise_dims=None, + class_probs=None, + random_state=None, + shuffle=False, + return_latents=False, + add_latent_noise=False, +): + r"""Two-view Gaussian mixture model dataset generator. + + This creates a two-view dataset from a Gaussian mixture model and + a (possibly nonlinear) transformation. + + Parameters + ---------- + centers : 1D array-like or list of 1D array-likes + The mean(s) of the Gaussian(s) from which the latent + points are sampled. If is a list of 1D array-likes, each is the + mean of a distinct Gaussian, sampled from with + probability given by `class_probs`. Otherwise is the mean of a + single Gaussian from which all are sampled. + covariances : 2D array-like or list of 2D array-likes + The covariance matrix(s) of the Gaussian(s), matched + to the specified centers. + n_samples : int + The number of points in each view, divided across Gaussians per + `class_probs`. + transform : 'linear' | 'sin' | poly' | callable, (default 'linear') + Transformation to perform on the latent variable. If a function, + applies it to the latent. Otherwise uses an implemented function. + noise : float or None (default=None) + Variance of mean zero Gaussian noise added to the first view. + noise_dims : int or None (default=None) + Number of additional dimensions of standard normal noise to add. + class_probs : array-like, default=None + A list of probabilities specifying the probability of a latent + point being sampled from each of the Gaussians. Must sum to 1. If + None, then is taken to be uniform over the Gaussians. + random_state : int, default=None + If set, can be used to reproduce the data generated. + shuffle : bool, default=False + If ``True``, data is shuffled so the labels are not ordered. + return_latents : bool (default False) + If true, returns the non-noisy latent variables. + add_latent_noise : bool (default False) + If true, adds noise to the latent variables before applying the + transformation. + + Returns + ------- + Xs : list of np.ndarray, of shape (n_samples, n_features) + The latent data and its noisy transformation. + + y : np.ndarray, shape (n_samples,) + The integer labels for each sample's Gaussian membership. 
+ + latents : np.ndarray, shape (n_samples, n_features) + The non-noisy latent variables. Only returned if + ``return_latents=True``. + + Notes + ----- + For each class :math:`i` with prior probability :math:`p_i`, + center and covariance matrix :math:`\mu_i` and :math:`\Sigma_i`, and + :math:`n` total samples, the latent data is sampled such that: + + .. math:: + (X_1, y_1), \dots, (X_{np_i}, Y_{np_i}) \overset{i.i.d.}{\sim} + \mathcal{N}(\mu_i, \Sigma_i) + + Two views of data are returned, the first being the latent samples and + the second being a specified transformation of the latent samples. + Additional noise may be added to the first view or added as noise + dimensions to both views. + + Examples + -------- + >>> from sktree.datasets.multiview import make_gaussian_mixture + >>> import numpy as np + >>> n_samples = 10 + >>> centers = [[0,1], [0,-1]] + >>> covariances = [np.eye(2), np.eye(2)] + >>> Xs, y = make_gaussian_mixture(n_samples, centers, covariances, + ... shuffle=True, shuffle_random_state=42) + >>> print(y) + [1. 0. 1. 0. 1. 0. 1. 0. 0. 1.] + """ + centers = np.asarray(centers) + covariances = np.asarray(covariances) + + if centers.ndim == 1: + centers = centers[np.newaxis, :] + if covariances.ndim == 2: + covariances = covariances[np.newaxis, :] + if not centers.ndim == 2: + msg = "centers is of the incorrect shape" + raise ValueError(msg) + if not covariances.ndim == 3: + msg = "covariance matrix is of the incorrect shape" + raise ValueError(msg) + if centers.shape[0] != covariances.shape[0]: + msg = "The first dimensions of 2D centers and 3D covariances" + " must be equal" + raise ValueError(msg) + if class_probs is None: + class_probs = np.ones(centers.shape[0]) + class_probs /= centers.shape[0] + elif sum(class_probs) != 1.0: + msg = "elements of `class_probs` must sum to 1" + raise ValueError(msg) + if len(centers) != len(class_probs) or len(covariances) != len(class_probs): + msg = "centers, covariances, and class_probs must be of equal length" + raise ValueError(msg) + + rng = np.random.default_rng(seed=random_state) + latent = np.concatenate( + [ + rng.multivariate_normal( + centers[i], + covariances[i], + size=int(class_probs[i] * n_samples), + ) + for i in range(len(class_probs)) + ] + ) + y = np.concatenate( + [i * np.ones(int(class_probs[i] * n_samples)) for i in range(len(class_probs))] + ) + + if add_latent_noise: + latent += rng.standard_normal(size=latent.shape) * 0.1 + + # shuffle latent samples and labels + if shuffle: + indices = np.arange(latent.shape[0]).squeeze() + rng.shuffle(indices) + latent = latent[indices, :] + y = y[indices] + + if callable(transform): + X = np.asarray([transform(x) for x in latent]) + elif not isinstance(transform, str): + raise TypeError( + "'transform' must be of type string or a callable function," + f"not {type(transform)}" + ) + elif transform == "linear": + X = _linear2view(latent, rng) + elif transform == "poly": + X = _poly2view(latent) + elif transform == "sin": + X = _sin2view(latent) + else: + raise ValueError( + "Transform type must be one of {'linear', 'poly', 'sin'}" + + f" or a callable function. 
Not {transform}" + ) + + if noise is not None: + Xs = [latent + np.sqrt(noise) * rng.standard_normal(size=latent.shape), X] + else: + Xs = [latent, X] + + # if random_state is not None, make sure both views are independent + # but reproducible + if noise_dims is not None: + Xs = [_add_noise(X, noise_dims, rng) for X in Xs] + + if return_latents: + return Xs, y, latent + else: + return Xs, y + + +def _add_noise(X, n_noise, rng): + """Appends dimensions of standard normal noise to X""" + noise_vars = rng.standard_normal(size=(X.shape[0], n_noise)) + return np.hstack((X, noise_vars)) + + +def _linear2view(X, rng): + """Rotates the data, a linear transformation""" + if X.shape[1] == 1: + X = -X + else: + X = X @ ortho_group.rvs(X.shape[1], random_state=rng) + return X + + +def _poly2view(X): + """Applies a degree 2 polynomial transform to the data""" + X = np.asarray([np.power(x, 2) for x in X]) + return X + + +def _sin2view(X): + """Applies a sinusoidal transformation to the data""" + X = np.asarray([np.sin(x) for x in X]) + return X + + +def _rand_orthog(n, K, random_state=None): + """ + Samples a random orthonormal matrix. + + Parameters + ---------- + n : int, positive + Number of rows in the matrix + + K : int, positive + Number of columns in the matrix + + random_state : None | int | instance of RandomState, optional + Seed to set randomization for reproducible results + + Returns + ------- + A: array-like, (n, K) + A random, column orthonormal matrix. + + Notes + ----- + See Section A.1.1 of :footcite:`perry2009crossvalidation`. + + References + ---------- + .. footbibliography:: + """ + rng = check_random_state(random_state) + + Z = rng.normal(size=(n, K)) + Q, R = np.linalg.qr(Z) + + s = np.ones(K) + neg_mask = rng.uniform(size=K) > 0.5 + s[neg_mask] = -1 + + return Q * s + + +def make_joint_factor_model( + n_views, + n_features, + n_samples=100, + joint_rank=1, + noise_std=1, + m=1.5, + random_state=None, + return_decomp=False, +): + """Joint factor model data generator. + + Samples from a low rank, joint factor model where there is one set of + shared scores. + + Parameters + ----------- + n_views : int + Number of views to sample. This corresponds to ``B`` in the notes. + + n_features : int, or list of int + Number of features in each view. A list specifies a different number + of features for each view. + + n_samples : int + Number of samples in each view + + joint_rank : int (default 1) + Rank of the common signal across views. + + noise_std : float (default 1) + Scale of noise distribution. + + m : float (default 1.5) + Signal strength. + + random_state : int or RandomState instance, optional (default=None) + Controls random orthonormal matrix sampling and random noise + generation. Set for reproducible results. + + return_decomp : bool, default=False + If ``True``, returns the ``view_loadings`` as well. + + Returns + ------- + Xs : list of array-likes + List of samples data matrices with the following attributes. + + - Xs length: n_views + - Xs[i] shape: (n_samples, n_features_i). + + U: (n_samples, joint_rank) + The true orthonormal joint scores matrix. Returned if + ``return_decomp`` is True. + + view_loadings: list of numpy.ndarray + The true view loadings matrices. Returned if + ``return_decomp`` is True. + + Notes + ----- + The data is generated as follows, where: + + - :math:`b` are the different views + - :math:`U` is is a (n_samples, joint_rank) matrix of rotation matrices. + - ``svals`` are the singular values sampled. 
+ - :math:`W_b` are (n_features_b, joint_rank) view loadings matrices, which are + orthonormal matrices to linearly transform the data, while preserving inner + products (i.e. a unitary transformation). + + For b = 1, .., B + X_b = U @ diag(svals) @ W_b^T + noise_std * E_b + + where U and each W_b are orthonormal matrices. The singular values are + linearly increasing following :footcite:`choi2017selectingpca` section 2.2.3. + + References + ---------- + .. footbibliography:: + """ + rng = check_random_state(random_state) + if isinstance(n_features, int): + n_features = [n_features] * n_views + + # generate W_b orthonormal matrices + view_loadings = [_rand_orthog(d, joint_rank, random_state=rng) for d in n_features] + + # sample monotonically increasing singular values + # the signal increases linearly and ``m`` determines the strength of the signal + svals = np.arange(1, 1 + joint_rank).astype(float) + svals *= m * noise_std * (n_samples * max(n_features)) ** (1 / 4) + + # rotation operators that are generated via standard random normal + U = rng.standard_normal(size=(n_samples, joint_rank)) + U = np.linalg.qr(U)[0] + + # random noise for each view + Es = [noise_std * rng.standard_normal(size=(n_samples, d)) for d in n_features] + Xs = [(U * svals) @ view_loadings[b].T + Es[b] for b in range(n_views)] + + if return_decomp: + return Xs, U, view_loadings + else: + return Xs diff --git a/sktree/datasets/tests/__init__.py b/sktree/datasets/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/sktree/datasets/tests/meson.build b/sktree/datasets/tests/meson.build new file mode 100644 index 000000000..79b1833da --- /dev/null +++ b/sktree/datasets/tests/meson.build @@ -0,0 +1,11 @@ +python_sources = [ + '__init__.py', + 'test_hyppo.py', + 'test_multiview.py', +] + +py3.install_sources( + python_sources, + pure: false, + subdir: 'sktree/datasets/tests' +) diff --git a/sktree/datasets/tests/test_hyppo.py b/sktree/datasets/tests/test_hyppo.py new file mode 100644 index 000000000..8c2cf7656 --- /dev/null +++ b/sktree/datasets/tests/test_hyppo.py @@ -0,0 +1,10 @@ +from sktree.datasets import make_quadratic_classification + + +def test_make_quadratic_classification_v(): + n_samples = 100 + n_features = 5 + x, v = make_quadratic_classification(n_samples, n_features) + assert all(val in [0, 1] for val in v) + assert x.shape == (n_samples * 2, n_features) + assert len(x) == len(v) diff --git a/sktree/datasets/tests/test_multiview.py b/sktree/datasets/tests/test_multiview.py new file mode 100644 index 000000000..43d62fa93 --- /dev/null +++ b/sktree/datasets/tests/test_multiview.py @@ -0,0 +1,123 @@ +import numpy as np +import pytest + +from sktree.datasets.multiview import make_gaussian_mixture, make_joint_factor_model + + +def test_make_gaussian_mixture_errors(): + with pytest.raises(ValueError, match="centers is of the incorrect shape"): + # Test centers dimension error + make_gaussian_mixture(np.zeros((2, 2, 2)), np.eye(2)) + + with pytest.raises(ValueError, match="covariance matrix is of the incorrect shape"): + # Test covariances dimension error + make_gaussian_mixture([0, 1], np.zeros((2, 2, 2, 2)), random_state=0) + + with pytest.raises(ValueError, match="The first dimensions of 2D centers and 3D covariances"): + # Test centers and covariances shape mismatch + make_gaussian_mixture(np.zeros((3, 2)), np.zeros((2, 2, 2)), random_state=0) + + with pytest.raises(ValueError, match="elements of `class_probs` must sum to 1"): + # Test class_probs not summing to 1 + 
make_gaussian_mixture([[0, 1], [2, 3]], [np.eye(2), np.eye(2)], class_probs=[0.5, 0.6]) + + with pytest.raises( + ValueError, match="centers, covariances, and class_probs must be of equal length" + ): + make_gaussian_mixture([[0, 1], [2, 3]], [np.eye(2), np.eye(2)], class_probs=[1.0]) + + # Test invalid transform type + with pytest.raises(ValueError, match="Transform type must be one of {'linear', 'poly', 'sin'}"): + make_gaussian_mixture([[0, 1], [2, 3]], [np.eye(2), np.eye(2)], transform="invalid") + + # Test invalid transform type (callable) + with pytest.raises( + TypeError, match="'transform' must be of type string or a callable function" + ): + make_gaussian_mixture([[0, 1], [2, 3]], [np.eye(2), np.eye(2)], transform=123) + + +def test_make_gaussian_mixture(): + # Test basic functionality + Xs, y = make_gaussian_mixture([[0, 1], [2, 3]], [np.eye(2), np.eye(2)], random_state=0) + assert len(Xs) == 2 + assert Xs[0].shape == (100, 2) + assert Xs[1].shape == (100, 2) + assert y.shape == (100,) + + # Test with noise + Xs, y = make_gaussian_mixture( + [[0, 1], [2, 3]], + [np.eye(2), np.eye(2)], + noise=0.1, + noise_dims=2, + random_state=0, + ) + assert len(Xs) == 2 + assert Xs[0].shape == (100, 4) # 2 original dimensions + 2 noise dimensions + assert Xs[1].shape == (100, 4) # 2 original dimensions + 2 noise dimensions + assert y.shape == (100,) + + +def custom_transform(x): + return x + 2 + + +@pytest.mark.parametrize("transform", ["linear", "poly", "sin", custom_transform]) +def test_make_gaussian_mixture_with_transform(transform): + # Test with any transformation + Xs, y = make_gaussian_mixture( + [[0, 1], [2, 3]], + [np.eye(2), np.eye(2)], + transform=transform, + random_state=0, + ) + assert len(Xs) == 2 + assert Xs[0].shape == (100, 2) + assert Xs[1].shape == (100, 2) + assert y.shape[0] == 100 + old_sum = np.sum(y) + + # Test with any transformation + Xs, y = make_gaussian_mixture( + [[0, 1], [2, 3]], [np.eye(2), np.eye(2)], transform=transform, random_state=0, shuffle=True + ) + assert len(Xs) == 2 + assert Xs[0].shape == (100, 2) + assert Xs[1].shape == (100, 2) + assert y.shape[0] == 100 + assert np.sum(y) == old_sum + + +def test_make_joint_factor_model(): + Xs = make_joint_factor_model(1, 3, n_samples=100, random_state=0) + assert len(Xs) == 1 + assert Xs[0].shape == (100, 3) + + Xs, U, view_loadings = make_joint_factor_model( + 2, 3, n_samples=100, random_state=0, return_decomp=True + ) + assert len(Xs) == 2 + assert Xs[0].shape == (100, 3) + assert Xs[1].shape == (100, 3) + assert U.shape == (100, 1) + assert len(view_loadings) == 2 + assert view_loadings[0].shape == (3, 1) + assert view_loadings[1].shape == (3, 1) + + Xs, U, view_loadings = make_joint_factor_model( + 3, + [2, 4, 3], + n_samples=100, + random_state=0, + return_decomp=True, + ) + assert len(Xs) == 3, f"Expected 3 views, got {len(Xs)}" + assert Xs[0].shape == (100, 2) + assert Xs[1].shape == (100, 4) + assert Xs[2].shape == (100, 3) + assert U.shape == (100, 1) + assert len(view_loadings) == 3 + assert view_loadings[0].shape == (2, 1) + assert view_loadings[1].shape == (4, 1) + assert view_loadings[2].shape == (3, 1) diff --git a/sktree/meson.build b/sktree/meson.build index 8608052b8..aa46fe60f 100644 --- a/sktree/meson.build +++ b/sktree/meson.build @@ -86,3 +86,4 @@ subdir('experimental') subdir('stats') subdir('tests') subdir('tree') +subdir('datasets') diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py index 69efbe378..aa918191c 100644 --- 
a/sktree/stats/tests/test_coleman.py +++ b/sktree/stats/tests/test_coleman.py @@ -204,7 +204,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size) n_estimators=100, n_jobs=-1, ), - "permute_forest_fraction": 0.3, + "permute_forest_fraction": 0.5, "sample_dataset_per_tree": False, }, 600, # n_samples
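As a minimal usage sketch of the generators added in this patch (keyword arguments and output shapes follow the function signatures and unit tests above; concrete values depend on the seed):

```python
import numpy as np

from sktree.datasets import (
    make_gaussian_mixture,
    make_joint_factor_model,
    make_quadratic_classification,
)

# Quadratic classification data: 2 * n_samples rows with binary labels.
X, y = make_quadratic_classification(n_samples=256, n_features=5, noise=True, seed=0)
print(X.shape, np.unique(y))  # (512, 5) [0. 1.]

# Two-view Gaussian mixture: the latent view plus a sinusoidally transformed view.
Xs, labels = make_gaussian_mixture(
    centers=[[0, 1], [0, -1]],
    covariances=[np.eye(2), np.eye(2)],
    n_samples=100,
    transform="sin",
    random_state=0,
)
print(len(Xs), Xs[0].shape, labels.shape)  # 2 (100, 2) (100,)

# Joint factor model: three views sharing one set of latent scores.
views = make_joint_factor_model(n_views=3, n_features=[2, 4, 3], n_samples=100, random_state=0)
print([V.shape for V in views])  # [(100, 2), (100, 4), (100, 3)]
```

Passing ``return_latents=True`` to ``make_gaussian_mixture`` or ``return_decomp=True`` to ``make_joint_factor_model`` additionally returns the latent variables and the true scores/loadings, as exercised in ``test_multiview.py``.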