diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 4660fb4c2d..59c43bd151 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -1,31 +1,24 @@
-name: Check documentation
+name: Documentation
 
 on:
   push:
     branches: [main]
+    tags: ["*"]
   pull_request:
     # Check all PR
 
 jobs:
   build:
     runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.10"]
-
     steps:
-      - uses: actions/checkout@v2
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
+      - uses: actions/checkout@v3
+      - name: setup Python
+        uses: actions/setup-python@v4
         with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
+          python-version: "3.10"
+      - name: install dependencies
         run: |
-          python -m pip install --upgrade pip
-          pip install -r docs/requirements.txt
-          pip install .
+          python -m pip install tox
           sudo apt install pandoc
-      - name: Build the documentation
-        run: |
-          cd docs
-          make html
+      - name: build documentation
+        run: tox -e docs
diff --git a/.github/workflows/documentation-links.yml b/.github/workflows/documentation-links.yml
new file mode 100644
index 0000000000..e2c8b39107
--- /dev/null
+++ b/.github/workflows/documentation-links.yml
@@ -0,0 +1,17 @@
+name: readthedocs/actions
+
+on:
+  pull_request_target:
+    types:
+      - opened
+
+permissions:
+  pull-requests: write
+
+jobs:
+  documentation-links:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: readthedocs/actions/preview@v1
+        with:
+          project-slug: scikit-matter
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 15d2cc0acb..fd6f6bbcd3 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -1,36 +1,20 @@
 name: Lint
 
 on:
-  push:
-    branches: [main]
   pull_request:
-    # Check all PR
+    branches: [main]
 
 jobs:
-  build:
+  lint:
     runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.10"]
     steps:
-      - uses: actions/checkout@v2
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install flake8 black isort
-      - name: Lint with flake8
-        run: |
-          # ignore formatting, it will be checked by black
-          export FORMATTING_RULES="E101,E111,E114,E115,E116,E117,E12,E13,E2,E3,E401,E5,E70,W1,W2,W3,W5"
-          flake8 --ignore=$FORMATTING_RULES .
-      - name: Lint with black
-        run: |
-          black --check .
-      - name: Check imports
-        run: |
-          isort skmatter/*/*py -m 3 --tc --fgw --up -e -l 88 --check
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+      - run: pip install tox
+
+      - name: Test Lint
+        run: tox -e lint
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 0e6fa07e41..5d68550cb3 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -1,31 +1,35 @@
-name: Test
+name: Tests
 
 on:
   push:
     branches: [main]
   pull_request:
-    # Check all PR
+    branches: [main]
 
 jobs:
-  build:
-    runs-on: ubuntu-latest
+  tests:
+    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
-        python-version: ["3.8", "3.9", "3.10"]
+        os: ['ubuntu-latest']
+        python-version: ['3.8', '3.9', '3.10', '3.11']
+        include:
+          - os: macos-latest
+            python-version: '3.10'
+          - os: windows-latest
+            python-version: '3.10'
     steps:
-      - uses: actions/checkout@v2
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install tox
-      - name: Run tests
-        run: |
-          tox -e tests,examples
-      - uses: codecov/codecov-action@v1
-        with:
-          file: ./tests/coverage.xml
+      - uses: actions/checkout@v3
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - run: pip install tox
+
+      - name: run Python tests
+        run: tox -e tests,examples
+      - uses: codecov/codecov-action@v3
+        with:
+          files: ./tests/coverage.xml
diff --git a/.gitignore b/.gitignore
index d3882bf2ce..55534fbad9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,4 @@ __pycache__
 .tox/
 build/
 dist/
-docs/source/read-only-examples
+docs/src/read-only-examples
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index a369501d1b..cc343aceb3 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -5,9 +5,15 @@
 # Required
 version: 2
 
+# Set the version of Python and other tools we need
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.10"
+
 # Build documentation in the docs/ directory with Sphinx
 sphinx:
-  configuration: docs/source/conf.py
+  configuration: docs/src/conf.py
 
 # Optionally build your docs in additional formats such as PDF
 formats:
@@ -15,7 +21,6 @@ formats:
 
 # Optionally set the version of Python and requirements required to build your docs
 python:
-  version: 3.8
   install:
     - requirements: docs/requirements.txt
     - method: pip
diff --git a/MANIFEST.in b/MANIFEST.in
index 3e72a0c41a..a6fbf3c850 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,2 +1,4 @@
-recursive-include skmatter/datasets/data/ *
-recursive-include skmatter/datasets/descr/ *
+recursive-include src/skmatter/datasets/data/ *
+recursive-include src/skmatter/datasets/descr/ *
+
+prune tests
diff --git a/contributors.txt b/contributors.txt
index 412e7387dc..95ee0bef7d 100644
--- a/contributors.txt
+++ b/contributors.txt
@@ -3,4 +3,5 @@ Guillaume Fraux
 Sergei Kliavinek
 Alexander Goscinski
 Benjamin A. Helfrecht
+Victor P. Principe
 Michele Ceriotti
diff --git a/docs/Makefile b/docs/Makefile
deleted file mode 100644
index d0c3cbf102..0000000000
--- a/docs/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line, and also
-# from the environment for the first two.
-SPHINXOPTS    ?=
-SPHINXBUILD   ?= sphinx-build
-SOURCEDIR     = source
-BUILDDIR      = build
-
-# Put it first so that "make" without argument is like "make help".
-help:
-	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/make.bat b/docs/make.bat
deleted file mode 100644
index 6247f7e231..0000000000
--- a/docs/make.bat
+++ /dev/null
@@ -1,35 +0,0 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
-	set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=source
-set BUILDDIR=build
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
-	echo.
-	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
-	echo.installed, then set the SPHINXBUILD environment variable to point
-	echo.to the full path of the 'sphinx-build' executable. Alternatively you
-	echo.may add the Sphinx directory to PATH.
-	echo.
-	echo.If you don't have Sphinx installed, grab it from
-	echo.http://sphinx-doc.org/
-	exit /b 1
-)
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd
diff --git a/docs/requirements.txt b/docs/requirements.txt
index b80e9a5af1..19f9efcc8c 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,12 +1,12 @@
 ipykernel
+jinja2 < 3.1
 matplotlib
-nbsphinx==0.8.12
 nbconvert
+nbsphinx==0.8.12
 numpy
+pandas
 scikit-learn >=0.24.0
 sphinx >=3.3
 sphinx_rtd_theme
 tqdm
 traitlets>=5.0
-jinja2 < 3.1
-pandas
diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst
deleted file mode 100644
index 7cd09f947f..0000000000
--- a/docs/source/datasets.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-Datasets
-========
-
-.. include:: ../../skmatter/datasets/descr/csd-1000r.rst
-
-.. include:: ../../skmatter/datasets/descr/degenerate_CH4_manifold.rst
-
-.. include:: ../../skmatter/datasets/descr/nice_dataset.rst
-
-.. include:: ../../skmatter/datasets/descr/who_dataset.rst
-
diff --git a/docs/source/VoronoiFPS-Schematic.pdf b/docs/src/VoronoiFPS-Schematic.pdf
similarity index 100%
rename from docs/source/VoronoiFPS-Schematic.pdf
rename to docs/src/VoronoiFPS-Schematic.pdf
diff --git a/docs/source/bibliography.rst b/docs/src/bibliography.rst
similarity index 100%
rename from docs/source/bibliography.rst
rename to docs/src/bibliography.rst
diff --git a/docs/source/conf.py b/docs/src/conf.py
similarity index 96%
rename from docs/source/conf.py
rename to docs/src/conf.py
index bf30c25e3b..0bdd3767a0 100644
--- a/docs/source/conf.py
+++ b/docs/src/conf.py
@@ -15,13 +15,14 @@
 import sphinx_rtd_theme
 from nbconvert import NotebookExporter
 from traitlets.config import Config
+from datetime import datetime
 
 ROOT = os.path.abspath(os.path.join("..", ".."))
 sys.path.insert(0, ROOT)
 
 # Copying and Compiling of Examples
-if not os.path.exists(os.path.join(ROOT, "docs/source/read-only-examples")):
-    os.mkdir(os.path.join(ROOT, "docs/source/read-only-examples"))
+if not os.path.exists(os.path.join(ROOT, "docs/src/read-only-examples")):
+    os.mkdir(os.path.join(ROOT, "docs/src/read-only-examples"))
 
 # Set up nbconvert configuration to strip empty cells and tables of contents
 c = Config()
@@ -45,7 +46,7 @@
     # Skip any non-notebooks
     if nb.endswith("ipynb") and "no-doc" not in nb:
         nb_in = os.path.join(ROOT, "examples", nb)
-        nb_out = os.path.join(ROOT, "docs/source/read-only-examples", nb)
+        nb_out = os.path.join(ROOT, "docs/src/read-only-examples", nb)
 
         # Skip any notebooks which already exist
         if not os.path.exists(nb_out):
@@ -53,7 +54,7 @@
             converted = exporter.from_filename(nb_in)[0]
             out_stream.write(converted)
 
-import skmatter  # noqa
+import skmatter  # NoQa
 
 
 # -- Project information -----------------------------------------------------
@@ -62,7 +63,7 @@
 
 project = "scikit-matter"
 author = ", ".join(open(os.path.join(ROOT, "contributors.txt")))
-copyright = "2020, " + author
+copyright = f"{datetime.now().date().year}, {author}"
 
 # The full version, including alpha/beta/rc tags
 release = skmatter.__version__
diff --git a/docs/source/contributing.rst b/docs/src/contributing.rst
similarity index 82%
rename from docs/source/contributing.rst
rename to docs/src/contributing.rst
index d1af980756..20a1bd001f 100644
--- a/docs/source/contributing.rst
+++ b/docs/src/contributing.rst
@@ -27,12 +27,9 @@ Running the tests
 
 .. code-block:: bash
 
     cd
-    # run unit tests
-    tox
-    # run the code formatter
-    black --check .
-    # run the linter
-    flake8
+    tox -e tests   # unit tests
+    tox -e lint    # code style
+    tox -e format  # format all files
 
 You may want to setup your editor to automatically apply the `black
 <https://black.readthedocs.io/en/stable/>`_ code formatter when saving your
 files, there are plugins to do this with `all major
 editors <https://black.readthedocs.io/en/stable/integrations/editors.html>`_.
 
@@ -43,9 +40,11 @@ editors
 Issues and Pull Requests
 ########################
 
-Having a problem with scikit-matter? Please let us know by `submitting an issue <https://github.com/lab-cosmo/scikit-matter/issues>`_.
+Having a problem with scikit-matter? Please let us know by
+`submitting an issue <https://github.com/lab-cosmo/scikit-matter/issues>`_.
 
-Submit new features or bug fixes through a `pull request <https://github.com/lab-cosmo/scikit-matter/pulls>`_.
+Submit new features or bug fixes through a `pull request
+<https://github.com/lab-cosmo/scikit-matter/pulls>`_.
 
 
 Contributing Datasets
 #####################
@@ -54,9 +53,10 @@ Contributing Datasets
 
 Have an example dataset that would fit into scikit-matter?
 
 Contributing a dataset is easy. First, copy your numpy file into
-``skmatter/datasets/data/`` with an informative name. Here, we'll call it ``my-dataset.npz``.
+``src/skmatter/datasets/data/`` with an informative name. Here, we'll call it
+``my-dataset.npz``.
-Next, create a documentation file in ``skmatter/datasets/data/my-dataset.rst``.
+Next, create a documentation file in ``src/skmatter/datasets/data/my-dataset.rst``.
 This file should look like this:
 
 .. code-block::
 
@@ -132,9 +132,9 @@ Then, show ``scikit-matter`` how to load your data by adding a loader function t
 
         return Bunch(data=data, DESCR=fdescr)
 
-Add this function to ``skmatter/datasets/__init__.py``.
+Add this function to ``src/skmatter/datasets/__init__.py``.
 
-Finally, add a test to ``skmatter/tests/test_datasets.py`` to see that your dataset
+Finally, add a test to ``tests/test_datasets.py`` to see that your dataset
 loads properly. It should look something like this:
 
 .. code-block:: python
 
@@ -158,5 +158,5 @@
 
 You're good to go! Time to submit a `pull request.
diff --git a/pyproject.toml b/pyproject.toml
[the first hunk of this diff, including the start of the new [project] table, was lost in extraction; the surviving fragment resumes at the dependency list]
+    "scikit-learn>=1.1.0",
+]
+dynamic = ["version"]
+
+[project.urls]
+homepage = "http://scikit-matter.readthedocs.io"
+documentation = "http://scikit-matter.readthedocs.io"
+repository = "https://github.com/lab-cosmo/scikit-matter"
+issues = "https://github.com/lab-cosmo/scikit-matter/issues"
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.setuptools.dynamic]
+version = {attr = "skmatter.__version__"}
 
 [tool.coverage.run]
 branch = true
@@ -41,8 +53,17 @@ data_file = 'tests/.coverage'
 
 [tool.coverage.report]
 include = [
-    "skmatter/*"
+    "src/skmatter/*"
 ]
 
 [tool.coverage.xml]
 output = 'tests/coverage.xml'
+
+[tool.isort]
+skip = "__init__.py"
+profile = "black"
+line_length = 88
+indent = 4
+include_trailing_comma = true
+lines_after_imports = 2
+known_first_party = "skmatter"
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 242d4b3522..0000000000
--- a/setup.cfg
+++ /dev/null
@@ -1,26 +0,0 @@
-[metadata]
-name = skmatter
-long_description = file: README.md
-long_description_content_type = text/markdown
-
-license_files = LICENSE
-author = Rose K. Cersonsky, Guillaume Fraux, Sergei Kliavinek, Alexander Goscinski, Benjamin A. Helfrecht, Victor P. Principe, Michele Ceriotti
-author_email = rose.cersonsky@epfl.ch
-description = A collection of scikit-learn compatible utilities that implement methods born out of the materials science and chemistry communities.
-; keywords =
-url = http://scikit-matter.readthedocs.io
-classifiers = Development Status :: 3 - Alpha
-    Environment :: Console
-    Intended Audience :: Science/Research
-    License :: OSI Approved :: BSD License
-    Natural Language :: English
-    Programming Language :: Python
-    Topic :: Scientific/Engineering
-
-[options]
-include_package_data = True
-zip_safe = True
-packages = find:
-install_requires =
-    numpy
-    scikit-learn>=1.1.0
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 5d9a6710db..0000000000
--- a/setup.py
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/usr/bin/env python3
-from setuptools import setup
-import re
-
-__version__ = re.search(
-    r'__version__\s*=\s*[\'"]([^\'"]*)[\'"]', open("skmatter/__init__.py").read()
-).group(1)
-
-if __name__ == "__main__":
-    setup(version=__version__)
diff --git a/skmatter/__init__.py b/src/skmatter/__init__.py
similarity index 100%
rename from skmatter/__init__.py
rename to src/skmatter/__init__.py
diff --git a/skmatter/_selection.py b/src/skmatter/_selection.py
similarity index 96%
rename from skmatter/_selection.py
rename to src/skmatter/_selection.py
index 1d1fcb3403..76ed8c2b0e 100644
--- a/skmatter/_selection.py
+++ b/src/skmatter/_selection.py
@@ -10,16 +10,9 @@
 import scipy
 from scipy.linalg import eigh
 from scipy.sparse.linalg import eigsh
-from sklearn.base import (
-    BaseEstimator,
-    MetaEstimatorMixin,
-)
+from sklearn.base import BaseEstimator, MetaEstimatorMixin
 from sklearn.feature_selection._base import SelectorMixin
-from sklearn.utils import (
-    check_array,
-    check_random_state,
-    safe_mask,
-)
+from sklearn.utils import check_array, check_random_state, safe_mask
 from sklearn.utils._tags import _safe_tags
 from sklearn.utils.validation import check_is_fitted
 
@@ -57,19 +50,20 @@ class GreedySelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
 
     score_threshold : float, default=None
         Threshold for the score. If `None` selection will continue until the
-        n_to_select is chosen. Otherwise will stop when the score falls below the threshold.
-        Stored in :py:attr:`self.score_threshold`.
+        n_to_select is chosen. Otherwise will stop when the score falls below the
+        threshold. Stored in :py:attr:`self.score_threshold`.
 
     score_threshold_type : str, default="absolute"
         How to interpret the ``score_threshold``. When "absolute", the score used by
-        the selector is compared to the threshold directly. When "relative", at each iteration,
-        the score used by the selector is compared proportionally to the score of the first
-        selection, i.e. the selector quits when ``current_score / first_score < threshold``.
-        Stored in :py:attr:`self.score_threshold_type`.
+        the selector is compared to the threshold directly. When "relative", at each
+        iteration, the score used by the selector is compared proportionally to the
+        score of the first selection, i.e. the selector quits when ``current_score /
+        first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`.
 
     progress_bar: bool, default=False
         option to use `tqdm <https://tqdm.github.io/>`_
-        progress bar to monitor selections. Stored in :py:attr:`self.report_progress`.
+        progress bar to monitor selections. Stored in
+        :py:attr:`self.report_progress`.
 
     full : bool, default=False
         In the case that all non-redundant selections are exhausted, choose
@@ -84,7 +78,8 @@ class GreedySelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
     X_selected_ : ndarray,
                   Matrix containing the selected samples or features, for use in fitting
     y_selected_ : ndarray,
-                  In sample selection, the matrix containing the selected targets, for use in fitting
+                  In sample selection, the matrix containing the selected targets, for
+                  use in fitting
 
     """
 
@@ -199,7 +194,8 @@ def fit(self, X, y=None, warm_start=False):
         if warm_start:
             if not hasattr(self, "n_selected_") or getattr(self, "n_selected_") == 0:
                 raise ValueError(
-                    "Cannot fit with warm_start=True without having been previously initialized"
+                    "Cannot fit with warm_start=True without having been previously"
+                    " initialized."
                 )
             self._continue_greedy_search(X, y, n_iterations)
         else:
@@ -310,8 +306,8 @@ def get_support(self, indices=False, ordered=False):
         Returns
         -------
         support : An index that selects the retained subset from a original vectors.
-                  If indices is False, this is a bool array of shape [# input],
-                  in which an element is True iff its corresponding feature or sample is selected
+                  If indices is False, this is a bool array of shape [# input], in which
+                  an element is True iff its corresponding feature or sample is selected
                   for retention. If indices is True, this is an integer array of shape
                   [# n_to_select] whose values are indices into the input vectors.
 
@@ -594,8 +590,8 @@ def _compute_pi(self, X, y=None):
     def _update_post_selection(self, X, y, last_selected):
         """
         Saves the most recently selected feature, increments the feature counter,
-        and, if the CUR is iterative (if recompute_every>0), orthogonalizes the remaining features by
-        the most recently selected.
+        and, if the CUR is iterative (if recompute_every>0), orthogonalizes the
+        remaining features by the most recently selected.
         """
 
         super()._update_post_selection(X, y, last_selected)
@@ -746,9 +742,9 @@ def _continue_greedy_search(self, X, y, n_to_select):
 
     def _update_post_selection(self, X, y, last_selected):
         """
-        Saves the most recently selected feature, increments the feature counter,
-        and, if the CUR is iterative (if recompute_every>0), orthogonalizes the remaining features by
-        the most recently selected.
+        Saves the most recently selected feature, increments the feature counter, and,
+        if the CUR is iterative (if recompute_every>0), orthogonalizes the remaining
+        features by the most recently selected.
         """
 
         super()._update_post_selection(X, y, last_selected)
@@ -916,14 +912,15 @@ def get_distance(self):
         """
 
         Traditional FPS employs a column-wise Euclidean
-        distance for feature selection, which can be expressed using the covariance matrix
-        :math:`\\mathbf{C} = \\mathbf{X} ^ T \\mathbf{X}`
+        distance for feature selection, which can be expressed using the covariance
+        matrix :math:`\\mathbf{C} = \\mathbf{X} ^ T \\mathbf{X}`
 
        .. math::
           \\operatorname{d}_c(i, j) = C_{ii} - 2 C_{ij} + C_{jj}.
 
-        For sample selection, this is a row-wise Euclidean distance, which can
-        be expressed in terms of the Gram matrix :math:`\\mathbf{K} = \\mathbf{X} \\mathbf{X} ^ T`
+        For sample selection, this is a row-wise Euclidean distance, which can be
+        expressed in terms of the Gram matrix
+        :math:`\\mathbf{K} = \\mathbf{X} \\mathbf{X} ^ T`
 
        .. math::
           \\operatorname{d}_r(i, j) = K_{ii} - 2 K_{ij} + K_{jj}.
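The reflowed docstrings above leave the `GreedySelector`/`FPS` selection API itself untouched by the `src/` move. A minimal usage sketch of that API, assuming this branch is installed with `pip install .` (the random data and threshold values below are illustrative only, not taken from the patch):

    import numpy as np

    from skmatter.feature_selection import FPS

    X = np.random.RandomState(0).normal(size=(20, 8))

    # Select up to 4 columns; with "relative" thresholding (documented above),
    # selection stops early once the FPS Hausdorff score
    # d_c(i, j) = C_ii - 2 C_ij + C_jj drops below 10% of the first score.
    selector = FPS(
        n_to_select=4,
        score_threshold=0.1,
        score_threshold_type="relative",
    )
    selector.fit(X)
    print(selector.get_support(indices=True))  # indices of the selected features

The same keywords apply to the sample selectors in `skmatter.sample_selection`, which share the `GreedySelector` base documented here.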
diff --git a/skmatter/datasets/__init__.py b/src/skmatter/datasets/__init__.py
similarity index 100%
rename from skmatter/datasets/__init__.py
rename to src/skmatter/datasets/__init__.py
diff --git a/skmatter/datasets/_base.py b/src/skmatter/datasets/_base.py
similarity index 96%
rename from skmatter/datasets/_base.py
rename to src/skmatter/datasets/_base.py
index 694145969d..fdcd11e53a 100644
--- a/skmatter/datasets/_base.py
+++ b/src/skmatter/datasets/_base.py
@@ -1,13 +1,7 @@
-from os.path import (
-    dirname,
-    join,
-)
+from os.path import dirname, join
 
 import numpy as np
-from sklearn.utils import (
-    Bunch,
-    check_pandas_support,
-)
+from sklearn.utils import Bunch, check_pandas_support
 
 
 def load_nice_dataset():
diff --git a/skmatter/datasets/data/csd-1000r.npz b/src/skmatter/datasets/data/csd-1000r.npz
similarity index 100%
rename from skmatter/datasets/data/csd-1000r.npz
rename to src/skmatter/datasets/data/csd-1000r.npz
diff --git a/skmatter/datasets/data/degenerate_CH4_manifold.npz b/src/skmatter/datasets/data/degenerate_CH4_manifold.npz
similarity index 100%
rename from skmatter/datasets/data/degenerate_CH4_manifold.npz
rename to src/skmatter/datasets/data/degenerate_CH4_manifold.npz
diff --git a/skmatter/datasets/data/nice_dataset.npz b/src/skmatter/datasets/data/nice_dataset.npz
similarity index 100%
rename from skmatter/datasets/data/nice_dataset.npz
rename to src/skmatter/datasets/data/nice_dataset.npz
diff --git a/skmatter/datasets/data/who_dataset.csv b/src/skmatter/datasets/data/who_dataset.csv
similarity index 100%
rename from skmatter/datasets/data/who_dataset.csv
rename to src/skmatter/datasets/data/who_dataset.csv
diff --git a/skmatter/datasets/descr/csd-1000r.rst b/src/skmatter/datasets/descr/csd-1000r.rst
similarity index 100%
rename from skmatter/datasets/descr/csd-1000r.rst
rename to src/skmatter/datasets/descr/csd-1000r.rst
diff --git a/skmatter/datasets/descr/degenerate_CH4_manifold.rst b/src/skmatter/datasets/descr/degenerate_CH4_manifold.rst
similarity index 100%
rename from skmatter/datasets/descr/degenerate_CH4_manifold.rst
rename to src/skmatter/datasets/descr/degenerate_CH4_manifold.rst
diff --git a/skmatter/datasets/descr/nice_dataset.rst b/src/skmatter/datasets/descr/nice_dataset.rst
similarity index 100%
rename from skmatter/datasets/descr/nice_dataset.rst
rename to src/skmatter/datasets/descr/nice_dataset.rst
diff --git a/skmatter/datasets/descr/who_dataset.rst b/src/skmatter/datasets/descr/who_dataset.rst
similarity index 100%
rename from skmatter/datasets/descr/who_dataset.rst
rename to src/skmatter/datasets/descr/who_dataset.rst
diff --git a/skmatter/decomposition/__init__.py b/src/skmatter/decomposition/__init__.py
similarity index 100%
rename from skmatter/decomposition/__init__.py
rename to src/skmatter/decomposition/__init__.py
diff --git a/skmatter/decomposition/_kernel_pcovr.py b/src/skmatter/decomposition/_kernel_pcovr.py
similarity index 97%
rename from skmatter/decomposition/_kernel_pcovr.py
rename to src/skmatter/decomposition/_kernel_pcovr.py
index 3e9a870691..40a7e2844f 100644
--- a/skmatter/decomposition/_kernel_pcovr.py
+++ b/src/skmatter/decomposition/_kernel_pcovr.py
@@ -9,26 +9,13 @@
 from sklearn.kernel_ridge import KernelRidge
 from sklearn.linear_model._base import LinearModel
 from sklearn.metrics.pairwise import pairwise_kernels
-from sklearn.utils import (
-    check_array,
-    check_random_state,
-)
+from sklearn.utils import check_array, check_random_state
 from sklearn.utils._arpack import _init_arpack_v0
-from sklearn.utils.extmath import (
-    randomized_svd,
-    stable_cumsum,
-    svd_flip,
-)
-from sklearn.utils.validation import (
-    check_is_fitted,
-    check_X_y,
-)
+from sklearn.utils.extmath import randomized_svd, stable_cumsum, svd_flip
+from sklearn.utils.validation import check_is_fitted, check_X_y
 
 from ..preprocessing import KernelNormalizer
-from ..utils import (
-    check_krr_fit,
-    pcovr_kernel,
-)
+from ..utils import check_krr_fit, pcovr_kernel
 
 
 class KernelPCovR(_BasePCA, LinearModel):
@@ -191,7 +178,7 @@ class KernelPCovR(_BasePCA, LinearModel):
     [-1.5635827 ,  1.11078662]]
     >>> kpcovr.score(X, Y)
     -0.520388347837897
-    """
+    """  # NoQa: E501
 
     def __init__(
         self,
@@ -346,8 +333,9 @@ def fit(self, X, Y, W=None):
                 ]
             ):
                 raise ValueError(
-                    "Kernel parameter mismatch: the regressor has kernel parameters {%s}"
-                    " and KernelPCovR was initialized with kernel parameters {%s}"
+                    "Kernel parameter mismatch: the regressor has kernel "
+                    "parameters {%s} and KernelPCovR was initialized with kernel "
+                    "parameters {%s}"
                     % (
                         ", ".join(
                             [
@@ -381,7 +369,6 @@ def fit(self, X, Y, W=None):
             # Can be bypassed if the regressor is pre-fitted.
             try:
                 check_is_fitted(regressor)
-
             except NotFittedError:
                 self.regressor_.set_params(**regressor.get_params())
                 self.regressor_.X_fit_ = self.X_fit_
@@ -493,12 +480,14 @@ def score(self, X, Y):
 
         .. math::
           \ell=\frac{\operatorname{Tr}\left[\mathbf{K}_{VV} - 2
-          \mathbf{K}_{VN} \mathbf{T}_N (\mathbf{T}_N^T \mathbf{T}_N)^{-1} \mathbf{T}_V^T
+          \mathbf{K}_{VN} \mathbf{T}_N
+          (\mathbf{T}_N^T \mathbf{T}_N)^{-1} \mathbf{T}_V^T
           +\mathbf{T}_V(\mathbf{T}_N^T \mathbf{T}_N)^{-1} \mathbf{T}_N^T
           \mathbf{K}_{NN} \mathbf{T}_N (\mathbf{T}_N^T \mathbf{T}_N)^{-1}
          \mathbf{T}_V^T\right]}{\operatorname{Tr}(\mathbf{K}_{VV})}
 
-        The negative loss is returned for easier use in sklearn pipelines, e.g., a grid search, where methods named 'score' are meant to be maximized.
+        The negative loss is returned for easier use in sklearn pipelines, e.g., a
+        grid search, where methods named 'score' are meant to be maximized.
 
         Arguments
         ---------
diff --git a/skmatter/decomposition/_pcovr.py b/src/skmatter/decomposition/_pcovr.py
similarity index 95%
rename from skmatter/decomposition/_pcovr.py
rename to src/skmatter/decomposition/_pcovr.py
index 0365a5c4f1..de8e887957 100644
--- a/skmatter/decomposition/_pcovr.py
+++ b/src/skmatter/decomposition/_pcovr.py
@@ -8,32 +8,14 @@
 from scipy.sparse.linalg import svds
 from sklearn.decomposition._base import _BasePCA
 from sklearn.decomposition._pca import _infer_dimension
-from sklearn.linear_model import (
-    LinearRegression,
-    Ridge,
-    RidgeCV,
-)
+from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
 from sklearn.linear_model._base import LinearModel
-from sklearn.utils import (
-    check_array,
-    check_random_state,
-)
+from sklearn.utils import check_array, check_random_state
 from sklearn.utils._arpack import _init_arpack_v0
-from sklearn.utils.extmath import (
-    randomized_svd,
-    stable_cumsum,
-    svd_flip,
-)
-from sklearn.utils.validation import (
-    check_is_fitted,
-    check_X_y,
-)
-
-from ..utils import (
-    check_lr_fit,
-    pcovr_covariance,
-    pcovr_kernel,
-)
+from sklearn.utils.extmath import randomized_svd, stable_cumsum, svd_flip
+from sklearn.utils.validation import check_is_fitted, check_X_y
+
+from ..utils import check_lr_fit, pcovr_covariance, pcovr_kernel
 
 
 class PCovR(_BasePCA, LinearModel):
@@ -124,7 +106,8 @@ class PCovR(_BasePCA, LinearModel):
         regressor for computing approximated :math:`{\mathbf{\hat{Y}}}`.
         The regressor should be one `sklearn.linear_model.Ridge`,
         `sklearn.linear_model.RidgeCV`, or `sklearn.linear_model.LinearRegression`.
-        If a pre-fitted regressor is provided, it is used to compute :math:`{\mathbf{\hat{Y}}}`.
+        If a pre-fitted regressor is provided, it is used to compute
+        :math:`{\mathbf{\hat{Y}}}`.
         Note that any pre-fitting of the regressor will be lost if `PCovR` is
         within a composite estimator that enforces cloning, e.g.,
         `sklearn.compose.TransformedTargetRegressor` or
@@ -133,7 +116,8 @@ class PCovR(_BasePCA, LinearModel):
         training data as the composite estimator.
         If `precomputed`, we assume that the `y` passed to the `fit` function
         is the regressed form of the targets :math:`{\mathbf{\hat{Y}}}`.
-        If None, `sklearn.linear_model.Ridge('alpha':1e-6, 'fit_intercept':False, 'tol':1e-12)` is used as the regressor.
+        If None, ``sklearn.linear_model.Ridge('alpha':1e-6, 'fit_intercept':False, 'tol':1e-12)``
+        is used as the regressor.
 
     iterated_power : int or 'auto', default='auto'
         Number of iterations for the power method computed by
@@ -204,7 +188,7 @@ class PCovR(_BasePCA, LinearModel):
      [-1.02804032,  1.06737777],
      [ 0.98167556, -4.9830631 ],
      [-2.99627428,  1.98241962]]
-    """
+    """  # NoQa: E501
 
     def __init__(
         self,
@@ -272,7 +256,8 @@ def fit(self, X, Y, W=None):
 
         if np.max(np.abs(self.mean_)) > self.tol:
             warnings.warn(
-                "This class does not automatically center data, and your data mean is greater than the supplied tolerance."
+                "This class does not automatically center data, and your data mean is"
+                " greater than the supplied tolerance."
             )
 
         if self.space is not None and self.space not in [
@@ -618,8 +603,9 @@ def inverse_transform(self, T):
 
         if np.max(np.abs(self.mean_)) > self.tol:
             warnings.warn(
-                "This class does not automatically un-center data, and your data mean is greater than the supplied tolerance,"
-                "so the inverse transformation will be off by the original data mean."
+                "This class does not automatically un-center data, and your data mean "
+                "is greater than the supplied tolerance, so the inverse transformation "
+                "will be off by the original data mean."
             )
 
         return T @ self.ptx_
@@ -664,15 +650,19 @@ def score(self, X, Y, T=None):
 
         .. math::
 
-            \ell_{X} = \frac{\lVert \mathbf{X} - \mathbf{T}\mathbf{P}_{TX} \rVert ^ 2}{\lVert \mathbf{X}\rVert ^ 2}
+            \ell_{X} = \frac{\lVert \mathbf{X} - \mathbf{T}\mathbf{P}_{TX} \rVert ^ 2}
+                            {\lVert \mathbf{X}\rVert ^ 2}
 
         and
 
         .. math::
 
-            \ell_{Y} = \frac{\lVert \mathbf{Y} - \mathbf{T}\mathbf{P}_{TY} \rVert ^ 2}{\lVert \mathbf{Y}\rVert ^ 2}
+            \ell_{Y} = \frac{\lVert \mathbf{Y} - \mathbf{T}\mathbf{P}_{TY} \rVert ^ 2}
+                            {\lVert \mathbf{Y}\rVert ^ 2}
 
-        The negative loss :math:`-\ell = -(\ell_{X} + \ell{Y})` is returned for easier use in sklearn pipelines, e.g., a grid search, where methods named 'score' are meant to be maximized.
+        The negative loss :math:`-\ell = -(\ell_{X} + \ell{Y})` is returned for easier
+        use in sklearn pipelines, e.g., a grid search, where methods named 'score' are
+        meant to be maximized.
 
         Parameters
         ----------
@@ -686,8 +676,9 @@ def score(self, X, Y, T=None):
         Returns
         -------
         loss : float
-             Negative sum of the loss in reconstructing X from the latent-space projection T
-             and the loss in predicting Y from the latent-space projection T
+             Negative sum of the loss in reconstructing X from the latent-space
+             projection T and the loss in predicting Y from the latent-space
+             projection T
         """
 
         if T is None:
diff --git a/skmatter/feature_selection/__init__.py b/src/skmatter/feature_selection/__init__.py
similarity index 100%
rename from skmatter/feature_selection/__init__.py
rename to src/skmatter/feature_selection/__init__.py
diff --git a/skmatter/feature_selection/_base.py b/src/skmatter/feature_selection/_base.py
similarity index 81%
rename from skmatter/feature_selection/_base.py
rename to src/skmatter/feature_selection/_base.py
index e78b8ee1af..0344a6a976 100644
--- a/skmatter/feature_selection/_base.py
+++ b/src/skmatter/feature_selection/_base.py
@@ -2,12 +2,7 @@
 Sequential feature selection
 """
 
-from .._selection import (
-    _CUR,
-    _FPS,
-    _PCovCUR,
-    _PCovFPS,
-)
+from .._selection import _CUR, _FPS, _PCovCUR, _PCovFPS
 
 
 class FPS(_FPS):
@@ -28,20 +23,21 @@ class FPS(_FPS):
         select. Stored in :py:attr:`self.n_to_select`.
 
     score_threshold : float, default=None
-        Threshold for the score. If `None` selection will continue until the
-        n_to_select is chosen. Otherwise will stop when the score falls below the threshold.
-        Stored in :py:attr:`self.score_threshold`.
+        Threshold for the score. If `None` selection will continue until the n_to_select
+        is chosen. Otherwise will stop when the score falls below the threshold. Stored
+        in :py:attr:`self.score_threshold`.
 
     score_threshold_type : str, default="absolute"
         How to interpret the ``score_threshold``. When "absolute", the score used by
-        the selector is compared to the threshold directly. When "relative", at each iteration,
-        the score used by the selector is compared proportionally to the score of the first
-        selection, i.e. the selector quits when ``current_score / first_score < threshold``.
-        Stored in :py:attr:`self.score_threshold_type`.
+        the selector is compared to the threshold directly. When "relative", at each
+        iteration, the score used by the selector is compared proportionally to the
+        score of the first selection, i.e. the selector quits when
+        ``current_score / first_score < threshold``. Stored in
+        :py:attr:`self.score_threshold_type`.
 
     progress_bar: bool, default=False
-        option to use `tqdm <https://tqdm.github.io/>`_
-        progress bar to monitor selections. Stored in :py:attr:`self.report_progress`.
+        option to use `tqdm <https://tqdm.github.io/>`_ progress bar to monitor
+        selections. Stored in :py:attr:`self.report_progress`.
 
     full : bool, default=False
         In the case that all non-redundant selections are exhausted, choose
@@ -104,19 +100,20 @@ class PCovFPS(_PCovFPS):
 
     score_threshold : float, default=None
         Threshold for the score. If `None` selection will continue until the
-        n_to_select is chosen. Otherwise will stop when the score falls below the threshold.
-        Stored in :py:attr:`self.score_threshold`.
+        n_to_select is chosen. Otherwise will stop when the score falls below the
+        threshold. Stored in :py:attr:`self.score_threshold`.
 
     score_threshold_type : str, default="absolute"
         How to interpret the ``score_threshold``. When "absolute", the score used by
-        the selector is compared to the threshold directly. When "relative", at each iteration,
-        the score used by the selector is compared proportionally to the score of the first
-        selection, i.e. the selector quits when ``current_score / first_score < threshold``.
-        Stored in :py:attr:`self.score_threshold_type`.
+        the selector is compared to the threshold directly. When "relative", at each
+        iteration, the score used by the selector is compared proportionally to the
+        score of the first selection, i.e. the selector quits when
+        ``current_score / first_score < threshold``. Stored in
+        :py:attr:`self.score_threshold_type`.
 
     progress_bar: bool, default=False
-        option to use `tqdm <https://tqdm.github.io/>`_
-        progress bar to monitor selections. Stored in :py:attr:`self.report_progress`.
+        option to use `tqdm <https://tqdm.github.io/>`_ progress bar to monitor
+        selections. Stored in :py:attr:`self.report_progress`.
 
     full : bool, default=False
         In the case that all non-redundant selections are exhausted, choose
@@ -184,19 +181,20 @@ class CUR(_CUR):
 
     score_threshold : float, default=None
         Threshold for the score. If `None` selection will continue until the
-        n_to_select is chosen. Otherwise will stop when the score falls below the threshold.
-        Stored in :py:attr:`self.score_threshold`.
+        n_to_select is chosen. Otherwise will stop when the score falls below the
+        threshold. Stored in :py:attr:`self.score_threshold`.
 
     score_threshold_type : str, default="absolute"
         How to interpret the ``score_threshold``. When "absolute", the score used by
-        the selector is compared to the threshold directly. When "relative", at each iteration,
-        the score used by the selector is compared proportionally to the score of the first
-        selection, i.e. the selector quits when ``current_score / first_score < threshold``.
-        Stored in :py:attr:`self.score_threshold_type`.
+        the selector is compared to the threshold directly. When "relative", at each
+        iteration, the score used by the selector is compared proportionally to the
+        score of the first selection, i.e. the selector quits when
+        ``current_score / first_score < threshold``. Stored in
+        :py:attr:`self.score_threshold_type`.
 
     progress_bar: bool, default=False
-        option to use `tqdm <https://tqdm.github.io/>`_
-        progress bar to monitor selections. Stored in :py:attr:`self.report_progress`.
+        option to use `tqdm <https://tqdm.github.io/>`_ progress bar to monitor
+        selections. Stored in :py:attr:`self.report_progress`.
 
     full : bool, default=False
         In the case that all non-redundant selections are exhausted, choose
@@ -272,20 +270,21 @@ class PCovCUR(_PCovCUR):
         select. Stored in :py:attr:`self.n_to_select`.
 
     score_threshold : float, default=None
-        Threshold for the score. If `None` selection will continue until the
-        n_to_select is chosen. Otherwise will stop when the score falls below the threshold.
-        Stored in :py:attr:`self.score_threshold`.
+        Threshold for the score. If `None` selection will continue until the n_to_select
+        is chosen. Otherwise will stop when the score falls below the threshold. Stored
+        in :py:attr:`self.score_threshold`.
 
     score_threshold_type : str, default="absolute"
-        How to interpret the ``score_threshold``. When "absolute", the score used by
-        the selector is compared to the threshold directly. When "relative", at each iteration,
-        the score used by the selector is compared proportionally to the score of the first
-        selection, i.e. the selector quits when ``current_score / first_score < threshold``.
-        Stored in :py:attr:`self.score_threshold_type`.
+        How to interpret the ``score_threshold``. When "absolute", the score used by the
+        selector is compared to the threshold directly. When "relative", at each
+        iteration, the score used by the selector is compared proportionally to the
+        score of the first selection, i.e. the selector quits when
+        ``current_score / first_score < threshold``. Stored in
+        :py:attr:`self.score_threshold_type`.
 
     progress_bar: bool, default=False
-        option to use `tqdm <https://tqdm.github.io/>`_
-        progress bar to monitor selections. Stored in :py:attr:`self.report_progress`.
+        option to use `tqdm <https://tqdm.github.io/>`_ progress bar to monitor
+        selections. Stored in :py:attr:`self.report_progress`.
 
     full : bool, default=False
         In the case that all non-redundant selections are exhausted, choose
diff --git a/skmatter/linear_model/__init__.py b/src/skmatter/linear_model/__init__.py
similarity index 100%
rename from skmatter/linear_model/__init__.py
rename to src/skmatter/linear_model/__init__.py
diff --git a/skmatter/linear_model/_base.py b/src/skmatter/linear_model/_base.py
similarity index 98%
rename from skmatter/linear_model/_base.py
rename to src/skmatter/linear_model/_base.py
index 0698b3ba16..de6b354cb1 100644
--- a/skmatter/linear_model/_base.py
+++ b/src/skmatter/linear_model/_base.py
@@ -1,9 +1,6 @@
 import numpy as np
 from scipy.linalg import orthogonal_procrustes
-from sklearn.base import (
-    MultiOutputMixin,
-    RegressorMixin,
-)
+from sklearn.base import MultiOutputMixin, RegressorMixin
 from sklearn.linear_model import LinearRegression
 
 
@@ -45,8 +42,7 @@ class OrthogonalRegression(MultiOutputMixin, RegressorMixin):
     coef_ : ndarray of shape (n_features,) or (n_targets, n_features) or (max_components, max_components)
         Weight matrix. The shape (max_components, max_components) is used if
         `use_orthogonal_projector` is set to False.
-
-    """
+    """  # NoQa: E501
 
     def __init__(self, use_orthogonal_projector=True, linear_estimator=None):
         self.use_orthogonal_projector = use_orthogonal_projector
diff --git a/skmatter/linear_model/_ridge.py b/src/skmatter/linear_model/_ridge.py
similarity index 92%
rename from skmatter/linear_model/_ridge.py
rename to src/skmatter/linear_model/_ridge.py
index 5ba59aa1e1..d9f477917d 100644
--- a/skmatter/linear_model/_ridge.py
+++ b/src/skmatter/linear_model/_ridge.py
@@ -1,18 +1,14 @@
 import numpy as np
-from joblib import (
-    Parallel,
-    delayed,
-)
-from sklearn.base import (
-    MultiOutputMixin,
-    RegressorMixin,
-)
+from joblib import Parallel, delayed
+from sklearn.base import MultiOutputMixin, RegressorMixin
 from sklearn.metrics import check_scoring
 from sklearn.model_selection import KFold
 
 
 class RidgeRegression2FoldCV(MultiOutputMixin, RegressorMixin):
-    r"""Ridge regression with an efficient 2-fold cross-validation method using the SVD solver.
+    r"""Ridge regression with an efficient 2-fold cross-validation method using the SVD
+    solver.
+
     Minimizes the objective function:
 
     .. math::

        \|y - Xw\|^2_2 + \alpha \|w\|^2_2,

     while the alpha value is determined with a 2-fold cross-validation from a list
     of alpha values. It is more efficient than doing a 2-fold cross-validation using
     :obj:`sklearn.linear_model.RidgeCV`.
-    The advantage over :obj:`sklearn.linear_model.RidgeCV` using leave-one-out cross-validation
-    (LOOCV) [loocv]_ needs to be analyzed more in detail. Internal benchmarks suggest that it
-    is more efficient than the LOOCV in :obj:`sklearn.linear_model.RidgeCV` for feature sizes < 600
-    and in general more accurate, see issue #40. However, it is constraint to a svd
-    solver for the matrix inversion.
-    It offers additional functionalities in comparison to :obj:`sklearn.linear_model.Ridge`:
-    The regularaization parameters can be chosen relative to the largest eigenvalue of the feature matrix
+    The advantage over :obj:`sklearn.linear_model.RidgeCV` using leave-one-out
+    cross-validation (LOOCV) [loocv]_ needs to be analyzed more in detail. Internal
+    benchmarks suggest that it is more efficient than the LOOCV in
+    :obj:`sklearn.linear_model.RidgeCV` for feature sizes < 600 and in general more
+    accurate, see issue #40. However, it is constraint to a svd solver for the matrix
+    inversion.
+    It offers additional functionalities in comparison to
+    :obj:`sklearn.linear_model.Ridge`: The regularaization parameters can be chosen
+    relative to the largest eigenvalue of the feature matrix
     as well as regularization method. Details are explained in the `Parameters` section.
 
     Parameters
     ----------
@@ -92,7 +90,7 @@ class RidgeRegression2FoldCV(MultiOutputMixin, RegressorMixin):
     ----------
     .. [loocv] Rifkin "Regularized Least Squares."
        https://www.mit.edu/~9.520/spring07/Classes/rlsslides.pdf
-    """
+    """  # NoQa: E501
 
     def __init__(
         self,
@@ -124,7 +122,8 @@ def fit(self, X, y):
             Training data, where n_samples is the number of samples
             and n_targets is the number of target properties.
         """
-        # check input parameters, can be moved at some point to a sklearn-like check function
+        # check input parameters, can be moved at some point to a sklearn-like check
+        # function
         if self.regularization_method not in ["tikhonov", "cutoff"]:
             raise ValueError(
                 f"regularization method {self.regularization_method} is not known."
@@ -135,10 +134,12 @@ def fit(self, X, y):
             np.any(self.alphas < 0) or np.any(self.alphas >= 1)
         ):
             raise ValueError(
-                "relative alphas type used, but the alphas are not within the range [0,1)"
+                "relative alphas type used, but the alphas are not within the range "
+                "[0,1)"
             )
 
-        # check_scoring uses estimators scoring function if the scorer is None, this is intercepted here
+        # check_scoring uses estimators scoring function if the scorer is None, this is
+        # intercepted here
         if self.scoring is None:
             scorer = check_scoring(
                 self, scoring="neg_root_mean_squared_error", allow_none=False
diff --git a/skmatter/metrics/__init__.py b/src/skmatter/metrics/__init__.py
similarity index 100%
rename from skmatter/metrics/__init__.py
rename to src/skmatter/metrics/__init__.py
diff --git a/skmatter/metrics/_reconstruction_measures.py b/src/skmatter/metrics/_reconstruction_measures.py
similarity index 99%
rename from skmatter/metrics/_reconstruction_measures.py
rename to src/skmatter/metrics/_reconstruction_measures.py
index 5e8420779d..02d3d6557b 100644
--- a/skmatter/metrics/_reconstruction_measures.py
+++ b/src/skmatter/metrics/_reconstruction_measures.py
@@ -1,13 +1,7 @@
 import numpy as np
-from joblib import (
-    Parallel,
-    delayed,
-)
-
-from ..linear_model import (
-    OrthogonalRegression,
-    RidgeRegression2FoldCV,
-)
+from joblib import Parallel, delayed
+
+from ..linear_model import OrthogonalRegression, RidgeRegression2FoldCV
 from ..model_selection import train_test_split
 from ..preprocessing import StandardFlexibleScaler
 
@@ -199,7 +193,8 @@ def pointwise_global_reconstruction_distortion(
         GRD^{(i)}(X,Y) = \min_Q ||y_i - x_iQ\|| \quad\mathrm{subject\ to}\quad Q^TQ=I
 
     If used with X and Y of shape (n_samples, n_features) it computes the pointwise
-    global reconstruction distortion of the features as defined in Ref. [Goscinski2021]_.
+    global reconstruction distortion of the features as defined in
+    Ref. [Goscinski2021]_.
     In this case the number of samples of X and Y should agree with each other,
     but the number of features can be different. The distortion is expressed per sample.
 
@@ -295,7 +290,8 @@ def global_reconstruction_distortion(
         GRD(X,Y) = \min_Q ||y - XQ\|| \quad\mathrm{subject\ to}\quad Q^TQ=I
 
     If used with X and Y of shape (n_samples, n_features) it computes the
-    global reconstruction distortion of the features as defined in Ref. [Goscinski2021]_.
+    global reconstruction distortion of the features as defined in
+    Ref. [Goscinski2021]_.
     In this case the number of samples of X and Y should agree with each other,
     but the number of features can be different. The distortion is expressed per sample.
diff --git a/skmatter/model_selection/__init__.py b/src/skmatter/model_selection/__init__.py
similarity index 100%
rename from skmatter/model_selection/__init__.py
rename to src/skmatter/model_selection/__init__.py
diff --git a/skmatter/model_selection/_split.py b/src/skmatter/model_selection/_split.py
similarity index 99%
rename from skmatter/model_selection/_split.py
rename to src/skmatter/model_selection/_split.py
index 0dc5b97953..6e1f2cbcfb 100644
--- a/skmatter/model_selection/_split.py
+++ b/src/skmatter/model_selection/_split.py
@@ -44,7 +44,7 @@ def train_test_split(*arrays, **options):
     -------
     splitting : list, length=2 * len(arrays)
         List containing train-test split of inputs.
-    """
+    """  # NoQa: E501
     train_test_overlap = options.pop("train_test_overlap", False)
     test_size = options.get("test_size", None)
     train_size = options.get("train_size", None)
diff --git a/skmatter/preprocessing/__init__.py b/src/skmatter/preprocessing/__init__.py
similarity index 100%
rename from skmatter/preprocessing/__init__.py
rename to src/skmatter/preprocessing/__init__.py
diff --git a/skmatter/preprocessing/_data.py b/src/skmatter/preprocessing/_data.py
similarity index 98%
rename from skmatter/preprocessing/_data.py
rename to src/skmatter/preprocessing/_data.py
index aefeca7867..62179002f8 100644
--- a/skmatter/preprocessing/_data.py
+++ b/src/skmatter/preprocessing/_data.py
@@ -1,14 +1,7 @@
 import numpy as np
-from sklearn.base import (
-    BaseEstimator,
-    TransformerMixin,
-)
+from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.preprocessing._data import KernelCenterer
-from sklearn.utils.validation import (
-    FLOAT_DTYPES,
-    _check_sample_weight,
-    check_is_fitted,
-)
+from sklearn.utils.validation import FLOAT_DTYPES, _check_sample_weight, check_is_fitted
 
 
 class StandardFlexibleScaler(TransformerMixin, BaseEstimator):
@@ -429,7 +422,8 @@ class SparseKernelCenterer(TransformerMixin, BaseEstimator):
     approximation:
 
     .. math::
-        \mathbf{K} \approx \hat{\mathbf{K}}_{N N}=\mathbf{K}_{N M} \mathbf{K}_{M M}^{-1} \mathbf{K}_{N M}^{T}
+        \mathbf{K} \approx \hat{\mathbf{K}}_{N N}
+        = \mathbf{K}_{N M} \mathbf{K}_{M M}^{-1} \mathbf{K}_{N M}^{T}
 
     where the subscripts for $\mathbf{K}$ denote the size of the sets
     of samples compared in each kernel, with $N$ being the size of the full data set and
diff --git a/skmatter/sample_selection/__init__.py b/src/skmatter/sample_selection/__init__.py
similarity index 100%
rename from skmatter/sample_selection/__init__.py
rename to src/skmatter/sample_selection/__init__.py
diff --git a/skmatter/sample_selection/_base.py b/src/skmatter/sample_selection/_base.py
similarity index 86%
rename from skmatter/sample_selection/_base.py
rename to src/skmatter/sample_selection/_base.py
index 2377eb2e35..478e4637e5 100644
--- a/skmatter/sample_selection/_base.py
+++ b/src/skmatter/sample_selection/_base.py
@@ -5,29 +5,19 @@
 import warnings
 
 import numpy as np
-from scipy.interpolate import (
-    LinearNDInterpolator,
-    interp1d,
-)
+from scipy.interpolate import LinearNDInterpolator, interp1d
 from scipy.interpolate.interpnd import _ndim_coords_from_arrays
 from scipy.spatial import ConvexHull
-from sklearn.utils.validation import (
-    check_array,
-    check_is_fitted,
-    check_X_y,
-)
+from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
 
-from .._selection import (
-    _CUR,
-    _FPS,
-    _PCovCUR,
-    _PCovFPS,
-)
+from .._selection import _CUR, _FPS, _PCovCUR, _PCovFPS
 
 
 def _linear_interpolator(points, values):
     """
-    Returns linear interpolater for unstructured D-D data. Tessellate the input point set to N-D simplices, and interpolate linearly on each simplex. See `LinearNDInterpolator` for more details.
+    Returns linear interpolater for unstructured D-D data. Tessellate the input point
+    set to N-D simplices, and interpolate linearly on each simplex. See
+    ``LinearNDInterpolator`` for more details.
 
     points : 2-D ndarray of floats with shape (n, D), or length D tuple of 1-D ndarrays with shape (n,).
         Data point coordinates.
     values : ...
 
     ---------
     The code is an adapted excerpt from
     https://github.com/scipy/scipy/blob/dde50595862a4f9cede24b5d1c86935c30f1f88a/scipy/interpolate/_ndgriddata.py#L119-L273
-    """
+    """  # NoQa: E501
 
     points = _ndim_coords_from_arrays(points)
 
@@ -80,19 +70,20 @@ class FPS(_FPS):
 
     score_threshold : float, default=None
         Threshold for the score. If `None` selection will continue until the
-        n_to_select is chosen. Otherwise will stop when the score falls below the threshold.
-        Stored in :py:attr:`self.score_threshold`.
+        n_to_select is chosen. Otherwise will stop when the score falls below the
+        threshold. Stored in :py:attr:`self.score_threshold`.
 
     score_threshold_type : str, default="absolute"
-        How to interpret the ``score_threshold``. When "absolute", the score used by the
-        selector is compared to the threshold directly. When "relative", at each iteration,
-        the score used by the selector is compared proportionally to the score of the first
-        selection, i.e. the selector quits when ``current_score / first_score < threshold``.
-        Stored in :py:attr:`self.score_threshold_type`.
+        How to interpret the ``score_threshold``. When "absolute", the score used by the
+        selector is compared to the threshold directly. When "relative", at each
+        iteration, the score used by the selector is compared proportionally to the
+        score of the first selection, i.e. the selector quits when
+        ``current_score / first_score < threshold``. Stored in
+        :py:attr:`self.score_threshold_type`.
 
     progress_bar: bool, default=False
-        option to use `tqdm <https://tqdm.github.io/>`_
-        progress bar to monitor selections. Stored in :py:attr:`self.report_progress`.
+        option to use `tqdm <https://tqdm.github.io/>`_ progress bar to monitor
+        selections. Stored in :py:attr:`self.report_progress`.
 
     full : bool, default=False
         In the case that all non-redundant selections are exhausted, choose
@@ -108,7 +99,8 @@ class FPS(_FPS):
     X_selected_ : ndarray,
                   Matrix containing the selected samples, for use in fitting
     y_selected_ : ndarray,
-                  In sample selection, the matrix containing the selected targets, for use in fitting
+                  In sample selection, the matrix containing the selected targets, for
+                  use in fitting
 
     """
 
@@ -156,20 +148,21 @@ class PCovFPS(_PCovFPS):
         select. Stored in :py:attr:`self.n_to_select`.
 
     score_threshold : float, default=None
-        Threshold for the score. If `None` selection will continue until the
-        n_to_select is chosen. Otherwise will stop when the score falls below the threshold.
-        Stored in :py:attr:`self.score_threshold`.
+        Threshold for the score. If `None` selection will continue until the n_to_select
+        is chosen. Otherwise will stop when the score falls below the threshold. Stored
+        in :py:attr:`self.score_threshold`.
 
     score_threshold_type : str, default="absolute"
-        How to interpret the ``score_threshold``. When "absolute", the score used by
-        the selector is compared to the threshold directly. When "relative", at each iteration,
-        the score used by the selector is compared proportionally to the score of the first
-        selection, i.e. the selector quits when ``current_score / first_score < threshold``.
-        Stored in :py:attr:`self.score_threshold_type`.
+        How to interpret the ``score_threshold``. When "absolute", the score used by the
+        selector is compared to the threshold directly. When "relative", at each
+        iteration, the score used by the selector is compared proportionally to the
+        score of the first selection, i.e. the selector quits when
+        ``current_score / first_score < threshold``. Stored in
+        :py:attr:`self.score_threshold_type`.
 
     progress_bar: bool, default=False
-        option to use `tqdm <https://tqdm.github.io/>`_
-        progress bar to monitor selections. Stored in :py:attr:`self.report_progress`.
+        option to use `tqdm <https://tqdm.github.io/>`_ progress bar to monitor
+        selections. Stored in :py:attr:`self.report_progress`.
 
     full : bool, default=False
         In the case that all non-redundant selections are exhausted, choose
@@ -185,7 +178,8 @@ class PCovFPS(_PCovFPS):
     X_selected_ : ndarray,
                   Matrix containing the selected samples, for use in fitting
     y_selected_ : ndarray,
-                  In sample selection, the matrix containing the selected targets, for use in fitting
+                  In sample selection, the matrix containing the selected targets, for
+                  use in fitting
 
     """
 
@@ -239,19 +233,20 @@ class CUR(_CUR):
 
     score_threshold : float, default=None
         Threshold for the score. If `None` selection will continue until the
-        n_to_select is chosen. Otherwise will stop when the score falls below the threshold.
-        Stored in :py:attr:`self.score_threshold`.
+        n_to_select is chosen. Otherwise will stop when the score falls below the
+        threshold. Stored in :py:attr:`self.score_threshold`.
 
     score_threshold_type : str, default="absolute"
         How to interpret the ``score_threshold``. When "absolute", the score used by
-        the selector is compared to the threshold directly. When "relative", at each iteration,
-        the score used by the selector is compared proportionally to the score of the first
-        selection, i.e. the selector quits when ``current_score / first_score < threshold``.
-        Stored in :py:attr:`self.score_threshold_type`.
+        the selector is compared to the threshold directly. When "relative", at each
+        iteration, the score used by the selector is compared proportionally to the
+        score of the first selection, i.e. the selector quits when
+        ``current_score / first_score < threshold``. Stored in
+        :py:attr:`self.score_threshold_type`.
 
     progress_bar: bool, default=False
-        option to use `tqdm <https://tqdm.github.io/>`_
-        progress bar to monitor selections. Stored in :py:attr:`self.report_progress`.
+        option to use `tqdm <https://tqdm.github.io/>`_ progress bar to monitor
+        selections. Stored in :py:attr:`self.report_progress`.
 
     full : bool, default=False
         In the case that all non-redundant selections are exhausted, choose
@@ -270,7 +265,8 @@ class CUR(_CUR):
     X_selected_ : ndarray,
                   Matrix containing the selected samples, for use in fitting
     y_selected_ : ndarray,
-                  In sample selection, the matrix containing the selected targets, for use in fitting
+                  In sample selection, the matrix containing the selected targets, for
+                  use in fitting
 
     """
 
@@ -332,19 +328,21 @@ class PCovCUR(_PCovCUR):
 
     score_threshold : float, default=None
         Threshold for the score. If `None` selection will continue until the
-        n_to_select is chosen. Otherwise will stop when the score falls below the threshold.
-        Stored in :py:attr:`self.score_threshold`.
+        n_to_select is chosen. Otherwise will stop when the score falls below the
+        threshold. Stored in :py:attr:`self.score_threshold`.
 
     score_threshold_type : str, default="absolute"
         How to interpret the ``score_threshold``. When "absolute", the score used by
-        the selector is compared to the threshold directly. When "relative", at each iteration,
-        the score used by the selector is compared proportionally to the score of the first
-        selection, i.e. the selector quits when ``current_score / first_score < threshold``.
-        Stored in :py:attr:`self.score_threshold_type`.
+        the selector is compared to the threshold directly. When "relative", at each
+        iteration, the score used by the selector is compared proportionally to the
+        score of the first selection, i.e. the selector quits when
+        ``current_score / first_score < threshold``. Stored in
+        :py:attr:`self.score_threshold_type`.
 
     progress_bar: bool, default=False
         option to use `tqdm <https://tqdm.github.io/>`_
-        progress bar to monitor selections. Stored in :py:attr:`self.report_progress`.
+        progress bar to monitor selections.
+        Stored in :py:attr:`self.report_progress`.
 
     full : bool, default=False
         In the case that all non-redundant selections are exhausted, choose
@@ -367,7 +365,8 @@ class PCovCUR(_PCovCUR):
     X_selected_ : ndarray,
                   Matrix containing the selected samples, for use in fitting
     y_selected_ : ndarray,
-                  In sample selection, the matrix containing the selected targets, for use in fitting
+                  In sample selection, the matrix containing the selected targets, for
+                  use in fitting
 
     """
 
@@ -424,7 +423,8 @@ def _directional_distance(equations, points):
 
 class DirectionalConvexHull:
     """
-    Performs Sample Selection by constructing a Directional Convex Hull and determining the distance to the hull as outlined in the reference
+    Performs Sample Selection by constructing a Directional Convex Hull and determining
+    the distance to the hull as outlined in the reference
 
     Parameters
     ----------
@@ -604,13 +604,13 @@ def score_samples(self, X, y):
         return self._directional_convex_hull_distance(hull_space_data).reshape(y.shape)
 
     def _directional_convex_hull_distance(self, points):
-        """
-        Get the distance to the fitted directional convex hull in the target dimension y.
diff --git a/skmatter/sample_selection/_voronoi_fps.py b/src/skmatter/sample_selection/_voronoi_fps.py
similarity index 95%
rename from skmatter/sample_selection/_voronoi_fps.py
rename to src/skmatter/sample_selection/_voronoi_fps.py
index 6a772ef78a..2413ed927b 100644
--- a/skmatter/sample_selection/_voronoi_fps.py
+++ b/src/skmatter/sample_selection/_voronoi_fps.py
@@ -24,10 +24,10 @@ class VoronoiFPS(GreedySelector):

     .. image:: VoronoiFPS-Schematic.pdf

-    To demonstrate the algorithm behind Voronoi FPS, let :math:`*_{m+1}` be a new chosen point,
-    :math:`v(j)` was chosen earlier, :math:`j` is a point in the polyhedron with center
-    :math:`v(j)`. From the inequalities of the triangle one can easily see that if
-    :math:`d(v(j),j)=

-        Ensure that when we score on the points we fitted that we obtain only >= 0 distances
-        In an old implementation we observed this bug for the dataset we use in this test
-        (see issue #162)
+        Ensure that when we score on the points we fitted that we obtain only >= 0
+        distances.
+
+        In an old implementation we observed this bug for the dataset we use in this
+        test (see issue #162).
         """
         X = [
             [1.88421449, 0.86675162],
@@ -161,7 +170,8 @@ def test_positive_score(self):

     def test_score_function_warnings(self):
         """
-        Ensure that calling `score_samples` with points outside the range causes an error
+        Ensure that calling `score_samples` with points outside the range causes an
+        error.
         """
         selector = DirectionalConvexHull(low_dim_idx=[0])
@@ -184,7 +194,8 @@
         self.assertTrue(len(warning) == 1)
         self.assertTrue(issubclass(warning[0].category, UserWarning))
         self.assertTrue(
-            "There are samples in X with a low-dimensional part that is outside of the range of the convex surface. Distance will contain nans."
+            "There are samples in X with a low-dimensional part that is outside "
+            "of the range of the convex surface. Distance will contain nans."
            == str(warning[0].message)
        )
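The Voronoi FPS shortcut quoted in the VoronoiFPS docstring above rests on the triangle inequality; a small numeric check (pure NumPy, values made up) makes the argument concrete:

# Illustrative check of the shortcut: if a point j is closer to its current
# Voronoi center v than half the distance from v to the newly selected point,
# j cannot move into the new point's cell, so its distance needs no update.
import numpy as np

v = np.array([0.0, 0.0])    # previously selected center
j = np.array([0.3, 0.1])    # point currently assigned to v's cell
new = np.array([2.0, 0.0])  # newly selected point

d_vj = np.linalg.norm(v - j)
d_vnew = np.linalg.norm(v - new)

if d_vj < d_vnew / 2:
    # d(j, new) >= d(v, new) - d(v, j) > d(v, new)/2 > d(v, j),
    # hence j stays in the cell of v
    print("j remains assigned to v; no distance recomputation needed")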
diff --git a/tests/test_greedy_selector.py b/tests/test_greedy_selector.py
index a9d03d8a0d..e059e4a6fe 100644
--- a/tests/test_greedy_selector.py
+++ b/tests/test_greedy_selector.py
@@ -55,7 +55,8 @@ def test_bad_warm_start(self):
             selector.fit(self.X, warm_start=True)
         self.assertTrue(
             str(cm.exception),
-            "Cannot fit with warm_start=True without having been previously initialized",
+            "Cannot fit with warm_start=True without having been previously "
+            "initialized",
         )

     def test_bad_y(self):
@@ -122,7 +123,8 @@ def test_size_input(self):
             selector_feature.fit(X)
         self.assertEqual(
             str(cm.exception),
-            "Found array with 1 feature(s) (shape=(5, 1)) while a minimum of 2 is required.",
+            "Found array with 1 feature(s) (shape=(5, 1)) while a minimum of 2 is "
+            "required.",
         )
         X = X.reshape(1, -1)
         selector_feature.fit(X)
@@ -130,7 +132,8 @@ def test_size_input(self):
             selector_sample.fit(X)
         self.assertEqual(
             str(cm.exception),
-            "Found array with 1 sample(s) (shape=(1, 5)) while a minimum of 2 is required.",
+            "Found array with 1 sample(s) (shape=(1, 5)) while a minimum of 2 is "
+            "required.",
         )
diff --git a/tests/test_kernel_normalizer.py b/tests/test_kernel_normalizer.py
index dd32ccaa96..694b39d0a2 100644
--- a/tests/test_kernel_normalizer.py
+++ b/tests/test_kernel_normalizer.py
@@ -12,7 +12,8 @@ def __init__(self, *args, **kwargs):
         self.random_state = np.random.RandomState(0)

     def test_sample_weights(self):
-        """Checks that sample weights of one are equal to the unweighted case and that nonuniform weights are different from the unweighted case"""
+        """Checks that sample weights of one are equal to the unweighted case and
+        that nonuniform weights are different from the unweighted case"""
         K = self.random_state.uniform(0, 100, size=(3, 3))
         equal_wts = np.ones(len(K))
         nonequal_wts = self.random_state.uniform(0, 100, size=(len(K),))
@@ -29,7 +30,8 @@ def test_sample_weights(self):
         )

     def test_invalid_sample_weights(self):
-        """Checks that weights must be 1D array with the same length as the number of samples"""
+        """Checks that weights must be 1D array with the same length as the number of
+        samples"""
         K = self.random_state.uniform(0, 100, size=(3, 3))
         wts_len = np.ones(len(K) + 1)
         wts_dim = np.ones((len(K), 2))
diff --git a/tests/test_kernel_pcovr.py b/tests/test_kernel_pcovr.py
index 3744b05c01..5ffa345e0c 100644
--- a/tests/test_kernel_pcovr.py
+++ b/tests/test_kernel_pcovr.py
@@ -4,16 +4,10 @@
 from sklearn import exceptions
 from sklearn.datasets import load_diabetes as get_dataset
 from sklearn.kernel_ridge import KernelRidge
-from sklearn.linear_model import (
-    Ridge,
-    RidgeCV,
-)
+from sklearn.linear_model import Ridge, RidgeCV
 from sklearn.utils.validation import check_X_y

-from skmatter.decomposition import (
-    KernelPCovR,
-    PCovR,
-)
+from skmatter.decomposition import KernelPCovR, PCovR
 from skmatter.preprocessing import StandardFlexibleScaler as SFS
@@ -47,7 +41,7 @@ def __init__(self, *args, **kwargs):
             regressor=regressor,
             n_components=n_components,
             svd_solver=kwargs.pop("svd_solver", "full"),
-            **kwargs
+            **kwargs,
         )

     def setUp(self):
@@ -333,7 +327,7 @@ def _linear_kernel(X, Y):
                 kernel=kernel, **kernel_params.get(kernel, {})
             ),
             kernel=kernel,
-            **kernel_params.get(kernel, {})
+            **kernel_params.get(kernel, {}),
         )

         kpcovr.fit(self.X, self.Y)
@@ -358,7 +352,7 @@ def test_linear_matches_pcovr(self):
             regressor=KernelRidge(alpha=ridge.alpha_, kernel="linear"),
             kernel="linear",
             fit_inverse_transform=True,
-            **hypers
+            **hypers,
         )
         kpcovr.fit(self.X, self.Y)
         ly = (
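test_linear_matches_pcovr above relies on KernelPCovR with a linear kernel reproducing PCovR. A hedged sketch of that equivalence, with illustrative hyperparameters rather than the ones used in the test:

# Illustrative only: with a linear kernel, the KernelPCovR latent space should
# match the PCovR one up to sign/rotation. Hyperparameters are made up.
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge

from skmatter.decomposition import KernelPCovR, PCovR

X, y = load_diabetes(return_X_y=True)
X = (X - X.mean(axis=0)) / X.std(axis=0)
y = (y - y.mean()) / y.std()

pcovr = PCovR(mixing=0.5, n_components=2, regressor=Ridge(alpha=1e-8))
kpcovr = KernelPCovR(
    mixing=0.5,
    n_components=2,
    regressor=KernelRidge(alpha=1e-8, kernel="linear"),
    kernel="linear",
)

T = pcovr.fit_transform(X, y)
Tk = kpcovr.fit_transform(X, y)
print(T.shape, Tk.shape)  # latent projections to compare, up to sign/rotation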
diff --git a/tests/test_linear_model.py b/tests/test_linear_model.py
index aa603c57f8..1d0cb0084e 100644
--- a/tests/test_linear_model.py
+++ b/tests/test_linear_model.py
@@ -3,15 +3,9 @@
 import numpy as np
 from parameterized import parameterized
 from sklearn.datasets import load_iris
-from sklearn.utils import (
-    check_random_state,
-    extmath,
-)
+from sklearn.utils import check_random_state, extmath

-from skmatter.linear_model import (
-    OrthogonalRegression,
-    RidgeRegression2FoldCV,
-)
+from skmatter.linear_model import OrthogonalRegression, RidgeRegression2FoldCV

 class BaseTests(unittest.TestCase):
@@ -32,7 +26,8 @@ def setUpClass(cls):
         cls.features_rotated_small = cls.features_small @ random_orthonormal_mat

     def test_orthogonal_regression_small_to_rotated_small(self):
-        # tests if OrthogonalRegression can predict rotated small features using small features with use_orthogonal_projector False
+        # tests if OrthogonalRegression can predict rotated small features using small
+        # features with use_orthogonal_projector False
         err = np.linalg.norm(
             self.features_rotated_small
             - OrthogonalRegression(use_orthogonal_projector=False)
@@ -53,13 +48,15 @@ def test_orthogonal_regression_large_to_small(self):
         )
         self.assertTrue(
             n_features == self.features_large.shape[1],
-            f"n_features {n_features} does not match larger feature size {self.features_large.shape[1]}",
+            f"n_features {n_features} does not match larger feature size "
+            f"{self.features_large.shape[1]}",
         )

     def test_orthogonal_regression_use_orthogonal_projector_small_to_rotated_small(
         self,
     ):
-        # tests if OrthogonalRegression can predict rotated small features using small features with use_orthogonal_projector True
+        # tests if OrthogonalRegression can predict rotated small features using small
+        # features with use_orthogonal_projector True
         err = np.linalg.norm(
             self.features_rotated_small
             - OrthogonalRegression(use_orthogonal_projector=True)
@@ -80,7 +77,8 @@ def test_orthogonal_regression_use_orthogonal_projector_small_to_large(self):
         )
         self.assertTrue(
             n_features == self.features_large.shape[1],
-            f"n_features {n_features} does not match projection feature size {self.features_large.shape[1]}",
+            f"n_features {n_features} does not match projection feature size "
+            f"{self.features_large.shape[1]}",
         )

     def test_orthogonal_regression_use_orthogonal_projector_large_to_small(self):
@@ -93,7 +91,8 @@
         )
         self.assertTrue(
             n_features == self.features_small.shape[1],
-            f"n_features {n_features} does not match projection feature size {self.features_small.shape[1]}",
+            f"n_features {n_features} does not match projection feature size "
+            f"{self.features_small.shape[1]}",
         )
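The padding behaviour asserted in the tests above can be reproduced in a few lines. A sketch with made-up data, assuming only the `OrthogonalRegression(use_orthogonal_projector=...)` API used in this file:

# Illustrative only: without the orthogonal projector, features are padded to
# a common dimension, so predicting small targets from large features keeps
# the larger feature size.
import numpy as np

from skmatter.linear_model import OrthogonalRegression

rng = np.random.RandomState(0)
X_small = rng.uniform(size=(10, 2))
X_large = np.hstack([X_small, rng.uniform(size=(10, 3))])

reg = OrthogonalRegression(use_orthogonal_projector=False)
reg.fit(X_large, X_small)
print(reg.predict(X_large).shape)  # second dimension matches X_large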
@@ -145,7 +144,8 @@ def test_ridge_regression_2fold_relative_alpha_type_raise_error(self):
     def test_ridge_regression_2fold_cv_small_to_small(
         self, name, alpha_type, regularization_method
     ):
-        # tests if RidgeRegression2FoldCV can predict small features using small features with use_orthogonal_projector False
+        # tests if RidgeRegression2FoldCV can predict small features using small
+        # features with use_orthogonal_projector False
         err = np.linalg.norm(
             self.features_small
             - RidgeRegression2FoldCV(
@@ -162,7 +162,8 @@
         )

     @parameterized.expand(ridge_parameters)
     def test_ridge_regression_2fold_cv_small_to_large(
-        # tests if RidgeRegression2FoldCV can predict large features using small features with use_orthogonal_projector False
+        # tests if RidgeRegression2FoldCV can predict large features using small
+        # features with use_orthogonal_projector False
         self,
         name,
         alpha_type,
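For reference, a hedged usage sketch matching the parameterized ridge tests above. The `alpha_type` and `regularization_method` keywords follow the calls visible in this file, while the `alphas` grid and the data are assumptions:

# Illustrative only: 2-fold cross-validated ridge over an assumed alpha grid.
import numpy as np

from skmatter.linear_model import RidgeRegression2FoldCV

rng = np.random.RandomState(0)
X = rng.uniform(size=(20, 5))
Y = rng.uniform(size=(20, 3))

ridge = RidgeRegression2FoldCV(
    alphas=np.geomspace(1e-8, 1e-1, 8),  # assumed keyword and grid
    alpha_type="absolute",
    regularization_method="tikhonov",
)
ridge.fit(X, Y)
print(ridge.predict(X).shape)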
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index d6944f68c8..1577e45972 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -2,10 +2,7 @@
 import numpy as np
 from sklearn.datasets import load_iris
-from sklearn.utils import (
-    check_random_state,
-    extmath,
-)
+from sklearn.utils import check_random_state, extmath

 from skmatter.metrics import (
     global_reconstruction_distortion,
@@ -38,90 +35,108 @@ def test_global_reconstruction_error_identity(self):
         gfre_val = global_reconstruction_error(self.features_large, self.features_large)
         self.assertTrue(
             abs(gfre_val) < self.eps,
-            f"global_reconstruction_error {gfre_val} surpasses threshold for zero {self.eps}",
+            f"global_reconstruction_error {gfre_val} surpasses threshold for zero "
+            f"{self.eps}",
         )

     def test_global_reconstruction_error_small_to_large(self):
-        # tests that the GRE of a small set of features onto a larger set of features returns within a threshold of zero
+        # tests that the GRE of a small set of features onto a larger set of features
+        # returns within a threshold of zero
         gfre_val = global_reconstruction_error(self.features_small, self.features_large)
         self.assertTrue(
             abs(gfre_val) < self.eps,
-            f"global_reconstruction_error {gfre_val} surpasses threshold for zero {self.eps}",
+            f"global_reconstruction_error {gfre_val} surpasses threshold for zero "
+            f"{self.eps}",
         )

     def test_global_reconstruction_error_large_to_small(self):
-        # tests that the GRE of a large set of features onto a smaller set of features returns within a threshold of zero
+        # tests that the GRE of a large set of features onto a smaller set of features
+        # returns within a threshold of zero
         gfre_val = global_reconstruction_error(self.features_large, self.features_small)
         self.assertTrue(
             abs(gfre_val) < self.eps,
-            f"global_reconstruction_error {gfre_val} surpasses threshold for zero {self.eps}",
+            f"global_reconstruction_error {gfre_val} surpasses threshold for zero "
+            f"{self.eps}",
         )

     def test_global_reconstruction_distortion_identity(self):
-        # tests that the GRD of a set of features onto itself returns within a threshold of zero
+        # tests that the GRD of a set of features onto itself returns within a threshold
+        # of zero
         gfrd_val = global_reconstruction_distortion(
             self.features_large, self.features_large
         )
         self.assertTrue(
             abs(gfrd_val) < self.eps,
-            f"global_reconstruction_error {gfrd_val} surpasses threshold for zero {self.eps}",
+            f"global_reconstruction_error {gfrd_val} surpasses threshold for zero "
+            f"{self.eps}",
         )

     def test_global_reconstruction_distortion_small_to_large(self):
-        # tests that the GRD of a small set of features onto a larger set of features returns within a threshold of zero
+        # tests that the GRD of a small set of features onto a larger set of features
+        # returns within a threshold of zero
         # should just run
         global_reconstruction_error(self.features_small, self.features_large)

     def test_global_reconstruction_distortion_large_to_small(self):
-        # tests that the GRD of a large set of features onto a smaller set of features returns within a threshold of zero
+        # tests that the GRD of a large set of features onto a smaller set of features
+        # returns within a threshold of zero
         # should just run
         global_reconstruction_error(self.features_large, self.features_small)

     def test_global_reconstruction_distortion_small_to_rotated_small(self):
-        # tests that the GRD of a small set of features onto a rotation of itself returns within a threshold of zero
+        # tests that the GRD of a small set of features onto a rotation of itself
+        # returns within a threshold of zero
         gfrd_val = global_reconstruction_distortion(
             self.features_small, self.features_rotated_small
         )
         self.assertTrue(
             abs(gfrd_val) < self.eps,
-            f"global_reconstruction_error {gfrd_val} surpasses threshold for zero {self.eps}",
+            f"global_reconstruction_error {gfrd_val} surpasses threshold for zero "
+            f"{self.eps}",
         )
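The GRE/GRD properties exercised above (near-zero error onto a rotation of the features, near-zero distortion for an orthogonal map) can be checked on synthetic data. An illustrative sketch with made-up values:

# Illustrative only: an orthogonal rotation of a feature matrix is linearly
# reconstructable (GRE ~ 0) and introduces no distortion (GRD ~ 0).
import numpy as np

from skmatter.metrics import (
    global_reconstruction_distortion,
    global_reconstruction_error,
)

rng = np.random.RandomState(0)
X = rng.uniform(size=(30, 4))
Q, _ = np.linalg.qr(rng.normal(size=(4, 4)))  # random orthonormal matrix
X_rot = X @ Q

print(global_reconstruction_error(X, X_rot))       # close to zero
print(global_reconstruction_distortion(X, X_rot))  # close to zero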
     def test_local_reconstruction_error_identity(self):
-        # tests that the local reconstruction error of a set of features onto itself returns within a threshold of zero
+        # tests that the local reconstruction error of a set of features onto itself
+        # returns within a threshold of zero
         lfre_val = local_reconstruction_error(
             self.features_large, self.features_large, self.n_local_points
         )
         self.assertTrue(
             abs(lfre_val) < self.eps,
-            f"local_reconstruction_error {lfre_val} surpasses threshold for zero {self.eps}",
+            f"local_reconstruction_error {lfre_val} surpasses threshold for zero"
+            f" {self.eps}",
         )

     def test_local_reconstruction_error_small_to_large(self):
-        # tests that the local reconstruction error of a small set of features onto a larger set of features returns within a threshold of zero
+        # tests that the local reconstruction error of a small set of features onto a
+        # larger set of features returns within a threshold of zero
         lfre_val = local_reconstruction_error(
             self.features_small, self.features_large, self.n_local_points
         )
         self.assertTrue(
             abs(lfre_val) < self.eps,
-            f"local_reconstruction_error {lfre_val} surpasses threshold for zero {self.eps}",
+            f"local_reconstruction_error {lfre_val} surpasses threshold for zero "
+            f"{self.eps}",
         )

     def test_local_reconstruction_error_large_to_small(self):
-        # tests that the local reconstruction error of a larger set of features onto a smaller set of features returns within a threshold of zero
+        # tests that the local reconstruction error of a larger set of features onto a
+        # smaller set of features returns within a threshold of zero
         lfre_val = local_reconstruction_error(
             self.features_large, self.features_small, self.n_local_points
         )
         self.assertTrue(
             abs(lfre_val) < self.eps,
-            f"local_reconstruction_error {lfre_val} surpasses threshold for zero {self.eps}",
+            f"local_reconstruction_error {lfre_val} surpasses threshold for zero "
+            f"{self.eps}",
         )

     def test_local_reconstruction_error_train_idx(self):
-        # tests that the local reconstruction error works when specifying a manual train idx
+        # tests that the local reconstruction error works when specifying a manual
+        # train idx
         lfre_val = pointwise_local_reconstruction_error(
             self.features_large,
@@ -132,11 +147,13 @@
         test_size = len(self.features_large) - (len(self.features_large) // 4)
         self.assertTrue(
             len(lfre_val) == test_size,
-            f"size of pointwise LFRE {len(lfre_val)} differs from expected test set size {test_size}",
+            f"size of pointwise LFRE {len(lfre_val)} differs from expected test set "
+            f"size {test_size}",
         )

     def test_local_reconstruction_error_test_idx(self):
-        # tests that the local reconstruction error works when specifying a manual train idx
+        # tests that the local reconstruction error works when specifying a manual
+        # train idx
         lfre_val = pointwise_local_reconstruction_error(
             self.features_large,
@@ -147,7 +164,8 @@
         test_size = len(self.features_large) // 4
         self.assertTrue(
             len(lfre_val) == test_size,
-            f"size of pointwise LFRE {len(lfre_val)} differs from expected test set size {test_size}",
+            f"size of pointwise LFRE {len(lfre_val)} differs from expected test set "
+            f"size {test_size}",
         )
diff --git a/tests/test_model_selection.py b/tests/test_model_selection.py
index 921cb19583..92c20530cf 100644
--- a/tests/test_model_selection.py
+++ b/tests/test_model_selection.py
@@ -1,8 +1,9 @@
 import unittest

+import sklearn.model_selection
 from sklearn.datasets import load_iris
+
 import skmatter.model_selection
-import sklearn.model_selection

 class SplitTests(unittest.TestCase):
@@ -23,7 +24,8 @@ def test_train_test_splits(self):
             self.assertTrue((sklearn_outputs[i] == skmatter_outputs[i]).all())

     def test_train_test_splits_train_test_overlap(self):
-        # tests that a test/train split which necessitates overlap returns the right number of points in each set
+        # tests that a test/train split which necessitates overlap returns the right
+        # number of points in each set
         X_train, X_test = skmatter.model_selection.train_test_split(
             self.X,
             train_size=0.8,
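A sketch of the overlap behaviour tested above. The `train_test_overlap` keyword is inferred from the test name rather than visible in the hunk, so treat it as an assumption; data and sizes are made up:

# Illustrative only: with overlap allowed, train and test sizes may sum to
# more than the full dataset.
import numpy as np

import skmatter.model_selection

X = np.arange(20).reshape(10, 2)
X_train, X_test = skmatter.model_selection.train_test_split(
    X,
    train_size=0.8,
    test_size=0.5,
    train_test_overlap=True,  # assumed keyword, per the test name
    random_state=0,
)
print(len(X_train), len(X_test))  # 8 and 5 samples drawn from only 10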
diff --git a/tests/test_orthogonalizers.py b/tests/test_orthogonalizers.py
index 1854de97ef..5c856e098b 100644
--- a/tests/test_orthogonalizers.py
+++ b/tests/test_orthogonalizers.py
@@ -10,6 +10,7 @@
     Y_sample_orthogonalizer,
 )

+
 EPSILON = 1e-8
@@ -132,8 +133,9 @@ def test_multicolumn(self):
         )
         self.assertEqual(
             str(cm.exception),
-            "You can only orthogonalize a matrix using a vector with the same number of rows."
-            f"Matrix X has {self.n_samples} rows, whereas the orthogonalizing matrix has {self.n_samples+4} rows.",
+            "You can only orthogonalize a matrix using a vector with the same number "
+            f"of rows. Matrix X has {self.n_samples} rows, whereas the "
+            f"orthogonalizing matrix has {self.n_samples+4} rows.",
         )

     def test_warning(self):
diff --git a/tests/test_pcovr.py b/tests/test_pcovr.py
index e48482bc83..667f1f0f92 100644
--- a/tests/test_pcovr.py
+++ b/tests/test_pcovr.py
@@ -5,9 +5,9 @@
 from sklearn import exceptions
 from sklearn.datasets import load_diabetes as get_dataset
 from sklearn.decomposition import PCA
-from sklearn.preprocessing import StandardScaler
-from sklearn.linear_model import Ridge
 from sklearn.kernel_ridge import KernelRidge
+from sklearn.linear_model import Ridge
+from sklearn.preprocessing import StandardScaler
 from sklearn.utils.validation import check_X_y

 from skmatter.decomposition import PCovR
@@ -398,7 +398,8 @@ def test_centering(self):
             pcovr.fit(X, self.Y)
         self.assertEquals(
             str(w[0].message),
-            "This class does not automatically center data, and your data mean is greater than the supplied tolerance.",
+            "This class does not automatically center data, and your data mean is"
+            " greater than the supplied tolerance.",
         )

     def test_T_shape(self):
diff --git a/tests/test_pcovr_distances.py b/tests/test_pcovr_distances.py
index 3eb9a22c72..2966fdd061 100644
--- a/tests/test_pcovr_distances.py
+++ b/tests/test_pcovr_distances.py
@@ -4,10 +4,7 @@
 import scipy
 from sklearn.datasets import load_diabetes as get_dataset

-from skmatter.utils import (
-    pcovr_covariance,
-    pcovr_kernel,
-)
+from skmatter.utils import pcovr_covariance, pcovr_kernel

 class CovarianceTest(unittest.TestCase):
diff --git a/tests/test_sample_pcov_cur.py b/tests/test_sample_pcov_cur.py
index e15e0ed20d..cb0f3d5d0d 100644
--- a/tests/test_sample_pcov_cur.py
+++ b/tests/test_sample_pcov_cur.py
@@ -5,6 +5,7 @@
 from skmatter.sample_selection import PCovCUR

+
 EPSILON = 1e-6
diff --git a/tests/test_sample_simple_cur.py b/tests/test_sample_simple_cur.py
index de19dbcc9b..9e82c18c37 100644
--- a/tests/test_sample_simple_cur.py
+++ b/tests/test_sample_simple_cur.py
@@ -2,8 +2,8 @@
 import numpy as np
 from sklearn import exceptions
-
 from sklearn.datasets import fetch_california_housing as load
+
 from skmatter.sample_selection import CUR
diff --git a/tests/test_sparse_kernel_centerer.py b/tests/test_sparse_kernel_centerer.py
index 3ec3142308..619e8e3870 100644
--- a/tests/test_sparse_kernel_centerer.py
+++ b/tests/test_sparse_kernel_centerer.py
@@ -12,7 +12,8 @@ def __init__(self, *args, **kwargs):
         self.random_state = np.random.RandomState(0)

     def test_sample_weights(self):
-        """Checks that sample weights of one are equal to the unweighted case and that the nonuniform weights are different from the unweighted case"""
+        """Checks that sample weights of one are equal to the unweighted case and that
+        the nonuniform weights are different from the unweighted case"""
         X = self.random_state.uniform(-1, 1, size=(4, 5))
         X_sparse = self.random_state.uniform(-1, 1, size=(3, 5))
@@ -38,7 +39,8 @@ def test_sample_weights(self):
         )

     def test_invalid_sample_weights(self):
-        """Checks that weights must be 1D array with the same length as the number of samples"""
+        """Checks that weights must be 1D array with the same length as the number of
+        samples"""
         X = self.random_state.uniform(-1, 1, size=(4, 5))
         X_sparse = self.random_state.uniform(-1, 1, size=(3, 5))
@@ -68,7 +70,8 @@ def test_Square_Kmm(self):
         self.assertEqual(str(cm.exception), "The active kernel is not square.")

     def test_LatterDim(self):
-        """Checks that a matrix must have the same latter dimension as its active counterpart cannot be normalized."""
+        """Checks that a matrix must have the same latter dimension as its active
+        counterpart cannot be normalized."""
         X = self.random_state.uniform(-1, 1, size=(4, 5))
         X_sparse = self.random_state.uniform(-1, 1, size=(3, 5))
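The sample-weight contract checked in the preprocessing tests above and below boils down to two points: weights must be a 1D array with one entry per sample, and all-ones weights reproduce the unweighted result. A minimal sketch with KernelNormalizer and made-up data, assuming the `fit(K, sample_weight=...)` call used in these tests:

# Illustrative only: uniform weights behave exactly like no weights.
import numpy as np

from skmatter.preprocessing import KernelNormalizer

rng = np.random.RandomState(0)
K = rng.uniform(0, 100, size=(3, 3))

unweighted = KernelNormalizer().fit(K).transform(K)
weighted = KernelNormalizer().fit(K, sample_weight=np.ones(len(K))).transform(K)
print(np.allclose(unweighted, weighted))  # True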
diff --git a/tests/test_standard_flexible_scaler.py b/tests/test_standard_flexible_scaler.py
index c37b572f88..5e5108a470 100644
--- a/tests/test_standard_flexible_scaler.py
+++ b/tests/test_standard_flexible_scaler.py
@@ -13,7 +13,9 @@ def __init__(self, *args, **kwargs):
         self.random_state = np.random.RandomState(0)

     def test_sample_weights(self):
-        """Checks that sample weights of one are equal to the unweighted case and that the nonuniform weights are different from the unweighted case"""
+        """Checks that sample weights of one are equal to the unweighted case.
+
+        Also, that the nonuniform weights are different from the unweighted case"""
         X = self.random_state.uniform(0, 100, size=(3, 3))
         equal_wts = np.ones(len(X))
         nonequal_wts = self.random_state.uniform(0, 100, size=(len(X),))
@@ -30,7 +32,8 @@ def test_sample_weights(self):
         )

     def test_invalid_sample_weights(self):
-        """Checks that weights must be 1D array with the same length as the number of samples"""
+        """Checks that weights must be 1D array with the same length as the number of
+        samples"""
         X = self.random_state.uniform(0, 100, size=(3, 3))
         wts_len = np.ones(len(X) + 1)
         wts_dim = np.ones((len(X), 2))
diff --git a/tests/test_voronoi_fps.py b/tests/test_voronoi_fps.py
index 60e5a9f512..41fb63ddc1 100644
--- a/tests/test_voronoi_fps.py
+++ b/tests/test_voronoi_fps.py
@@ -2,13 +2,10 @@
 import numpy as np
 from sklearn.exceptions import NotFittedError
-
-from skmatter.sample_selection import (
-    FPS,
-    VoronoiFPS,
-)
 from test_sample_simple_fps import TestFPS

+from skmatter.sample_selection import FPS, VoronoiFPS
+

 class TestVoronoiFPS(TestFPS):
     def setUp(self):
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000000..8b338535e6
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,65 @@
+[tox]
+envlist =
+    tests
+    lint
+
+lint_folders = {toxinidir}/src {toxinidir}/tests
+
+
+[testenv:tests]
+usedevelop = true
+changedir = tests
+deps = -rtests/requirements.txt
+
+commands =
+    coverage run -m unittest discover -p "*.py"
+    coverage xml
+
+[testenv:examples]
+usedevelop = true
+changedir = examples
+allowlist_externals = bash
+
+deps = -r examples/requirements.txt
+
+commands =
+    bash -ec "\
+        SKMATTER_EXAMPLES=$PWD; \
+        for file in $(find . -name '*.ipynb' -not -path '*/\\.*'); do \
+            filename=$(basename $file); \
+            jupyter nbconvert \
+                --to notebook \
+                --output "$SKMATTER_EXAMPLES/../.tox/examples/$filename" \
+                --execute "$file"; \
+        done"
+
+[testenv:lint]
+skip_install = true
+deps =
+    flake8
+    black
+    isort
+commands =
+    flake8 {[tox]lint_folders}
+    black --check --diff {[tox]lint_folders}
+    isort --check-only --diff {[tox]lint_folders}
+
+[testenv:format]
+# Abuse tox to do actual formatting. Users can call `tox -e format` to run
+# formatting on all files
+skip_install = true
+deps =
+    black
+    isort
+commands =
+    black {[tox]lint_folders}
+    isort {[tox]lint_folders}
+
+[testenv:docs]
+usedevelop = true
+deps =
+    -r docs/requirements.txt
+commands = sphinx-build {posargs:-E} -W -b html docs/src docs/build/html
+
+[flake8]
+max_line_length = 88