diff --git a/.azure-ci/docker_scripts.sh b/.azure-ci/docker_scripts.sh index 73caa1524..8807deeea 100755 --- a/.azure-ci/docker_scripts.sh +++ b/.azure-ci/docker_scripts.sh @@ -7,7 +7,7 @@ echo "Start manylinux2010 docker build" PYTHON_PATH=$(eval find "/opt/python/*${python_ver}*" -print) export PATH="${PYTHON_PATH}/bin:${PATH}" pip config set global.progress_bar off -pip install --upgrade pip==19.3.1 setuptools +pip install --upgrade pip setuptools # Install CMake pip install cmake @@ -25,7 +25,7 @@ tar -zxvf /boost_1_69_0.tar.gz mkdir boost cd /boost_1_69_0 ./bootstrap.sh --prefix=/boost -./b2 install -j3 || echo "Parts of boost failed to build. Continuing.." +./b2 install -j3 || echo "Parts of boost failed to build. Continuing..." cd .. ccache -s @@ -36,6 +36,7 @@ export Boost_INCLUDE_DIR=/boost/include # Install dev environment cd /io +pip install wheel pip install -e ".[dev]" # Test dev install with pytest @@ -46,10 +47,10 @@ pip uninstall -y giotto-tda pip uninstall -y giotto-tda-nightly # Build wheels -pip install wheel==0.34.1 auditwheel==3.1.0 python setup.py bdist_wheel # Repair wheels with auditwheel +pip install auditwheel auditwheel repair dist/*whl -w dist/ # remove wheels that are not manylinux2010 rm -rf dist/*-linux*.whl diff --git a/.gitignore b/.gitignore index ea2d9f47f..4fd0a1c40 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ # Compiled python modules. *.pyc +*.pyo +*.pyd # Setuptools distribution folder. /dist/ @@ -26,9 +28,6 @@ doc/gallery doc/notebooks/*.ipynb doc/notebooks/*.py -# Pybind11 -gtda/externals/pybind11 - # Output files *.out diff --git a/.gitmodules b/.gitmodules index 7c18a5d56..2bb0377bf 100644 --- a/.gitmodules +++ b/.gitmodules @@ -8,3 +8,9 @@ [submodule "gtda/externals/hera"] path = gtda/externals/hera url = https://github.com/grey-narn/hera +[submodule "gtda/externals/eigen"] + path = gtda/externals/eigen + url = https://gitlab.com/libeigen/eigen +[submodule "gtda/externals/pybind11"] + path = gtda/externals/pybind11 + url = https://github.com/pybind/pybind11 diff --git a/CMakeLists.txt b/CMakeLists.txt index eccc4b2b0..da0fa1d68 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,12 +12,13 @@ find_package(OpenMP) set(RIPSER_SRC_DIR "gtda/externals/ripser") set(GUDHI_SRC_DIR "gtda/externals/gudhi-devel/src") set(HERA_DIR "gtda/externals/hera") +set(EIGEN_DIR "gtda/externals/eigen") ####################################################################### # Ripser # ####################################################################### -pybind11_add_module(gtda_ripser "${BINDINGS_DIR}/ripser_bindings.cpp") +pybind11_add_module(gtda_ripser MODULE "${BINDINGS_DIR}/ripser_bindings.cpp") set_property(TARGET gtda_ripser PROPERTY CXX_STANDARD 14) if(OpenMP_FOUND) @@ -39,7 +40,7 @@ endif() # Ripser - Coefficient enable # ####################################################################### -pybind11_add_module(gtda_ripser_coeff "${BINDINGS_DIR}/ripser_bindings.cpp") +pybind11_add_module(gtda_ripser_coeff MODULE "${BINDINGS_DIR}/ripser_bindings.cpp") set_property(TARGET gtda_ripser_coeff PROPERTY CXX_STANDARD 14) if(OpenMP_FOUND) @@ -61,7 +62,7 @@ endif() # Wasserstein # ####################################################################### -pybind11_add_module(gtda_wasserstein ${BINDINGS_DIR}/wasserstein_bindings.cpp) +pybind11_add_module(gtda_wasserstein MODULE ${BINDINGS_DIR}/wasserstein_bindings.cpp) set_property(TARGET gtda_wasserstein PROPERTY CXX_STANDARD 14) target_link_libraries(gtda_wasserstein LINK_PUBLIC ${Boost_LIBRARIES}) @@ 
-82,7 +83,7 @@ endif() # Bottleneck # ####################################################################### -pybind11_add_module(gtda_bottleneck "${BINDINGS_DIR}/bottleneck_bindings.cpp") +pybind11_add_module(gtda_bottleneck MODULE "${BINDINGS_DIR}/bottleneck_bindings.cpp") set_property(TARGET gtda_bottleneck PROPERTY CXX_STANDARD 14) target_link_libraries(gtda_bottleneck LINK_PUBLIC ${Boost_LIBRARIES}) @@ -103,7 +104,7 @@ endif() # Cubical Complex # ####################################################################### -pybind11_add_module(gtda_cubical_complex "${BINDINGS_DIR}/cubical_complex_bindings.cpp") +pybind11_add_module(gtda_cubical_complex MODULE "${BINDINGS_DIR}/cubical_complex_bindings.cpp") set_property(TARGET gtda_cubical_complex PROPERTY CXX_STANDARD 14) if(OpenMP_FOUND) @@ -128,7 +129,7 @@ endif() # Persistent Cohomology # ####################################################################### -pybind11_add_module(gtda_persistent_cohomology "${BINDINGS_DIR}/persistent_cohomology_bindings.cpp") +pybind11_add_module(gtda_persistent_cohomology MODULE "${BINDINGS_DIR}/persistent_cohomology_bindings.cpp") set_property(TARGET gtda_persistent_cohomology PROPERTY CXX_STANDARD 14) if(OpenMP_FOUND) @@ -155,7 +156,7 @@ endif() # Simplex Tree # ####################################################################### -pybind11_add_module(gtda_simplex_tree "${BINDINGS_DIR}/simplex_tree_bindings.cpp") +pybind11_add_module(gtda_simplex_tree MODULE "${BINDINGS_DIR}/simplex_tree_bindings.cpp") set_property(TARGET gtda_simplex_tree PROPERTY CXX_STANDARD 14) if(OpenMP_FOUND) @@ -171,6 +172,8 @@ target_include_directories(gtda_simplex_tree PRIVATE "${GUDHI_SRC_DIR}/Cech_comp target_include_directories(gtda_simplex_tree PRIVATE "${GUDHI_SRC_DIR}/Persistent_cohomology/include") target_include_directories(gtda_simplex_tree PRIVATE "${GUDHI_SRC_DIR}/Subsampling/include") target_include_directories(gtda_simplex_tree PRIVATE "${GUDHI_SRC_DIR}/python/include") +target_include_directories(gtda_simplex_tree PRIVATE "${GUDHI_SRC_DIR}/Collapse/include") +target_include_directories(gtda_simplex_tree PRIVATE "${EIGEN_DIR}") if(MSVC) target_compile_options(gtda_simplex_tree PUBLIC $<$: /O2 /Wall /fp:strict>) @@ -184,7 +187,7 @@ endif() # Periodic Cubical Complex # ####################################################################### -pybind11_add_module(gtda_periodic_cubical_complex "${BINDINGS_DIR}/periodic_cubical_complex_bindings.cpp") +pybind11_add_module(gtda_periodic_cubical_complex MODULE "${BINDINGS_DIR}/periodic_cubical_complex_bindings.cpp") set_property(TARGET gtda_periodic_cubical_complex PROPERTY CXX_STANDARD 14) if(OpenMP_FOUND) @@ -211,7 +214,7 @@ endif() # Witness Complex # ####################################################################### -pybind11_add_module(gtda_witness_complex "${BINDINGS_DIR}/witness_complex_bindings.cpp") +pybind11_add_module(gtda_witness_complex MODULE "${BINDINGS_DIR}/witness_complex_bindings.cpp") set_property(TARGET gtda_witness_complex PROPERTY CXX_STANDARD 14) if(OpenMP_FOUND) @@ -227,6 +230,8 @@ target_include_directories(gtda_witness_complex PRIVATE "${GUDHI_SRC_DIR}/Cech_c target_include_directories(gtda_witness_complex PRIVATE "${GUDHI_SRC_DIR}/Persistent_cohomology/include") target_include_directories(gtda_witness_complex PRIVATE "${GUDHI_SRC_DIR}/python/include") target_include_directories(gtda_witness_complex PRIVATE "${GUDHI_SRC_DIR}/common/include") +target_include_directories(gtda_witness_complex PRIVATE "${GUDHI_SRC_DIR}/Collapse/include") 
+target_include_directories(gtda_witness_complex PRIVATE "${EIGEN_DIR}") if(MSVC) target_compile_options(gtda_witness_complex PUBLIC $<$: /O2 /Wall /fp:strict>) @@ -240,7 +245,7 @@ endif() # Strong Witness Complex # ####################################################################### -pybind11_add_module(gtda_strong_witness_complex "${BINDINGS_DIR}/strong_witness_complex_bindings.cpp") +pybind11_add_module(gtda_strong_witness_complex MODULE "${BINDINGS_DIR}/strong_witness_complex_bindings.cpp") set_property(TARGET gtda_strong_witness_complex PROPERTY CXX_STANDARD 14) if(OpenMP_FOUND) @@ -256,6 +261,8 @@ target_include_directories(gtda_strong_witness_complex PRIVATE "${GUDHI_SRC_DIR} target_include_directories(gtda_strong_witness_complex PRIVATE "${GUDHI_SRC_DIR}/Persistent_cohomology/include") target_include_directories(gtda_strong_witness_complex PRIVATE "${GUDHI_SRC_DIR}/python/include") target_include_directories(gtda_strong_witness_complex PRIVATE "${GUDHI_SRC_DIR}/common/include") +target_include_directories(gtda_strong_witness_complex PRIVATE "${GUDHI_SRC_DIR}/Collapse/include") +target_include_directories(gtda_strong_witness_complex PRIVATE "${EIGEN_DIR}") if(MSVC) target_compile_options(gtda_strong_witness_complex PUBLIC $<$: /O2 /Wall /fp:strict>) @@ -269,7 +276,7 @@ endif() # RipsComplex # ####################################################################### -pybind11_add_module(gtda_sparse_rips_complex "${BINDINGS_DIR}/rips_complex_bindings.cpp") +pybind11_add_module(gtda_sparse_rips_complex MODULE "${BINDINGS_DIR}/rips_complex_bindings.cpp") set_property(TARGET gtda_sparse_rips_complex PROPERTY CXX_STANDARD 14) if(OpenMP_FOUND) @@ -286,6 +293,8 @@ target_include_directories(gtda_sparse_rips_complex PRIVATE "${GUDHI_SRC_DIR}/Pe target_include_directories(gtda_sparse_rips_complex PRIVATE "${GUDHI_SRC_DIR}/Rips_complex/include") target_include_directories(gtda_sparse_rips_complex PRIVATE "${GUDHI_SRC_DIR}/Subsampling/include") target_include_directories(gtda_sparse_rips_complex PRIVATE "${GUDHI_SRC_DIR}/python/include") +target_include_directories(gtda_sparse_rips_complex PRIVATE "${GUDHI_SRC_DIR}/Collapse/include") +target_include_directories(gtda_sparse_rips_complex PRIVATE "${EIGEN_DIR}") if(MSVC) target_compile_options(gtda_sparse_rips_complex PUBLIC $<$: /O2 /Wall /fp:strict>) @@ -299,7 +308,7 @@ endif() # Cech Complex # ####################################################################### -pybind11_add_module(gtda_cech_complex "${BINDINGS_DIR}/cech_complex_bindings.cpp") +pybind11_add_module(gtda_cech_complex MODULE "${BINDINGS_DIR}/cech_complex_bindings.cpp") set_property(TARGET gtda_cech_complex PROPERTY CXX_STANDARD 14) if(OpenMP_FOUND) @@ -314,6 +323,8 @@ target_include_directories(gtda_cech_complex PRIVATE "${GUDHI_SRC_DIR}/Cech_comp target_include_directories(gtda_cech_complex PRIVATE "${GUDHI_SRC_DIR}/Persistent_cohomology/include") target_include_directories(gtda_cech_complex PRIVATE "${GUDHI_SRC_DIR}/python/include") target_include_directories(gtda_cech_complex PRIVATE "${GUDHI_SRC_DIR}/common/include") +target_include_directories(gtda_cech_complex PRIVATE "${GUDHI_SRC_DIR}/Collapse/include") +target_include_directories(gtda_cech_complex PRIVATE "${EIGEN_DIR}") if(MSVC) target_compile_options(gtda_cech_complex PUBLIC $<$: /O2 /Wall /fp:strict>) @@ -323,3 +334,28 @@ else() target_compile_options(gtda_cech_complex PUBLIC $<$:-O2 -ggdb -D_GLIBCXX_DEBUG>) endif() +####################################################################### +# Collapser # 
+####################################################################### + +pybind11_add_module(gtda_collapser MODULE "${BINDINGS_DIR}/collapser_bindings.cpp") +set_property(TARGET gtda_collapser PROPERTY CXX_STANDARD 14) + +if(OpenMP_FOUND) + target_link_libraries(gtda_collapser PRIVATE OpenMP::OpenMP_CXX) +endif() + +target_link_libraries(gtda_collapser LINK_PUBLIC ${Boost_LIBRARIES}) +target_compile_definitions(gtda_collapser PRIVATE BOOST_RESULT_OF_USE_DECLTYPE=1 BOOST_ALL_NO_LIB=1 BOOST_SYSTEM_NO_DEPRECATED=1) + +target_include_directories(gtda_collapser PRIVATE "${GUDHI_SRC_DIR}/common/include") +target_include_directories(gtda_collapser PRIVATE "${GUDHI_SRC_DIR}/Collapse/include") +target_include_directories(gtda_collapser PRIVATE "${EIGEN_DIR}") + +if(MSVC) + target_compile_options(gtda_collapser PUBLIC $<$: /O2 /Wall /fp:strict>) + target_compile_options(gtda_collapser PUBLIC $<$:/O1 /DEBUG:FULL /Zi /Zo>) +else() + target_compile_options(gtda_collapser PUBLIC $<$: -Ofast -shared -pthread -fPIC -fwrapv -Wall -fno-strict-aliasing -frounding-math>) + target_compile_options(gtda_collapser PUBLIC $<$:-O2 -ggdb -D_GLIBCXX_DEBUG>) +endif() diff --git a/CODE_AUTHORS.rst b/CODE_AUTHORS.rst index d8440eda8..3d5752318 100644 --- a/CODE_AUTHORS.rst +++ b/CODE_AUTHORS.rst @@ -2,9 +2,9 @@ The following is the list of code authors of the ``giotto-tda`` python package. Where component authors are known, add them here. -| Guillaume Tauzin, guillaume.tauzin@epfl.ch -| Umberto Lupo, u.lupo@l2f.ch -| Lewis Tunstall, l.tunstall@l2f.ch +| Guillaume Tauzin, gtauzin@protonmail.com +| Umberto Lupo, umberto.lupo@epfl.ch +| Lewis Tunstall, lewis.c.tunstall@gmail.com | Matteo Caorsi, m.caorsi@l2f.ch | Philippe Nguyen, p.nguyen@l2f.ch | Julian Burella Pérez, julian.burellaperez@heig-vd.ch @@ -13,3 +13,4 @@ Where component authors are known, add them here. | Anibal Medina-Mardones, anibal.medinamardones@epfl.ch | Wojciech Reise, reisewojciech@gmail.com | Roman Yurchak, roman.yurchak@symerio.com +| Nick Sale, nicholas.j.sale@gmail.com diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 5c0654a6d..8e1897dea 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -3,6 +3,7 @@ Contributing guidelines This document only redirects to more `detailed instructions `_, which consist of: + - a pull request checklist, - a Contributor License Agreement, -- contributing guidelines and standards, including coding style guides. \ No newline at end of file +- contributing guidelines and standards, including coding style guides. diff --git a/GOVERNANCE.rst b/GOVERNANCE.rst index e36194a25..07e793eda 100644 --- a/GOVERNANCE.rst +++ b/GOVERNANCE.rst @@ -13,8 +13,8 @@ Authors: Giotto-tda Project Team: ------------------------ -- Umberto Lupo u.lupo@l2f.ch (Maintainer) -- Lewis Tunstall l.tunstall@l2f.ch (Maintainer) +- Umberto Lupo umberto.lupo@epfl.ch (Maintainer) +- Lewis Tunstall lewis.c.tunstall@gmail.com (Maintainer) - Matteo Caorsi m.caorsi@l2f.ch (Project Leader) Former Project Team Members: diff --git a/README.rst b/README.rst index 9a36600e8..82163fcb1 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -.. image:: doc/images/tda_logo.svg +.. image:: https://raw.githubusercontent.com/giotto-ai/giotto-tda/master/doc/images/tda_logo.svg :width: 850 |Version|_ |Azure-build|_ |Azure-cov|_ |Azure-test|_ |Twitter-follow|_ |Slack-join|_ @@ -49,11 +49,6 @@ Documentation Please visit `https://giotto-ai.github.io/gtda-docs `_ and navigate to the version you are interested in. 
-Use cases -========= - -For a wide selection of use cases and application domains, you can visit `this page `_. - Installation ============ @@ -63,12 +58,13 @@ Dependencies The latest stable version of ``giotto-tda`` requires: - Python (>= 3.6) -- NumPy (>= 1.17.0) -- SciPy (>= 0.17.0) -- joblib (>= 0.13) -- scikit-learn (>= 0.22.0) -- python-igraph (>= 0.7.1.post6) -- plotly (>= 4.4.1) +- NumPy (>= 1.19.1) +- SciPy (>= 1.5.0) +- joblib (>= 0.16.0) +- scikit-learn (>= 0.23.1) +- pyflagser (>= 0.4.1) +- python-igraph (>= 0.8.2) +- plotly (>= 4.8.2) - ipywidgets (>= 7.5.1) To run the examples, jupyter is required. @@ -78,7 +74,7 @@ User installation The simplest way to install ``giotto-tda`` is using ``pip`` :: - pip install -U giotto-tda + python -m pip install -U giotto-tda If necessary, this will also automatically install all the above dependencies. Note: we recommend upgrading ``pip`` to a recent version as the above may fail on very old versions. @@ -86,7 +82,7 @@ upgrading ``pip`` to a recent version as the above may fail on very old versions Pre-release, experimental builds containing recently added features, and/or bug fixes can be installed by running :: - pip install -U giotto-tda-nightly + python -m pip install -U giotto-tda-nightly The main difference between ``giotto-tda-nightly`` and the developer installation (see the section on contributing, below) is that the former is shipped with pre-compiled wheels (similarly to the stable diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e6afc9795..23d4d730c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,5 +1,8 @@ # These jobs are triggered automatically and they test code, examples, and wheels. # Additional checks can be manually triggered +variables: + nightlyRelease: $[and(eq(variables['nightly_check'], true), eq(variables['Build.SourceBranch'], 'refs/heads/master'), ne(variables['Build.Reason'], 'PullRequest'))] + trigger: - master @@ -38,15 +41,16 @@ jobs: - task: Cache@2 inputs: - key: '"ccache-wheels-v2020.05.12" | $(Agent.OS) | "$(python.version)"' + key: '"ccache-wheels-v2020.10.05" | $(Agent.OS) | "$(python.version)"' path: $(CCACHE_DIR) displayName: ccache - bash: | set -e sed -i "s/'giotto-tda'/'giotto-tda-nightly'/1" setup.py + sed -i 's/"giotto-tda"/"giotto-tda-nightly"/1' setup.py sed -i "s/__version__.*/__version__ = '$(Build.BuildNumber)'/1" gtda/_version.py - condition: eq(variables['nightly_check'], 'true') + condition: eq(variables.nightlyRelease, true) displayName: 'Change name to giotto-tda-nightly' - task: Bash@3 @@ -60,12 +64,12 @@ jobs: - script: | set -e python -m pip install --upgrade pip - pip install dist/*manylinux2010*.whl + python -m pip install dist/*manylinux2010*.whl displayName: 'Install the wheels' - script: | set -e - pip install pytest pytest-cov pytest-azurepipelines pytest-benchmark hypothesis + python -m pip install pandas pytest pytest-cov pytest-azurepipelines pytest-benchmark hypothesis mkdir tmp_test_cov cd tmp_test_cov pytest --pyargs gtda --ignore-glob='*externals*' --no-cov --no-coverage-upload @@ -73,8 +77,8 @@ jobs: - script: | set -e - pip install pandas openml matplotlib - pip install "papermill==1.2.1" + python -m pip install openml matplotlib + python -m pip install papermill cd examples for n in *.ipynb do @@ -97,9 +101,9 @@ jobs: - bash: | set -e - pip install twine + python -m pip install twine twine upload -u giotto-learn -p $(pypi_psw) --skip-existing dist/*manylinux2010*.whl - condition: eq(variables['nightly_check'], 'true') + condition: 
eq(variables.nightlyRelease, true) displayName: 'Upload nightly wheels to PyPI' @@ -125,20 +129,22 @@ jobs: - bash: | set -e sed -i.bak "s/'giotto-tda'/'giotto-tda-nightly'/1" setup.py + sed -i.bak 's/"giotto-tda"/"giotto-tda-nightly"/1' setup.py rm setup.py.bak sed -i.bak "s/__version__.*/__version__ = '$(Build.BuildNumber)'/1" gtda/_version.py rm gtda/_version.py.bak - condition: eq(variables['nightly_check'], 'true') + condition: eq(variables.nightlyRelease, true) displayName: 'Change name to giotto-tda-nightly' - task: Cache@2 inputs: - key: '"ccache-v2020.05.12" | $(Agent.OS) | "$(python.version)"' + key: '"ccache-v2020.10.05" | $(Agent.OS) | "$(python.version)"' path: $(CCACHE_DIR) displayName: ccache - script: | set -e + export HOMEBREW_NO_INSTALL_CLEANUP=1 brew update brew install boost ccache displayName: 'Install system dependencies' @@ -146,10 +152,11 @@ jobs: - script: | set -e python -m pip install --upgrade pip setuptools + python -m pip install wheel source .azure-ci/setup_ccache.sh python -m pip install -e ".[dev]" ccache -s - displayName: 'Install dependencies and dev environement' + displayName: 'Install dependencies and dev environment' - script: | set -e @@ -169,11 +176,10 @@ jobs: - script: | set -e - pip install wheel python setup.py bdist_wheel displayName: 'Build the wheels' - - script: pip install dist/*.whl + - script: python -m pip install dist/*.whl displayName: 'Install the wheels' - script: | @@ -186,12 +192,13 @@ jobs: - script: | set -e python -m pip install -e ".[examples]" - pip install "papermill==1.2.1" + python -m pip install papermill cd examples for n in *.ipynb do papermill --start_timeout 2000 $n - done + condition: eq(variables['notebooks_check'], 'true') displayName: 'Test jupyter notebooks with papermill' - task: CopyFiles@2 @@ -202,7 +209,7 @@ jobs: - script: | set -e - pip install twine + python -m pip install twine twine check dist/* displayName: 'Check distribution with twine' @@ -215,7 +222,7 @@ jobs: - bash: | set -e twine upload -u giotto-learn -p $(pypi_psw) --skip-existing dist/* - condition: eq(variables['nightly_check'], 'true') + condition: eq(variables.nightlyRelease, true) displayName: 'Upload nightly wheels to PyPI' @@ -242,8 +249,9 @@ jobs: - bash: | set -e sed -i "s/'giotto-tda'/'giotto-tda-nightly'/1" setup.py + sed -i 's/"giotto-tda"/"giotto-tda-nightly"/1' setup.py sed -i "s/__version__.*/__version__ = '$(Build.BuildNumber)'/1" gtda/_version.py - condition: eq(variables['nightly_check'], 'true') + condition: eq(variables.nightlyRelease, true) displayName: 'Change name to giotto-tda-nightly' # Set BOOST_ROOT_PIPELINE to the version used in the pipeline @@ -254,6 +262,7 @@ jobs: - script: | python -m pip install --upgrade pip setuptools + python -m pip install wheel python -m pip install -e ".[dev]" displayName: 'Install dev environment' @@ -262,18 +271,17 @@ jobs: displayName: 'Test dev install with pytest' - script: | - pip uninstall -y giotto-tda - pip uninstall -y giotto-tda-nightly + python -m pip uninstall -y giotto-tda + python -m pip uninstall -y giotto-tda-nightly displayName: 'Uninstall giotto-tda/giotto-tda-nightly dev' - bash: | set -e sed -i $'s/\r$//' README.rst - pip install wheel python setup.py bdist_wheel displayName: 'Build the wheels' - - bash: pip install dist/*.whl + - bash: python -m pip install dist/*.whl displayName: 'Install the wheels' - script: | @@ -284,7 +292,7 @@ jobs: - script: | python -m pip install -e ".[examples]" - pip install "papermill==1.2.1" + python -m pip install papermill cd examples 
FOR %%n in (*.ipynb) DO (papermill --start_timeout 2000 %%n - || exit /b) condition: eq(variables['notebooks_check'], 'true') @@ -304,7 +312,7 @@ jobs: - bash: | set -e - pip install twine + python -m pip install twine twine upload -u giotto-learn -p $(pypi_psw) --skip-existing dist/* - condition: eq(variables['nightly_check'], 'true') + condition: eq(variables.nightlyRelease, true) displayName: 'Upload nightly wheels to PyPI' diff --git a/cmake/HelperBoost.cmake b/cmake/HelperBoost.cmake index d3f19b12e..feaeff07f 100644 --- a/cmake/HelperBoost.cmake +++ b/cmake/HelperBoost.cmake @@ -1,6 +1,5 @@ - # Add to BOOST_ROOT variable a custom path to -# ease installation of giotto-tda on Windows platform +# ease installation of giotto-tda on Windows platforms # The custom path will be at `C:\\local\` if(WIN32) list(APPEND BOOST_ROOT "C:/local") diff --git a/doc/Makefile b/doc/Makefile index ce472f77e..db64aa20d 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -6,11 +6,11 @@ ########## Config ifndef GITDIR -GITDIR = ../../gtda-docs +GITDIR = ../../../Projects/gtda-docs endif ifndef VERSION -VERSION = stable +VERSION = latest endif ifndef RUNNOTEBOOKS @@ -41,40 +41,50 @@ check_installs: fi;\ done; -togit: +copy-to-docs-location: @mkdir -p "$(CURRENTDOCFOLDER)" - @cp -a build/html/ "$(CURRENTDOCFOLDER)" + @cp -r build/html/* "$(CURRENTDOCFOLDER)" -github: - @make update-versions-list +test-notebooks: + @make run-notebooks + @make html-docs + +html-docs: @export VERSION=$(VERSION); make html - @make togit theory-gl: @pandoc theory/glossary.tex -f latex -t rst --toc -s --bibliography=theory/bibliography.bib -o theory/glossary.rst -convert-notebooks: - @jupyter nbconvert notebooks/*.ipynb --execute --to rst - clean-gh: @make clean @rm -f theory/glossary.rst @make gh-remove +update-versions: + @export VERSION=$(VERSION); python3 update_versions.py $(GITDIR) + update-versions-list: @cd $(GITDIR); find . 
-type d -depth 1 > versions @mv $(GITDIR)/versions versions - @python3 update_versions.py $(GITDIR) + @make update-versions + +run-notebooks: + @cp ../examples/*.ipynb notebooks/ + @cp -r ../examples/data notebooks/ + @cp -r ../examples/images notebooks/ + @jupyter nbconvert notebooks/*.ipynb --execute --to rst --ExecutePreprocessor.timeout=1200 all-gh: @make clean-gh @make theory-gl @if ($(RUNNOTEBOOKS)); then\ - cp ../examples/*.ipynb notebooks/;\ - cp ../examples/*.py notebooks/;\ - make convert-notebooks;\ + make run-notebooks;\ + fi + @if ($(UPDATE_VERSIONS)); then\ + make update-versions-list;\ fi - @make github + @make html-docs + @make copy-to-docs-location gh-remove: @cd $(GITDIR); rm -rfv $(VERSION) && mkdir $(VERSION) diff --git a/doc/conf.py b/doc/conf.py index c8eccd985..ad6bed88a 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -12,7 +12,7 @@ # import os import sys -import sphinx_rtd_theme +import warnings from gtda import __version__ @@ -40,7 +40,7 @@ 'sphinx.ext.viewcode', 'sphinx.ext.doctest', 'sphinx.ext.intersphinx', - 'sphinx.ext.imgconverter', + #'sphinx.ext.imgconverter', 'sphinx_issues', 'sphinx_rtd_theme', 'sphinx.ext.napoleon' @@ -118,9 +118,13 @@ # List versions current_version = os.environ['VERSION'] html_theme_options.update({'current_version': current_version}) -with open('versions', 'r') as f: - _versions = [c[2:] for c in f.read().splitlines()] -_versions = list(filter(lambda c: not(c.startswith('.')), _versions)) +try: + with open('versions', 'r') as f: + _versions = [c[2:] for c in f.read().splitlines()] + _versions = list(filter(lambda c: not(c.startswith('.')), _versions)) +except FileNotFoundError: + warnings.warn("Versions not found. Test mode.") + _versions = ['test', current_version] html_theme_options.update({ 'versions': [ (c, f'../{c}/index.html') @@ -130,6 +134,7 @@ # Get logo html_logo = "images/tda_logo.svg" +html_favicon = 'images/tda_favicon.svg' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, @@ -143,3 +148,9 @@ """.format( versionnum=release, ) + +supported_image_types = [ + 'image/svg+xml', + 'image/gif', + 'image/jpeg' +] diff --git a/doc/contributing/index.rst b/doc/contributing/index.rst index 58ff4f748..ed3cde8af 100644 --- a/doc/contributing/index.rst +++ b/doc/contributing/index.rst @@ -104,7 +104,7 @@ changes. To install ``flake8`` just do .. code-block:: python - pip install flake8 + python -m pip install flake8 You can use ``flake8`` on your python code via the following instructions: @@ -131,7 +131,7 @@ There are two ways to run unit tests for ``giotto-tda``. .. 
code-block:: python - pip install pytest + python -m pip install pytest You can use ``pytest`` on your python code via the following instructions: diff --git a/doc/images/tda_favicon.svg b/doc/images/tda_favicon.svg new file mode 100644 index 000000000..7033da8f8 --- /dev/null +++ b/doc/images/tda_favicon.svg @@ -0,0 +1,107 @@ + + + + + + image/svg+xml + + giotto-tdaAsset 11 + + + + + + + + + + + giotto-tdaAsset 11 + + + + + + + + + + diff --git a/doc/installation.rst b/doc/installation.rst index dfcc3fb37..3422f1286 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -8,27 +8,28 @@ Installation Dependencies ************ -The latest stable version of giotto-tda requires: +The latest stable version of ``giotto-tda`` requires: - Python (>= 3.6) -- NumPy (>= 1.17.0) -- SciPy (>= 0.17.0) -- joblib (>= 0.13) -- scikit-learn (>= 0.22.0) -- python-igraph (>= 0.7.1.post6) -- plotly (>= 4.4.1) +- NumPy (>= 1.19.1) +- SciPy (>= 1.5.0) +- joblib (>= 0.16.0) +- scikit-learn (>= 0.23.1) +- pyflagser (>= 0.4.1) +- python-igraph (>= 0.8.2) +- plotly (>= 4.8.2) - ipywidgets (>= 7.5.1) -To run the examples, jupyter is required. +To run the examples, ``jupyter`` is required. ***************** User installation ***************** -The simplest way to install giotto-tda is using ``pip`` :: +The simplest way to install ``giotto-tda`` is using ``pip`` :: - pip install -U giotto-tda + python -m pip install -U giotto-tda If necessary, this will also automatically install all the above dependencies. Note: we recommend upgrading ``pip`` to a recent version as the above may fail on very old versions. @@ -36,12 +37,12 @@ upgrading ``pip`` to a recent version as the above may fail on very old versions Pre-release, experimental builds containing recently added features, and/or bug fixes can be installed by running :: - pip install -U giotto-tda-nightly + python -m pip install -U giotto-tda-nightly -The main difference between giotto-tda-nightly and the developer installation (see the section +The main difference between ``giotto-tda-nightly`` and the developer installation (see the section on contributing, below) is that the former is shipped with pre-compiled wheels (similarly to the stable release) and hence does not require any C++ dependencies. As the main library module is called ``gtda`` in -both the stable and nightly versions, giotto-tda and giotto-tda-nightly should not be installed in +both the stable and nightly versions, ``giotto-tda`` and ``giotto-tda-nightly`` should not be installed in the same environment. ********************** @@ -50,7 +51,7 @@ Developer installation .. _dev_installation: -Installing both the PyPI release and source of giotto-tda in the same environment is not recommended since it is +Installing both the PyPI release and source of ``giotto-tda`` in the same environment is not recommended since it is known to cause conflicts with the C++ bindings. The developer installation requires three important C++ dependencies: @@ -96,36 +97,35 @@ is as follows: Boost ----- -Some users are experiencing some issue when installation `boost` on Windows. To help them resolve this issue, we customized a little bit the detection of `boost` library. -To install boost on windows, we (maintainers of giotto-tda) recommend 3 options: +Some users have been experiencing issues when installing Boost on Windows. To help them resolve them, we customized a little bit the detection of Boost. 
+To install Boost on Windows, we recommend 3 options: - Pre-built binaries, - Directly from source, -- Use an already installed boost version that fulfills `giotto-tda` requirements. +- Use an already installed Boost version that fulfills ``giotto-tda`` requirements. Pre-built binaries ------------------ -Boost propose for windows pre-built binaries to ease the installation of boost -in your system. In the -`website `_, you'll have access to all versions of boost. At the time of writing -this documentation, the most recent version of boost is `1.72.0`. If you go -into the folder, you'll find different executables - choose the version -corresponding to your system (32, 64 bits). In our case, we downloaded `boost_1_72_0-msvc-14.2-64.exe`. -Follow the installation instructions, and when prompted to specify the folder to install boost, go for `C:\\local\\`. +For Windows, Boost propose pre-built binaries to ease the installation in your system. In the +`website `_, you'll have access to all versions of Boost. +At the time of writing this documentation, the most recent version of Boost is `1.72.0`. If you go into the folder, +you'll find different executables – choose the version corresponding to your system (32, 64 bits). In our case, we +downloaded `boost_1_72_0-msvc-14.2-64.exe`. Follow the installation instructions, and when prompted to specify the +folder to install Boost, go for `C:\\local\\`. Source code ----------- -Boost proposes to `download `_ directly the source code of boost. +Boost proposes to `download `_ directly the Boost source code. You can choose from different sources (compressed in `.7z` or `.zip`). Download one and uncompress it in `C:\\local\\`, so you should have something like `C:\\local\\boost_x_y_z\\`. -Already installed boost version +Already installed Boost version ------------------------------- -If by some obscure reason, you have boost installed in your system but the installation procedure cannot find it (can happen, no control on cmake ...). -You can help the installation script by adding the path to your installation in the following place `gtda\\cmake\\HelperBoost.cmake`. +If, for some obscure reason, you have Boost installed in your system but the installation procedure cannot find it (can happen, no control on cmake ...). +You can help the installation script by adding the path to your installation in the following place: `gtda\\cmake\\HelperBoost.cmake`. In `HelperBoost.cmake` file, line 7, you can add your path between the quotation marks, e.g.:: list(APPEND BOOST_ROOT "C:\\"). @@ -133,17 +133,17 @@ In `HelperBoost.cmake` file, line 7, you can add your path between the quotation Troubleshooting --------------- -If you need to understand where the compiler tries to look for ``boost`` headers, +If you need to understand where the compiler tries to look for Boost headers, you can install ``giotto-tda`` with:: - pip install -e . -v + python -m pip install -e . 
-v Then you can look at the output for lines starting with:: Boost_INCLUDE_DIR: Boost_INCLUDE_DIRS: -Also, if you have installed different versions of ``boost`` in the process of trying to instal ``giotto-tda``, +Also, if you have installed different versions of Boost in the process of trying to install ``giotto-tda``, make sure to clear CMake cache entries:: rm -rf build/ @@ -175,6 +175,3 @@ After installation, you can launch the test suite from outside the source directory:: pytest gtda - - - diff --git a/doc/ipynb_to_py.py b/doc/ipynb_to_py.py index 803918d95..e53edd9f5 100644 --- a/doc/ipynb_to_py.py +++ b/doc/ipynb_to_py.py @@ -3,7 +3,7 @@ Usage: python ipynb_to_gallery.py Dependencies: -pypandoc: install using `pip install pypandoc` +pypandoc: install using `python -m pip install pypandoc` """ import pypandoc as pdoc import json diff --git a/doc/library.rst b/doc/library.rst index 19787b844..2737bd8ae 100644 --- a/doc/library.rst +++ b/doc/library.rst @@ -114,5 +114,5 @@ What's new .. include:: release.rst - :start-after: Release 0.2.2 - :end-before: Release 0.2.1 + :start-after: Release 0.3.0 + :end-before: Release 0.2.2 diff --git a/doc/modules/curves.rst b/doc/modules/curves.rst new file mode 100644 index 000000000..d2097e08f --- /dev/null +++ b/doc/modules/curves.rst @@ -0,0 +1,26 @@ +:mod:`gtda.curves`: Curves +========================== + +.. automodule:: gtda.curves + :no-members: + :no-inherited-members: + +Preprocessing +------------- +.. currentmodule:: gtda + +.. autosummary:: + :toctree: generated/curves/preprocessing/ + :template: class.rst + + curves.Derivative + +Feature extraction +------------------ +.. currentmodule:: gtda + +.. autosummary:: + :toctree: generated/curves + :template: class.rst + + curves.StandardFeatures diff --git a/doc/modules/diagrams.rst b/doc/modules/diagrams.rst index 5742a3e8c..de8955865 100644 --- a/doc/modules/diagrams.rst +++ b/doc/modules/diagrams.rst @@ -50,4 +50,6 @@ Features :template: class.rst diagrams.Amplitude - diagrams.PersistenceEntropy \ No newline at end of file + diagrams.PersistenceEntropy + diagrams.NumberOfPoints + diagrams.ComplexPolynomial diff --git a/doc/modules/homology.rst b/doc/modules/homology.rst index 4e3aad74b..e0795b02f 100644 --- a/doc/modules/homology.rst +++ b/doc/modules/homology.rst @@ -5,6 +5,8 @@ :no-members: :no-inherited-members: +Undirected simplicial homology +------------------------------ .. currentmodule:: gtda .. autosummary:: @@ -13,5 +15,25 @@ homology.VietorisRipsPersistence homology.SparseRipsPersistence + homology.WeakAlphaPersistence homology.EuclideanCechPersistence + +Directed simplicial homology +---------------------------- +.. currentmodule:: gtda + +.. autosummary:: + :toctree: generated/homology/ + :template: class.rst + + homology.FlagserPersistence + +Cubical homology +---------------- +.. currentmodule:: gtda + +.. autosummary:: + :toctree: generated/homology/ + :template: class.rst + homology.CubicalPersistence diff --git a/doc/modules/images.rst b/doc/modules/images.rst index 7238b1ab2..b7cf82580 100644 --- a/doc/modules/images.rst +++ b/doc/modules/images.rst @@ -9,7 +9,6 @@ Preprocessing ------------- - .. autosummary:: :toctree: generated/images :template: class.rst @@ -21,7 +20,6 @@ Preprocessing Filtrations ----------- - .. 
autosummary:: :toctree: generated/images :template: class.rst @@ -31,3 +29,4 @@ Filtrations images.DilationFiltration images.ErosionFiltration images.SignedDistanceFiltration + images.DensityFiltration diff --git a/doc/modules/index.rst b/doc/modules/index.rst index 9f960ec34..899224c6b 100644 --- a/doc/modules/index.rst +++ b/doc/modules/index.rst @@ -11,6 +11,7 @@ This pages contains a list of available features in the library. mapper.rst homology.rst diagrams.rst + curves.rst point_clouds.rst time_series.rst graphs.rst @@ -18,7 +19,8 @@ This pages contains a list of available features in the library. plotting.rst base.rst pipeline.rst - validation.rst + metaestimators.rst + utils.rst .. :mod:`gtda.manifold`: Manifold learning diff --git a/doc/modules/mapper.rst b/doc/modules/mapper.rst index 406b68607..f0d7e276e 100644 --- a/doc/modules/mapper.rst +++ b/doc/modules/mapper.rst @@ -1,7 +1,7 @@ :mod:`gtda.mapper`: Mapper ========================== -.. automodule:: gtda.mapper +.. automodule:: gtda :no-members: :no-inherited-members: @@ -9,7 +9,6 @@ Filters ------- - .. currentmodule:: gtda .. autosummary:: @@ -22,7 +21,6 @@ Filters Covers ------- - .. currentmodule:: gtda .. autosummary:: @@ -34,7 +32,6 @@ Covers Clustering ---------- - .. currentmodule:: gtda .. autosummary:: @@ -43,17 +40,28 @@ Clustering mapper.FirstSimpleGap mapper.FirstHistogramGap + mapper.ParallelClustering + +Nerve (graph construction) +-------------------------- +.. currentmodule:: gtda + +.. autosummary:: + :toctree: generated/mapper/nerve/ + :template: class.rst + + mapper.Nerve + Pipeline -------- - .. currentmodule:: gtda .. autosummary:: :toctree: generated/mapper/pipeline/ :template: function.rst - mapper.pipeline.make_mapper_pipeline + mapper.make_mapper_pipeline .. autosummary:: @@ -64,15 +72,14 @@ Pipeline Visualization ------------- - .. currentmodule:: gtda .. autosummary:: :toctree: generated/mapper/visualization :template: function.rst - mapper.visualization.plot_static_mapper_graph - mapper.visualization.plot_interactive_mapper_graph + mapper.plot_static_mapper_graph + mapper.plot_interactive_mapper_graph Utilities --------- @@ -82,5 +89,5 @@ Utilities :toctree: generated/mapper/utils :template: function.rst - mapper.utils.decorators.method_to_transform - mapper.utils.pipeline.transformer_from_callable_on_rows \ No newline at end of file + mapper.method_to_transform + mapper.transformer_from_callable_on_rows \ No newline at end of file diff --git a/doc/modules/metaestimators.rst b/doc/modules/metaestimators.rst new file mode 100644 index 000000000..125d16766 --- /dev/null +++ b/doc/modules/metaestimators.rst @@ -0,0 +1,14 @@ +:mod:`gtda.metaestimators`: Meta-estimators +=========================================== + +.. automodule:: gtda.metaestimators + :no-members: + :no-inherited-members: + +.. currentmodule:: gtda + +.. autosummary:: + :toctree: generated/base/ + :template: class.rst + + metaestimators.CollectionTransformer diff --git a/doc/modules/time_series.rst b/doc/modules/time_series.rst index 9c936fa9d..b0f2aca87 100644 --- a/doc/modules/time_series.rst +++ b/doc/modules/time_series.rst @@ -26,6 +26,13 @@ Time-delay embedding :template: class.rst time_series.TakensEmbedding + time_series.SingleTakensEmbedding + +.. 
autosummary:: + :toctree: generated/time_series/embedding/ + :template: function.rst + + time_series.takens_embedding_optimal_parameters Target preparation ------------------ diff --git a/doc/modules/validation.rst b/doc/modules/utils.rst similarity index 74% rename from doc/modules/validation.rst rename to doc/modules/utils.rst index 1e20375fb..21a8a6ddf 100644 --- a/doc/modules/validation.rst +++ b/doc/modules/utils.rst @@ -1,5 +1,5 @@ -:mod:`gtda.utils`: Validation -============================= +:mod:`gtda.utils`: Utilities +============================ .. automodule:: gtda.utils :no-members: @@ -11,6 +11,7 @@ :toctree: generated/utils :template: function.rst - utils.check_diagrams + utils.check_collection utils.check_point_clouds + utils.check_diagrams utils.validate_params diff --git a/doc/notebooks/examples.rst b/doc/notebooks/examples.rst index 924921297..3da2cb993 100644 --- a/doc/notebooks/examples.rst +++ b/doc/notebooks/examples.rst @@ -9,6 +9,7 @@ This page contains examples of use of ``giotto-tda``. classifying_shapes lorenz_attractor + MNIST_classification voids_on_the_plane .. diff --git a/doc/notebooks/tutorials.rst b/doc/notebooks/tutorials.rst index 52f886203..5eea09ced 100644 --- a/doc/notebooks/tutorials.rst +++ b/doc/notebooks/tutorials.rst @@ -11,6 +11,9 @@ by Lewis Tunstall provides a friendly introduction to the philosophy of ``giotto vietoris_rips_quickstart plotting_api mapper_quickstart + time_series_classification + time_series_forecasting + persistent_homology_graphs .. include:: vietoris_rips_quickstart.rst @@ -19,7 +22,7 @@ by Lewis Tunstall provides a friendly introduction to the philosophy of ``giotto Try it on `github `__ for full interactivity, or check `the static version `__. - .. include:: mapper_quickstart.rst + .. include:: time_series_forecasting.rst :end-before: Import libraries Try it on `github `__ for full interactivity, diff --git a/doc/release.rst b/doc/release.rst index cbe237c3e..cc8f8b00b 100644 --- a/doc/release.rst +++ b/doc/release.rst @@ -1,10 +1,190 @@ - ############# Release Notes ############# .. _stable: +************* +Release 0.3.0 +************* + +Major Features and Improvements +=============================== + +This is a major release which adds substantial new functionality and introduces several improvements. + +Persistent homology of directed flag complexes via ``pyflagser`` +---------------------------------------------------------------- + +- The ``pyflagser`` package (`source `_, `docs `_) is now an official dependency of ``giotto-tda``. +- The ``FlagserPersistence`` transformer has been added to ``gtda.homology`` (`#339 `_). It wraps ``pyflagser.flagser_weighted`` to allow for computations of persistence diagrams from directed or undirected weighted graphs. A `new notebook `_ demonstrates its use. + +Edge collapsing and performance improvements for persistent homology +-------------------------------------------------------------------- + +- GUDHI C++ components have been updated to the state of GUDHI v3.3.0, yielding performance improvements in ``SparseRipsPersistence``, ``EuclideanCechPersistence`` and ``CubicalPersistence`` (`#468 `_). +- Bindings for GUDHI's `edge collapser `_ have been created and can now be used as an optional preprocessing step via the optional keyword argument ``collapse_edges`` in ``VietorisRipsPersistence`` and in ``gtda.externals.ripser`` (`#469 `_ and `#483 `_). 
When ``collapse_edges=True``, and the input data and/or number of required homology dimensions is sufficiently large, the resulting runtimes for Vietoris–Rips persistent homology are state of the art. +- The performance of the Ripser bindings has otherwise been improved by avoiding unnecessary data copies, better managing the memory, and using more efficient matrix routines (`#501 `_ and `#507 `_). + +New transformers and functionality in ``gtda.homology`` +------------------------------------------------------- + +- The ``WeakAlphaPersistence`` transformer has been added to ``gtda.homology`` (`#464 `_). Like ``VietorisRipsPersistence``, ``SparseRipsPersistence`` and ``EuclideanCechPersistence``, it computes persistent homology from point clouds, but its runtime can scale much better with size in low dimensions. +- ``VietorisRipsPersistence`` now accepts sparse input when ``metric="precomputed"`` (`#424 `_). +- ``CubicalPersistence`` now accepts lists of 2D arrays (`#503 `_). +- A ``reduced_homology`` parameter has been added to all persistent homology transformers. When ``True``, one infinite bar in the H0 barcode is removed for the user automatically. Previously, it was not possible to *keep* these bars in the simplicial homology transformers. The default is always ``True``, which implies a breaking change in the case of ``CubicalPersistence`` (`#467 `_). + +Persistence diagrams +-------------------- + +- A ``ComplexPolynomial`` feature extraction transformer has been added (`#479 `_). +- A ``NumberOfPoints`` feature extraction transformer has been added (`#496 `_). +- An option to normalize the entropy in ``PersistenceEntropy`` according to a heuristic has been added, and a ``nan_fill_value`` parameter allows to replace any NaN produced by the entropy calculation with a fixed constant (`#450 `_). +- The computations in ``HeatKernel``, ``PersistenceImage`` and in the pairwise distances and amplitudes related to them has been changed to yield the continuum limit when ``n_bins`` tends to infinity; ``sigma`` is now measured in the same units as the filtration parameter and defaults to 0.1 (`#454 `_). + +New ``curves`` subpackage +------------------------- + +A new ``curves`` subpackage has been added to preprocess, and extract features from, collections of multi-channel curves such as returned by ``BettiCurve``, ``PersistenceLandscape`` and ``Silhouette`` (`#480 `_). It contains: + +- A ``StandardFeatures`` transformer that can extract features channel-wise in a generic way. +- A ``Derivative`` transformer that computes channel-wise derivatives of any order by discrete differences (`#492 `_). + +New ``metaestimators`` subpackage +--------------------------------- + +A new ``metaestimator`` subpackage has been added with a ``CollectionTransformer`` meta-estimator which converts any transformer instance into a fit-transformer acting on collections (`#495 `_). + +Images +------ + +- A ``DensityFiltration`` for collections of binary images has been added (`#473 `_). +- ``Padder`` and ``Inverter`` have been extended to greyscale images (`#489 `_). + +Time series +----------- + +- ``TakensEmbedding`` is now a new transformer acting on collections of time series (`#460 `_). +- The former ``TakensEmbedding`` acting on a single time series has been renamed to ``SingleTakensEmbedding`` transformer, and the internal logic employed in its ``fit`` for computing optimal hyperparameters is now available via a ``takens_embedding_optimal_parameters`` convenience function (`#460 `_). 
+- The ``_slice_windows`` method of ``SlidingWindow`` has been made public and renamed into ``slice_windows`` (`#460 `_). + +Graphs +------ + +- ``GraphGeodesicDistance`` has been improved as follows (`#422 `_): + + - The new parameters ``directed``, ``unweighted`` and ``method`` have been added. + - The rules on the role of zero entries, infinity entries, and non-stored values have been made clearer. + - Masked arrays are now supported. + +- A ``mode`` parameter has been added to ``KNeighborsGraph``; as in ``scikit-learn``, it can be set to either ``"distance"`` or ``"connectivity"`` (`#478 `_). + +- List input is now accepted by all transformers in ``gtda.graphs``, and outputs are consistently either lists or 3D arrays (`#478 `_). + +- Sparse matrices returned by ``KNeighborsGraph`` and ``TransitionGraph`` now have int dtype (0-1 adjacency matrices), and are not necessarily symmetric (`#478 `_). + +Mapper +------ + +- Pullback cover set labels and partial cluster labels have been added to Mapper node hovertexts (`#445 `_). + +- The functionality of ``Nerve`` and ``make_mapper_pipeline`` has been greatly extended (`#447 `_ and `#456 `_): + + - Node and edge metadata are now accessible in output ``igraph.Graph`` objects by means of the ``VertexSeq`` and ``EdgeSeq`` attributes ``vs`` and ``es`` (respectively). Graph-level dictionaries are no longer used. + - Available node metadata can be accessed by ``graph.vs[attr_name]`` where for ``attr_name`` is one of ``"pullback_set_label"``, ``"partial_cluster_label"``, or ``"node_elements"``. + - Sizes of intersections are automatically stored as edge weights, accessible by ``graph.es["weight"]``. + - A ``"store_intersections"`` keyword argument has been added to ``Nerve`` and ``make_mapper_pipeline`` to allow to store the indices defining node intersections as edge attributes, accessible via ``graph.es["edge_elements"]``. + - A ``contract_nodes`` optional parameter has been added to both ``Nerve`` and ``make_mapper_pipeline``; nodes which are subsets of other nodes are thrown away from the graph when this parameter is set to ``True``. + - A ``graph_`` attribute is stored during ``Nerve.fit``. + +- Two of the ``Nerve`` parameters (``min_intersection`` and the new ``contract_nodes``) are now available in the widgets generated by ``plot_interactive_mapper_graph``, and the layout of these widgets has been improved (`#456 `_). + +- ``ParallelClustering`` and ``Nerve`` have been exposed in the documentation and in ``gtda.mapper``'s ``__init__`` (`#447 `_). + +Plotting +-------- + +- A ``plot_params`` kwarg is available in plotting functions and methods throughout to allow user customisability of output figures. The user must pass a dictionary with keys ``"layout"`` and/or ``"trace"`` (or ``"traces"`` in some cases) (`#441 `_). +- Several plots produced by ``plot`` class methods now have default titles (`#453 `_). +- Infinite deaths are now plotted by ``plot_diagrams`` (`#461 `_). +- Possible multiplicities of persistence pairs in persistence diagram plots are now indicated in the hovertext (`#454 `_). +- ``plot_heatmap`` now accepts boolean array input (`#444 `_). 
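As a quick illustration of a couple of the features described above, here is a minimal sketch (not part of the release notes themselves; it simply assumes the 0.3.0 API of ``VietorisRipsPersistence`` and ``NumberOfPoints`` as described in this section), combining edge collapsing with point-count feature extraction::

    import numpy as np
    from gtda.homology import VietorisRipsPersistence
    from gtda.diagrams import NumberOfPoints

    # Three synthetic point clouds, each with 100 points in 3 dimensions
    X = np.random.random((3, 100, 3))

    # collapse_edges=True enables the new edge-collapse preprocessing step
    VR = VietorisRipsPersistence(homology_dimensions=(0, 1, 2), collapse_edges=True)
    diagrams = VR.fit_transform(X)  # one padded persistence diagram per point cloud

    # NumberOfPoints extracts one feature per homology dimension
    features = NumberOfPoints().fit_transform(diagrams)  # shape (3, 3)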
+ +New tutorials and examples +-------------------------- + +The following new tutorials have been added: + +- `Topology of time series `_, which explains the theory of the Takens time-delay embedding and its use with persistent homology, demonstrates the new ``API`` of several components in ``gtda.time_series``, and shows how to construct time series *classification* pipelines in ``giotto-tda`` by partially reproducing `arXiv:1910.08245 `_. +- `Topology in time series forecasting `_, which explains how to set up time series *forecasting* pipelines in ``giotto-tda`` via ``TransformerResamplerMixin``s and the ``giotto-tda`` ``Pipeline`` class. +- `Topological feature extraction from graphs `_, which explains what the features extracted from directed or undirected graphs by ``VietorisRipsPersistence``, ``SparseRipsPersistence`` and ``FlagserPersistence`` are. +- `Classifying handwritten digits `_, which presents a fully-fledged machine learning pipeline in which cubical persistent homology is applied to the classification of handwritten images from the MNIST dataset, partially reproducing `arXiv:1910.08345 `_. + +Utils +----- + +- A ``check_collection`` input validation function has been added (`#491 `_). +- ``validate_params`` now accepts ``"in"`` and ``"of"`` keys simultaneously in the ``references`` dictionaries, with ``"in"`` used for non-list-like types and ``"of"`` otherwise (`#502 `_). + +Installation improvements +------------------------- + +- ``pybind11`` is now treated as a standard git submodule in the developer installation (`#459 `_). +- ``pandas`` is now part of the testing requirements when installing from source (`#508 `_). + +Bug Fixes +========= + +- A bug has been fixed which could lead to features with negative lifetime in persistent homology transformers when ``infinity_values`` was set too low (`#339 `_). +- By relying on ``scipy``'s ``shortest_path`` instead of ``scikit-learn``'s ``graph_shortest_path``, some errors in computing ``GraphGeodesicDistance`` (e.g. when some edges are zero) have been fixed (`#422 `_). +- A bug in the handling of COO matrices by the ``ripser`` interface has been fixed (`#465 `_). +- A bug which led to the incorrect handling of the ``homology_dimensions`` parameter in ``Filtering`` has been fixed (`#439 `_). +- An issue with the use of ``joblib.Parallel``, which led to errors when attempting to run ``HeatKernel``, ``PersistenceImage``, and the corresponding amplitudes and distances on large datasets, has been fixed (`#428 `_ and `#481 `_). +- A bug leading to plots of persistence diagrams not showing points with negative births or deaths has been fixed, as has a bug with the computation of the range to be shown in the plot (`#437 `_). +- A bug in the handling of persistence pairs with negative death values by ``Filtering`` has been fixed (`#436 `_). +- A bug in the handling of ``homology_dimension_ix`` (now renamed to ``homology_dimension_idx``) in the ``plot`` methods of ``HeatKernel`` and ``PersistenceImage`` has been fixed (`#452 `_). +- A bug in the labelling of axes in ``HeatKernel`` and ``PersistenceImage`` plots has been fixed (`#453 `_ and `#454 `_). +- ``PersistenceLandscape`` plots now show all homology dimensions, instead of just the first (`#454 `_). +- A bug in the computation of amplitudes and pairwise distances based on persistence images has been fixed (`#454 `_). +- ``Silhouette`` now does not create NaNs when a subdiagram is trivial (`#454 `_).
+- ``CubicalPersistence`` now does not create pairs with negative persistence when ``infinity_values`` is set too low (`#467 `_). +- Warnings are no longer thrown by ``KNeighborsGraph`` when ``metric="precomputed"`` (`#506 `_). +- A bug in ``Labeller.resample`` affecting cases in which ``n_steps_future >= size - 1`` has been fixed (`#460 `_). +- A bug in ``validate_params``, affecting the case of tuples of allowed types, has been fixed (`#502 `_). + +Backwards-Incompatible Changes +============================== + +- The minimum required versions from most of the dependencies have been bumped. The updated dependencies are ``numpy >= 1.19.1``, ``scipy >= 1.5.0``, ``joblib >= 0.16.0``, ``scikit-learn >= 0.23.1``, ``python-igraph >= 0.8.2``, ``plotly >= 4.8.2``, and ``pyflagser >= 0.4.1`` (`#457 `_). +- ``GraphGeodesicDistance`` now returns either lists or 3D dense ndarrays for compatibility with the homology transformers (`#422 `_). +- The output of ``PairwiseDistance`` has been transposed to match ``scikit-learn`` convention ``(n_samples_transform, n_samples_fit)`` (`#420 `_). +- ``plot`` class methods now return figures instead of showing them (`#441 `_). +- Mapper node and edge attributes are no longer stored as graph-level dictionaries, ``"node_id"`` is no longer an available node attribute, and the attributes ``nodes_`` and ``edges_`` previously stored by ``Nerve.fit`` have been removed in favour of a ``graph_`` attribute (`#447 `_). +- The ``homology_dimension_ix`` parameter available in some transformers in ``gtda.diagrams`` has been renamed to ``homology_dimension_idx`` (`#452 `_). +- The base of the logarithm used by ``PersistenceEntropy`` is now 2 instead of *e*, and NaN values are replaced with -1 instead of 0 by default (`#450 `_ and `#474 `_). +- The outputs of ``PersistenceImage``, ``HeatKernel`` and of the pairwise distances and amplitudes based on them are now different due to the improvements described above. +- Weights are no longer stored in the ``effective_metric_params_`` attribute of ``PairwiseDistance``, ``Amplitude`` and ``Scaler`` objects when the metric is persistence-image–based; only the weight function is (`#454 `_). +- The ``homology_dimensions_`` attributes of several transformers have been converted from lists to tuples. When possible, homology dimensions stored as parts of attributes are now presented as ints (`#454 `_). +- ``gaussian_filter`` (used to make heat– and persistence-image–based representations/pairwise distances/amplitudes) is now called with ``mode="constant"`` instead of ``"reflect"`` (`#454 `_). +- The default value of ``order`` in ``Amplitude`` has been changed from ``2.`` to ``None``, giving vector instead of scalar features (`#454 `_). +- The meaning of the default ``None`` for ``weight_function`` in ``PersistenceImage`` (and in ``Amplitude`` and ``PairwiseDistance`` when ``metric="persistence_image"``) has been changed from the identity function to the function returning a vector of ones (`#454 `_). +- Due to the updates in the GUDHI components, some of the bindings and Python interfaces to the GUDHI C++ components in ``gtda.externals`` have changed (`#468 `_). +- ``Labeller.transform`` now returns a 1D array instead of a column array (`#475 `_).
+- ``PersistenceLandscape`` now returns 3D arrays instead of 4D ones, for compatibility with the new ``curves`` subpackage (`#480 `_). +- By default, ``CubicalPersistence`` now removes one infinite bar in H0 (`#467 `_, and see above). +- The former ``width`` parameter in ``SlidingWindow`` and ``Labeller`` has been replaced with a more intuitive ``size`` parameter. The relation between the two is: ``size = width + 1`` (`#460 `_). +- ``clusterer`` is now a required parameter in ``ParallelClustering`` (`#508 `_). +- The ``max_fraction`` parameter in ``FirstSimpleGap`` and ``FirstHistogramGap`` now indicates the floor of ``max_fraction * n_samples``; its default value has been changed from ``None`` to ``1`` (`#412 `_). + +Thanks to our Contributors +========================== + +This release contains contributions from many people: + +Umberto Lupo, Guillaume Tauzin, Julian Burella Pérez, Wojciech Reise, Lewis Tunstall, Nick Sale, and Anibal Medina-Mardones. + +We are also grateful to all who filed issues or helped resolve them, asked and answered questions, and were part of inspiring discussions. + ************* Release 0.2.2 ************* @@ -259,7 +439,7 @@ package ``giotto-tda`` will start at v0.1.4 for project continuity. Short summary: install via :: - pip install -U giotto-tda + python -m pip install -U giotto-tda and ``import gtda`` in your scripts or notebooks! @@ -440,4 +620,4 @@ for this implementation. Release 0.1a.0 ************** -Initial release of the library, original named ``giotto-learn``. +Initial release of the library, originally named ``giotto-learn``. diff --git a/doc/theory/glossary.tex b/doc/theory/glossary.tex index 8b69e415f..fb7237d62 100644 --- a/doc/theory/glossary.tex +++ b/doc/theory/glossary.tex @@ -67,13 +67,13 @@ \end{equation*} is said to be an \textit{normed space} if the values of $||-||$ are all non-negative and for all $u,v \in V$ and $a \in \mathbb R$ \begin{equation*} - ||v|| = 0\ \Leftrightarrow\ u = 0 + ||u|| = 0\ \Leftrightarrow\ u = 0 \end{equation*} \begin{equation*} ||a u || = |a|\, ||u|| \end{equation*} \begin{equation*} - ||u+v|| = ||u|| + ||v||. + ||u + v|| \leq ||u|| + ||v||. \end{equation*} The function $||-||$ is referred to as the \textit{norm}. @@ -149,7 +149,7 @@ \begin{equation*} \langle x, y \rangle = (x_1-y_1)^2 + \cdots + (x_n-y_n)^2. \end{equation*} - This inner product is referred to as \textit{dot product} and the associated norm and distance function are respectively named \textit{euclidean norm} and \textit{euclidean distance}. + This inner product is referred to as \textit{dot product} and the associated norm and distance function are respectively named \textit{Euclidean norm} and \textit{Euclidean distance}. For any $p \in (0,\infty]$ the pair $\mathbb R^n, ||-||_p$ with \begin{equation*} @@ -168,8 +168,8 @@ metric space. A \textit{distance matrix} associated to it is obtained by choosing a total order on $X = {x_1 < \cdots < x_m}$ and setting the $(i,j)$-entry to be equal to $d(x_i, x_j)$. A \textit{point cloud} is a finite subset of $\mathbb{R}^n$ (for some $n$) together with the metric induced from the - % \hyperref[euclidean_distance_and_norm]{eucliden distance} - euclidean distance. + % \hyperref[euclidean_distance_and_norm]{Eucliden distance} + Euclidean distance. 
\subsection*{$L^p$-norms} \label{functional_lp} @@ -635,4 +635,4 @@ \bibliography{bibliography}{} \bibliographystyle{alpha} -\end{document} \ No newline at end of file +\end{document} diff --git a/doc/versions b/doc/versions new file mode 100644 index 000000000..2956d16b0 --- /dev/null +++ b/doc/versions @@ -0,0 +1,4 @@ +./0.2.0 +./0.2.1 +./0.2.2 +./latest diff --git a/examples/MNIST_classification.ipynb b/examples/MNIST_classification.ipynb new file mode 100644 index 000000000..34c130b83 --- /dev/null +++ b/examples/MNIST_classification.ipynb @@ -0,0 +1,636 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Classifying handwritten digits\n", + "\n", + "This notebook shows how ``giotto-tda`` can be used to generate topological features for image classification. We'll be using the famous MNIST dataset, which contains images of handwritten digits and is a standard benchmark for testing new classification algorithms.\n", + "\n", + "
\n", + "\n", + "

Figure 1: A few digits from the MNIST dataset. Figure reference: en.wikipedia.org/wiki/MNIST_database.

\n", + "
\n", + "\n", + "If you are looking at a static version of this notebook and would like to run its contents, head over to [GitHub](https://github.com/giotto-ai/giotto-tda/blob/master/examples/MNIST_classification.ipynb).\n", + "\n", + "\n", + "## Useful references\n", + "\n", + "* [_A Topological \"Reading\" Lesson: Classification of MNIST using TDA_](https://arxiv.org/abs/1910.08345) by Adélie Garin and Guillaume Tauzin\n", + "* [_The MNIST Database of Handwritten Digits_](http://yann.lecun.com/exdb/mnist/) by Yann LeCun, Corinna Cortes, and Christopher J.C. Burges\n", + "\n", + "**License: AGPLv3**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the MNIST dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To get started, let's fetch the MNIST dataset using one of ``scikit-learn``'s helper functions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import fetch_openml\n", + "\n", + "X, y = fetch_openml(\"mnist_784\", version=1, return_X_y=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By looking at the shapes of these arrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"X shape: {X.shape}, y shape: {y.shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "we see that there are 70,000 images, where each image has 784 features that represent pixel intensity. Let's reshape the feature vector to a 28x28 array and visualise one of the \"8\" digits using ``giotto-tda``'s plotting API:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from gtda.plotting import plot_heatmap\n", + "\n", + "im8_idx = np.flatnonzero(y == \"8\")[0]\n", + "img8 = X[im8_idx]\n", + "plot_heatmap(img8)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create train and test sets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For this example, we will work with a small subset of images – to run a full-blown analysis simply change the values of ``train_size`` and ``test_size`` below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "train_size, test_size = 60, 10\n", + "\n", + "# Reshape to (n_samples, n_pixels_x, n_pixels_y)\n", + "X = X.reshape((-1, 28, 28))\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, train_size=train_size, test_size=test_size, stratify=y, random_state=666\n", + ")\n", + "\n", + "print(f\"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}\")\n", + "print(f\"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## From pixels to topological features" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As shown in the figure below, several steps are required to extract topological features from an image. Since our images are made of pixels, it is convenient to use filtrations of [_cubical complexes_](https://giotto-ai.github.io/gtda-docs/latest/theory/glossary.html#cubical-complex) instead of simplicial ones. 
Let's go through each of these steps for a single \"8\" digit using ``giotto-tda``!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\n", + "

Figure 2: An example of a topological feature extraction pipeline. Figure reference: arXiv:1910.08345.

\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Binarize the image" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In ``giotto-tda``, filtrations of cubical complexes are built from _binary images_ consisting of only black and white pixels. We can convert our greyscale image to binary by applying a threshold on each pixel value via the ``Binarizer`` transformer:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gtda.images import Binarizer\n", + "\n", + "# Pick out index of first 8 image\n", + "im8_idx = np.flatnonzero(y_train == \"8\")[0]\n", + "# Reshape to (n_samples, n_pixels_x, n_pixels_y) format\n", + "im8 = X_train[im8_idx][None, :, :]\n", + "\n", + "binarizer = Binarizer(threshold=0.4)\n", + "im8_binarized = binarizer.fit_transform(im8)\n", + "\n", + "binarizer.plot(im8_binarized)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### From binary image to filtration" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have a binary image $\\mathcal{B}$ of our \"8\" digit, we can build a wide variety of different filtrations – see the ``giotto-tda`` [docs](https://giotto-ai.github.io/gtda-docs/latest/modules/images.html#filtrations) for a full list. For our example, we'll use the _radial filtration_ $\\mathcal{R}$, which assigns to each pixel $p$ a value corresponding to its distance from a predefined center $c$ of the image\n", + "\n", + "$$ \\mathcal{R}(p) = \\left\\{ \\begin{array}{cl} \n", + "\\lVert c - p \\rVert_2 &\\mbox{if } \\mathcal{B}(p)=1 \\\\ \n", + "\\mathcal{R}_\\infty &\\mbox{if } \\mathcal{B}(p)=0 \n", + "\\end{array} \\right. $$\n", + "\n", + "where $\\mathcal{R}_\\infty$ is the distance of the pixel that is furthest from $c$. To reproduce the filtered image from the MNIST [article](https://arxiv.org/abs/1910.08345), we'll pick $c = (20,6)$:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gtda.images import RadialFiltration\n", + "\n", + "radial_filtration = RadialFiltration(center=np.array([20, 6]))\n", + "im8_filtration = radial_filtration.fit_transform(im8_binarized)\n", + "\n", + "radial_filtration.plot(im8_filtration, colorscale=\"jet\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see from the resulting plot that we've effectively transformed our binary image into a greyscale one, where the pixel values increase as we move from the upper-right to bottom-left of the image! These pixel values can be used to define a filtration of cubical complexes $\\{K_i\\}_{i\\in \\mathrm{Im}(I)}$, where $K_i$ contains all pixels with value less than the $i$th smallest pixel value in the greyscale image. In other words, $K_i$ is the $i$th sublevel set of the image's cubical complex $K$." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### From filtration to persistence diagram" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Given a greyscale filtration it is straightforward to calculate the corresponding persistence diagram. 
In ``giotto-tda`` we make use of the ``CubicalPersistence`` transformer which is the cubical analogue to simplicial transformers like ``VietorisRipsPersistence``:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gtda.homology import CubicalPersistence\n", + "\n", + "cubical_persistence = CubicalPersistence(n_jobs=-1)\n", + "im8_cubical = cubical_persistence.fit_transform(im8_filtration)\n", + "\n", + "cubical_persistence.plot(im8_cubical)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It works! We can clearly see two persistent $H_1$ generators corresponding to the loops in the digit \"8\", along with a single $H_0$ generator corresponding to the connected components. \n", + "\n", + "As a postprocessing step, it is often convenient to rescale the persistence diagrams which can be achieved in ``giotto-tda`` as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gtda.diagrams import Scaler\n", + "\n", + "scaler = Scaler()\n", + "im8_scaled = scaler.fit_transform(im8_cubical)\n", + "\n", + "scaler.plot(im8_scaled)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### From persistence diagram to representation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The final step is to define a vectorial representation of the persistence diagram that can be used to obtain machine learning features. Following our example from Figure 2, we convolve our persistence diagram with a Gaussian kernel and symmetrize along the main diagonal, a procedure achieved via the [``HeatKernel``](https://giotto-ai.github.io/gtda-docs/latest/modules/generated/diagrams/representations/gtda.diagrams.HeatKernel.html#gtda.diagrams.HeatKernel) transformer:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gtda.diagrams import HeatKernel\n", + "\n", + "heat = HeatKernel(sigma=.15, n_bins=60, n_jobs=-1)\n", + "im8_heat = heat.fit_transform(im8_scaled)\n", + "\n", + "# Visualise the heat kernel for H1\n", + "heat.plot(im8_heat, homology_dimension_idx=1, colorscale='jet')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Combining all steps as a single pipeline\n", + "\n", + "We've now seen how each step in Figure 2 is implemented in ``giotto-tda`` – let's combine them as a single ``scikit-learn`` pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "from gtda.diagrams import Amplitude\n", + "\n", + "steps = [\n", + " (\"binarizer\", Binarizer(threshold=0.4)),\n", + " (\"filtration\", RadialFiltration(center=np.array([20, 6]))),\n", + " (\"diagram\", CubicalPersistence()),\n", + " (\"rescaling\", Scaler()),\n", + " (\"amplitude\", Amplitude(metric=\"heat\", metric_params={'sigma':0.15, 'n_bins':60}))\n", + "]\n", + "\n", + "heat_pipeline = Pipeline(steps)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "im8_pipeline = heat_pipeline.fit_transform(im8)\n", + "im8_pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the final step we've used the [``Amplitude``](https://giotto-ai.github.io/gtda-docs/latest/modules/generated/diagrams/features/gtda.diagrams.Amplitude.html) transformer to 
\"vectorize\" the persistence diagram via the heat kernel method above. In our example, this produces a vector of amplitudes $\\mathbf{a} = (a_0, a_1)$ where each amplitude $a_i$ corresponds to a given homology dimension in the persistence diagram. By extracting these feature vectors from each image, we can feed them into a machine learning classifier – let's tackle this in the next section!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Building a full-blown feature extraction pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we've seen how to extract topological features for a single image, let's make it more realistic and extract a wide variety of features over the whole training set. The resulting pipeline resembles the figure below, where different filtrations and vectorizations of persistence diagrams can be concatenated to produce informative feature vectors.\n", + "\n", + "
\n", + "\n", + "

Figure 3: A full-blown topological feature extraction pipeline

\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To keep things simple, we'll augment our radial filtration with a _height filtration_ $\\mathcal{H}$, defined by choosing a unit vector $v \\in \\mathbb{R}^2$ in some _direction_ and assigning values $\\mathcal{H}(p) = \\langle p, v \\rangle$ based on the distance of $p$ to the hyperplane defined by $v$. Following the article by Garin and Tauzin, we'll pick a uniform set of directions and centers for our filtrations as shown in the figure below.\n", + "\n", + "
\n", + "\n", + "
\n", + "\n", + "We'll also generate features from persistence diagrams by using [_persistence entropy_](https://giotto-ai.github.io/gtda-docs/latest/modules/generated/diagrams/features/gtda.diagrams.PersistenceEntropy.html) and a broad set of amplitudes. Putting it all together yields the following pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import make_pipeline, make_union\n", + "from gtda.diagrams import PersistenceEntropy\n", + "from gtda.images import HeightFiltration\n", + "\n", + "direction_list = [[1, 0], [1, 1], [0, 1], [-1, 1], [-1, 0], [-1, -1], [0, -1], [1, -1]]\n", + "\n", + "center_list = [\n", + " [13, 6],\n", + " [6, 13],\n", + " [13, 13],\n", + " [20, 13],\n", + " [13, 20],\n", + " [6, 6],\n", + " [6, 20],\n", + " [20, 6],\n", + " [20, 20],\n", + "]\n", + "\n", + "# Creating a list of all filtration transformer, we will be applying\n", + "filtration_list = (\n", + " [\n", + " HeightFiltration(direction=np.array(direction), n_jobs=-1)\n", + " for direction in direction_list\n", + " ]\n", + " + [RadialFiltration(center=np.array(center), n_jobs=-1) for center in center_list]\n", + ")\n", + "\n", + "# Creating the diagram generation pipeline\n", + "diagram_steps = [\n", + " [\n", + " Binarizer(threshold=0.4, n_jobs=-1),\n", + " filtration,\n", + " CubicalPersistence(n_jobs=-1),\n", + " Scaler(n_jobs=-1),\n", + " ]\n", + " for filtration in filtration_list\n", + "]\n", + "\n", + "# Listing all metrics we want to use to extract diagram amplitudes\n", + "metric_list = [\n", + " {\"metric\": \"bottleneck\", \"metric_params\": {}},\n", + " {\"metric\": \"wasserstein\", \"metric_params\": {\"p\": 1}},\n", + " {\"metric\": \"wasserstein\", \"metric_params\": {\"p\": 2}},\n", + " {\"metric\": \"landscape\", \"metric_params\": {\"p\": 1, \"n_layers\": 1, \"n_bins\": 100}},\n", + " {\"metric\": \"landscape\", \"metric_params\": {\"p\": 1, \"n_layers\": 2, \"n_bins\": 100}},\n", + " {\"metric\": \"landscape\", \"metric_params\": {\"p\": 2, \"n_layers\": 1, \"n_bins\": 100}},\n", + " {\"metric\": \"landscape\", \"metric_params\": {\"p\": 2, \"n_layers\": 2, \"n_bins\": 100}},\n", + " {\"metric\": \"betti\", \"metric_params\": {\"p\": 1, \"n_bins\": 100}},\n", + " {\"metric\": \"betti\", \"metric_params\": {\"p\": 2, \"n_bins\": 100}},\n", + " {\"metric\": \"heat\", \"metric_params\": {\"p\": 1, \"sigma\": 1.6, \"n_bins\": 100}},\n", + " {\"metric\": \"heat\", \"metric_params\": {\"p\": 1, \"sigma\": 3.2, \"n_bins\": 100}},\n", + " {\"metric\": \"heat\", \"metric_params\": {\"p\": 2, \"sigma\": 1.6, \"n_bins\": 100}},\n", + " {\"metric\": \"heat\", \"metric_params\": {\"p\": 2, \"sigma\": 3.2, \"n_bins\": 100}},\n", + "]\n", + "\n", + "#\n", + "feature_union = make_union(\n", + " *[PersistenceEntropy(nan_fill_value=-1)]\n", + " + [Amplitude(**metric, n_jobs=-1) for metric in metric_list]\n", + ")\n", + "\n", + "tda_union = make_union(\n", + " *[make_pipeline(*diagram_step, feature_union) for diagram_step in diagram_steps],\n", + " n_jobs=-1\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "which can be visualised using ``scikit-learn``'s nifty [HTML feature](https://scikit-learn.org/stable/modules/compose.html#visualizing-composite-estimators):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import set_config\n", + "set_config(display='diagram') \n", + "\n", + "tda_union" + ] 
+ }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's now a simple matter to run the whole pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train_tda = tda_union.fit_transform(X_train)\n", + "X_train_tda.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training a classifier" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see we have generated $(8 + 9) \\times 2 \\times 14 = 476$ topological features per image! In general, some of these features will be highly correlated and a feature selection procedure could be used to select the most informative ones. Nevertheless, let's train a Random Forest classifier on our training set to see what kind of performance we can get:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "rf = RandomForestClassifier()\n", + "rf.fit(X_train_tda, y_train)\n", + "\n", + "X_test_tda = tda_union.transform(X_test)\n", + "rf.score(X_test_tda, y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For such a small dataset, this accuracy is not too bad but accuracies above 96% can be achieved by training on the full MNIST dataset together with feature selection strategies." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using hyperparameter search with topological pipelines " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the above pipeline, we can think of our choices for the directions and centers of the filtrations as hyperparameter. To wrap up our analysis, let's see how we can run a hyperparameter search over the directions of the height filtration. 
We'll use a simplified pipeline to show the main steps, but note that a realistic application would involve running the search over a pipeline like the one in the previous section.\n", + "\n", + "As usual, we define our pipeline in terms of topological transformers and an estimator as the final step:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "height_pipeline = Pipeline([\n", + " ('binarizer', Binarizer(threshold=0.4)),\n", + " ('filtration', HeightFiltration()),\n", + " ('diagram', CubicalPersistence()),\n", + " ('feature', PersistenceEntropy(nan_fill_value=-1)),\n", + " ('classifier', RandomForestClassifier(random_state=42))\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we can search for the best combination of directions, homology dimensions, and number of trees in our Random Forest as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "direction_list = [[1, 0], [1, 1], [0, 1], [-1, 1], [-1, 0], [-1, -1], [0, -1], [1, -1]]\n", + "homology_dimensions_list = [[0], [1]]\n", + "n_estimators_list = [500, 1000, 2000]\n", + "\n", + "param_grid = {\n", + " \"filtration__direction\": [np.array(direction) for direction in direction_list],\n", + " \"diagram__homology_dimensions\": [\n", + " homology_dimensions for homology_dimensions in homology_dimensions_list\n", + " ],\n", + " \"classifier__n_estimators\": [n_estimators for n_estimators in n_estimators_list],\n", + "}\n", + "\n", + "grid_search = GridSearchCV(\n", + " estimator=height_pipeline, param_grid=param_grid, cv=3, n_jobs=-1\n", + ")\n", + "\n", + "grid_search.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By looking at the best hyperparameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "grid_search.best_params_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "we see that the direction [1, 0] with homology dimension 0 produces the best features. By comparing say a \"6\" and \"9\" digit, can you think of a reason why this might be the case?" 
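A side note on the feature-selection remark made earlier in this notebook: the sketch below is not part of the original notebook and simply illustrates, assuming the ``X_train_tda``, ``X_test_tda``, ``y_train`` and ``y_test`` arrays from the previous cells are available, how a basic selection step based on Random Forest feature importances could be slotted in before the classifier.

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Keep only the features whose importance (as estimated by a Random Forest)
# is above the median importance; "median" is an arbitrary illustrative choice.
selector = SelectFromModel(RandomForestClassifier(random_state=42), threshold="median")
X_train_selected = selector.fit_transform(X_train_tda, y_train)
X_test_selected = selector.transform(X_test_tda)

rf_selected = RandomForestClassifier(random_state=42)
rf_selected.fit(X_train_selected, y_train)
rf_selected.score(X_test_selected, y_test)
```

On such a tiny subset the score will fluctuate a lot; the point is only to show where a selection step would live, not to claim an improvement.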
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/classifying_shapes.ipynb b/examples/classifying_shapes.ipynb index 973394d32..8b38bfd2e 100644 --- a/examples/classifying_shapes.ipynb +++ b/examples/classifying_shapes.ipynb @@ -6,9 +6,9 @@ "source": [ "# Case study: Classification of shapes\n", "\n", - "This notebook explains how to use `giotto-tda` to be able to classify topologically different high-dimensional spaces.\n", + "This notebook explains how to use ``giotto-tda`` to be able to classify topologically different high-dimensional spaces.\n", "\n", - "If you are looking at a static version of this notebook and would like to run its contents, head over to [github](https://github.com/giotto-ai/giotto-tda/blob/master/examples/classifying_shapes.ipynb).\n", + "If you are looking at a static version of this notebook and would like to run its contents, head over to [GitHub](https://github.com/giotto-ai/giotto-tda/blob/master/examples/classifying_shapes.ipynb) and download the source.\n", "\n", "**License: AGPLv3**" ] @@ -18,7 +18,7 @@ "metadata": {}, "source": [ "## Import libraries\n", - "The first step consists in importing relevant `giotto-tda` components and other useful libraries or modules." + "The first step consists in importing relevant ``giotto-tda`` components and other useful libraries or modules." ] }, { @@ -59,7 +59,8 @@ "outputs": [], "source": [ "# Representing the circle in 3d with parametric equations.\n", - "circle = np.asarray([[np.sin(t),np.cos(t),0] for t in range(400)])\n", + "circle = np.asarray([[np.sin(t), np.cos(t), 0]\n", + " for t in range(400)])\n", "plot_point_cloud(circle)" ] }, @@ -70,7 +71,8 @@ "outputs": [], "source": [ "# Representing the sphere in 3d with parametric equations\n", - "sphere = np.asarray([[np.cos(s)*np.cos(t),np.cos(s)*np.sin(t),np.sin(s)] for t in range(20) for s in range(20)])\n", + "sphere = np.asarray([[np.cos(s) * np.cos(t), np.cos(s) * np.sin(t), np.sin(s)]\n", + " for t in range(20) for s in range(20)])\n", "plot_point_cloud(sphere)" ] }, @@ -81,7 +83,8 @@ "outputs": [], "source": [ "# Representing the torus in 3d with parametric equations\n", - "torus = np.asarray([[(2+np.cos(s))*np.cos(t),(2+np.cos(s))*np.sin(t),np.sin(s)] for t in range(20) for s in range(20)])\n", + "torus = np.asarray([[(2 + np.cos(s)) * np.cos(t), (2 + np.cos(s)) * np.sin(t), np.sin(s)]\n", + " for t in range(20) for s in range(20)])\n", "plot_point_cloud(torus)" ] }, @@ -105,7 +108,9 @@ "\n", "We will use the Vietoris–Rips technique to generate a filtration out of a point cloud:\n", "\n", - "![SegmentLocal](https://miro.medium.com/max/1200/1*w3BiQI1OX93KXcezctRQTQ.gif \"segment\")" + "![Vietoris–Rips filtration of a point cloud](images/vietoris_rips_point_cloud.gif)\n", + "\n", + "Furthermore, throughout this notebook we will only consider the homology dimensions 0 (connected components), 1 (loops), and 2 (voids)." 
] }, { @@ -114,14 +119,20 @@ "metadata": {}, "outputs": [], "source": [ - "# The homology ranks we choose to consider\n", - "homology_dimensions = (0, 1, 2)\n", - "VR = VietorisRipsPersistence(\n", - " metric='euclidean', max_edge_length=10, homology_dimensions=homology_dimensions)\n", + "homology_dimensions = (0, 1, 2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "VR = VietorisRipsPersistence(metric=\"euclidean\", max_edge_length=10, homology_dimensions=homology_dimensions)\n", "\n", "# Array of persistence diagrams, one per point cloud in the input\n", "diagrams = VR.fit_transform(topological_spaces)\n", - "print(f'diagrams.shape = {diagrams.shape}')" + "print(f\"diagrams.shape: {diagrams.shape}\")" ] }, { @@ -132,7 +143,7 @@ "\n", "The topological information of the point cloud is synthesised in the persistence diagram. The horizontal axis corresponds to the moment in which an homological generator is born, while the vertical axis corresponds to the moments in which a homological generator dies.\n", "\n", - "The generators of the homology groups (at given rank) are colored differently." + "The generators of the homology groups are given a different colour per rank." ] }, { @@ -171,7 +182,7 @@ "source": [ "## Conclusion of the first part\n", "\n", - "As you can see from the persistence diagrams, all the betti numbers were found. Some other persistent generators are also appearing, depending on how dense the sampling is and how much noise there is. For example, we see a rather neat persistence diagram over the Torus bottle (we see two persistent generators for $H_1$ and one persistent generator for $H_2$). Notice though that there are other persistent $H_1$ generators, possibly due to the non-uniform sampling method we used for the torus.\n", + "As you can see from the persistence diagrams, all the Betti numbers were found. Some other persistent generators are also appearing, depending on how dense the sampling is and how much noise there is. For example, we see a rather neat persistence diagram over the Torus bottle (we see two persistent generators for $H_1$ and one persistent generator for $H_2$). Notice though that there are other persistent $H_1$ generators, possibly due to the non-uniform sampling method we used for the torus.\n", "\n", "On the other hand, the persistence diagram for the circle is as perfect as it could be: one unique generator of $H_1$ and no other persistent generator, as expected." 
] @@ -199,18 +210,50 @@ "n_range = 15\n", "eps = 0.3\n", "\n", - "train_Xs = [np.asarray([[np.cos(s)*np.cos(t) + eps*(np.random.rand(1)[0]-0.5),np.cos(s)*np.sin(t) + eps*(np.random.rand(1)[0]-0.5),np.sin(s) + eps*(np.random.rand(1)[0] - 0.5)] for t in range(n_range) for s in range(n_range)]) for kk in range(n_train)]\n", + "train_Xs = np.asarray([\n", + " [\n", + " [np.cos(s) * np.cos(t) + eps * (np.random.rand(1)[0] - 0.5),\n", + " np.cos(s) * np.sin(t) + eps * (np.random.rand(1)[0] - 0.5),\n", + " np.sin(s) + eps * (np.random.rand(1)[0] - 0.5)]\n", + " for t in range(n_range) for s in range(n_range)\n", + " ]\n", + " for kk in range(n_train)\n", + "])\n", "train_ys = np.zeros(n_train)\n", - "train_Xt = [np.asarray([[(2+np.cos(s))*np.cos(t) + eps*(np.random.rand(1)[0]-0.5),(2+np.cos(s))*np.sin(t) + eps*(np.random.rand(1)[0]-0.5),np.sin(s) + eps*(np.random.rand(1)[0] - 0.5)] for t in range(n_range) for s in range(n_range)]) for kk in range(n_train)]\n", + "train_Xt = np.asarray([\n", + " [\n", + " [(2 + np.cos(s)) * np.cos(t) + eps * (np.random.rand(1)[0] - 0.5),\n", + " (2 + np.cos(s)) * np.sin(t) + eps * (np.random.rand(1)[0] - 0.5),\n", + " np.sin(s) + eps * (np.random.rand(1)[0] - 0.5)]\n", + " for t in range(n_range) for s in range(n_range)\n", + " ]\n", + " for kk in range(n_train)\n", + "])\n", "train_yt = np.ones(n_train)\n", "\n", "# Training set\n", "train_X = np.concatenate((train_Xs, train_Xt))\n", "train_y = np.concatenate((train_ys, train_yt))\n", "\n", - "test_Xs = [np.asarray([[np.cos(s)*np.cos(t) + eps*(np.random.rand(1)[0]-0.5),np.cos(s)*np.sin(t) + eps*(np.random.rand(1)[0]-0.5),np.sin(s) + eps*(np.random.rand(1)[0] - 0.5)] for t in range(n_range) for s in range(n_range)]) for kk in range(n_train)]\n", + "test_Xs = np.asarray([\n", + " [\n", + " [np.cos(s) * np.cos(t) + eps * (np.random.rand(1)[0] - 0.5),\n", + " np.cos(s) * np.sin(t) + eps * (np.random.rand(1)[0] - 0.5),\n", + " np.sin(s) + eps * (np.random.rand(1)[0] - 0.5)]\n", + " for t in range(n_range) for s in range(n_range)\n", + " ]\n", + " for kk in range(n_train)\n", + "])\n", "test_ys = np.zeros(n_train)\n", - "test_Xt = [np.asarray([[(2+np.cos(s))*np.cos(t) + eps*(np.random.rand(1)[0]-0.5),(2+np.cos(s))*np.sin(t) + eps*(np.random.rand(1)[0]-0.5),np.sin(s) + eps*(np.random.rand(1)[0] - 0.5)] for t in range(n_range) for s in range(n_range)]) for kk in range(n_train)]\n", + "test_Xt = np.asarray([\n", + " [\n", + " [(2 + np.cos(s)) * np.cos(t) + eps * (np.random.rand(1)[0] - 0.5),\n", + " (2 + np.cos(s)) * np.sin(t) + eps * (np.random.rand(1)[0] - 0.5),\n", + " np.sin(s) + eps * (np.random.rand(1)[0] - 0.5)]\n", + " for t in range(n_range) for s in range(n_range)\n", + " ]\n", + " for kk in range(n_train)\n", + "])\n", "test_yt = np.ones(n_train)\n", "\n", "\n", @@ -225,12 +268,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Build persistence diagrams\n", - "\n", - "# The homology ranks we choose to consider\n", - "homology_dimensions = (0, 1, 2)\n", - "VR = VietorisRipsPersistence(\n", - " metric='euclidean', max_edge_length=10, homology_dimensions=homology_dimensions)\n", + "VR = VietorisRipsPersistence(metric=\"euclidean\", max_edge_length=10, homology_dimensions=homology_dimensions)\n", "\n", "# List of all the time-ordered persistence diagrams obtained from the list of correlation matrices\n", "train_diagrams = VR.fit_transform(train_X)\n", @@ -272,7 +310,7 @@ "outputs": [], "source": [ "# Training logistic regression\n", - "LR = LogisticRegression(solver='lbfgs')\n", + "LR = LogisticRegression()\n", 
"LR.fit(X_train, train_y)\n", "# Score\n", "LR.score(X_test, test_y)" @@ -303,19 +341,19 @@ "\n", "# This functions prepares the grid matrix with boundary identification\n", "def make_matrix(rows, cols):\n", - " n = rows*cols\n", - " M = np.zeros((n,n))\n", + " n = rows * cols\n", + " M = np.zeros((n, n))\n", " for r in range(rows):\n", " for c in range(cols):\n", - " i = r*cols + c\n", + " i = r * cols + c\n", " # Two inner diagonals\n", - " if c > 0: M[i-1,i] = M[i,i-1] = 1 + 0.15*(np.random.rand(1)[0]-0.5)\n", + " if c > 0: M[i - 1, i] = M[i, i - 1] = 1 + 0.15 * (np.random.rand(1)[0] - 0.5)\n", " # Two outer diagonals\n", - " if r > 0: M[i-cols,i] = M[i,i-cols] = 1 + 0.15*(np.random.rand(1)[0]-0.5)\n", + " if r > 0: M[i - cols, i] = M[i,i - cols] = 1 + 0.15 * (np.random.rand(1)[0] - 0.5)\n", " # vertical twisted boundary identification\n", - " if c == 0: M[n-i-1,i] = M[i,n-i-1] = 1 + 0.15*(np.random.rand(1)[0]-0.5)\n", + " if c == 0: M[n-i-1, i] = M[i, n - i - 1] = 1 + 0.15 * (np.random.rand(1)[0] - 0.5)\n", " # horizontal twisted boundary identification\n", - " if r == 0: M[n-i-1,i] = M[i,n-i-1] = 1 + 0.15*(np.random.rand(1)[0]-0.5)\n", + " if r == 0: M[n - i - 1, i] = M[i, n - i - 1] = 1 + 0.15 * (np.random.rand(1)[0] - 0.5)\n", " \n", " return M\n", "\n", @@ -326,7 +364,7 @@ "rp2 = graph_shortest_path(M)\n", "\n", "# Plot of the distance matrix\n", - "plot_heatmap(rp2, colorscale='viridis')" + "plot_heatmap(rp2, colorscale=\"viridis\")" ] }, { @@ -335,35 +373,32 @@ "metadata": {}, "outputs": [], "source": [ - "# Compute the adjacency matrix of the grid points, with boundaries identified as in the Klein bottle\n", - "from sklearn.utils.graph_shortest_path import graph_shortest_path\n", - "\n", "# This functions prepares the grid matrix with boundary identification\n", "def make_matrix(rows, cols):\n", - " n = rows*cols\n", - " M = np.zeros((n,n))\n", + " n = rows * cols\n", + " M = np.zeros((n, n))\n", " for r in range(rows):\n", " for c in range(cols):\n", - " i = r*cols + c\n", + " i = r * cols + c\n", " # Two inner diagonals\n", - " if c > 0: M[i-1,i] = M[i,i-1] = 1 + 0.15*(np.random.rand(1)[0]-0.5)\n", + " if c > 0: M[i - 1, i] = M[i, i - 1] = 1 + 0.15 * (np.random.rand(1)[0] - 0.5)\n", " # Two outer diagonals\n", - " if r > 0: M[i-cols,i] = M[i,i-cols] = 1 + 0.15*(np.random.rand(1)[0]-0.5)\n", + " if r > 0: M[i - cols, i] = M[i, i - cols] = 1 + 0.15 * (np.random.rand(1)[0] - 0.5)\n", " # vertical boundary identification\n", - " if c == 0: M[i+cols-1,i] = M[i,i+cols-1] = 1 + 0.15*(np.random.rand(1)[0]-0.5)\n", + " if c == 0: M[i + cols - 1, i] = M[i, i + cols - 1] = 1 + 0.15 * (np.random.rand(1)[0] - 0.5)\n", " # horizontal twisted boundary identification\n", - " if r == 0: M[n-i-1,i] = M[i,n-i-1] = 1 + 0.15*(np.random.rand(1)[0]-0.5)\n", + " if r == 0: M[n - i - 1, i] = M[i, n - i - 1] = 1 + 0.15 * (np.random.rand(1)[0] - 0.5)\n", " \n", " return M\n", "\n", - "M = make_matrix(20,20)\n", + "M = make_matrix(20, 20)\n", "\n", "# computing the distance matrix of the points over the Klein bottle\n", "\n", "klein = graph_shortest_path(M)\n", "\n", "# Plot of the distance matrix\n", - "plot_heatmap(klein, colorscale='viridis')" + "plot_heatmap(klein, colorscale=\"viridis\")" ] }, { @@ -382,7 +417,7 @@ "source": [ "## Computing persistent homology\n", "\n", - "In the next section we will use `giotto-tda` to compute the persistent homology groups of the topological spaces we just constructed." 
+ "In the next section we will use ``giotto-tda`` to compute the persistent homology groups of the topological spaces we just constructed." ] }, { @@ -391,10 +426,7 @@ "metadata": {}, "outputs": [], "source": [ - "# the homology ranks we choose to consider\n", - "homology_dimensions = (0, 1, 2)\n", - "VR = VietorisRipsPersistence(\n", - " metric='precomputed', max_edge_length=np.inf, homology_dimensions=homology_dimensions)\n", + "VR = VietorisRipsPersistence(metric=\"precomputed\", max_edge_length=np.inf, homology_dimensions=homology_dimensions)\n", "\n", "# List of all the time-ordered persistence diagrams obtained from the list of correlation matrices\n", "diagrams = VR.fit_transform(topological_spaces_mat)" @@ -408,7 +440,7 @@ "\n", "The topological information of the point cloud is synthesised in the persistence diagram. The horizontal axis corresponds to the moment in which an homological generator is born, while the vertical axis corresponds to the moments in which an homological generator dies.\n", "\n", - "The generators of the homology groups (at given rank) are colored differently." + "The generators of the homology groups are given a different colour per rank." ] }, { @@ -459,7 +491,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.1" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/examples/data/generate_datasets.py b/examples/data/generate_datasets.py new file mode 100644 index 000000000..5bedcbdfc --- /dev/null +++ b/examples/data/generate_datasets.py @@ -0,0 +1,94 @@ +import numpy as np +from pathlib import Path + + +def generate_point_clouds(n_samples, n_points, eps): + sphere_point_clouds = [ + np.asarray( + [ + [ + np.cos(s) * np.cos(t) + eps * (np.random.rand(1)[0] - 0.5), + np.cos(s) * np.sin(t) + eps * (np.random.rand(1)[0] - 0.5), + np.sin(s) + eps * (np.random.rand(1)[0] - 0.5), + ] + for t in range(n_points) + for s in range(n_points) + ] + ) + for _ in range(n_samples) + ] + # label spheres with 0 + sphere_labels = np.zeros(n_samples) + + torus_point_clouds = [ + np.asarray( + [ + [ + (2 + np.cos(s)) * np.cos(t) + eps * (np.random.rand(1)[0] - 0.5), + (2 + np.cos(s)) * np.sin(t) + eps * (np.random.rand(1)[0] - 0.5), + np.sin(s) + eps * (np.random.rand(1)[0] - 0.5) + ] + for t in range(n_points) + for s in range(n_points) + ] + ) + for _ in range(n_samples) + ] + # label tori with 1 + torus_labels = np.ones(n_samples) + + point_clouds = np.concatenate((sphere_point_clouds, torus_point_clouds)) + labels = np.concatenate((sphere_labels, torus_labels)) + + return point_clouds, labels + + +def make_gravitational_waves( + path_to_data: Path, + n_signals: int = 30, + downsample_factor: int = 2, + r_min: float = 0.075, + r_max: float = 0.65, + n_snr_values: int = 10, + ): + def padrand(V, n, kr): + cut = np.random.randint(n) + rand1 = np.random.randn(cut) + rand2 = np.random.randn(n - cut) + out = np.concatenate((rand1 * kr, V, rand2 * kr)) + return out + + Rcoef = np.linspace(r_min, r_max, n_snr_values) + Npad = 500 # number of padding points on either side of the vector + gw = np.load(path_to_data / "gravitational_wave_signals.npy") + Norig = len(gw["data"][0]) + Ndat = len(gw["signal_present"]) + N = int(Norig / downsample_factor) + + ncoeff = [] + Rcoeflist = [] + + for j in range(n_signals): + ncoeff.append(10 ** (-19) * (1 / Rcoef[j % n_snr_values])) + Rcoeflist.append(Rcoef[j % n_snr_values]) + + noisy_signals = [] + gw_signals = [] + k = 0 + labels = np.zeros(n_signals) + + for j in range(n_signals): + 
signal = gw["data"][j % Ndat][range(0, Norig, downsample_factor)] + sigp = int((np.random.randn() < 0)) + noise = ncoeff[j] * np.random.randn(N) + labels[j] = sigp + if sigp == 1: + rawsig = padrand(signal + noise, Npad, ncoeff[j]) + if k == 0: + k = 1 + else: + rawsig = padrand(noise, Npad, ncoeff[j]) + noisy_signals.append(rawsig.copy()) + gw_signals.append(signal) + + return noisy_signals, gw_signals, labels diff --git a/examples/data/gravitational_wave_signals.npy b/examples/data/gravitational_wave_signals.npy new file mode 100644 index 000000000..c120c600e Binary files /dev/null and b/examples/data/gravitational_wave_signals.npy differ diff --git a/examples/datasets.py b/examples/datasets.py deleted file mode 100644 index 8ea38f80b..000000000 --- a/examples/datasets.py +++ /dev/null @@ -1,41 +0,0 @@ -import numpy as np - -def generate_point_clouds(n_samples, n_points, eps): - sphere_point_clouds = [ - np.asarray( - [ - [ - np.cos(s) * np.cos(t) + eps * (np.random.rand(1)[0] - 0.5), - np.cos(s) * np.sin(t) + eps * (np.random.rand(1)[0] - 0.5), - np.sin(s) + eps * (np.random.rand(1)[0] - 0.5), - ] - for t in range(n_points) - for s in range(n_points) - ] - ) - for kk in range(n_samples) - ] - # label spheres with 0 - sphere_labels = np.zeros(n_samples) - - torus_point_clouds = [ - np.asarray( - [ - [ - (2 + np.cos(s)) * np.cos(t) + eps * (np.random.rand(1)[0] - 0.5), - (2 + np.cos(s)) * np.sin(t) + eps * (np.random.rand(1)[0] - 0.5), - np.sin(s) + eps * (np.random.rand(1)[0] - 0.5), - ] - for t in range(n_points) - for s in range(n_points) - ] - ) - for kk in range(n_samples) - ] - # label tori with 1 - torus_labels = np.ones(n_samples) - - point_clouds = np.concatenate((sphere_point_clouds, torus_point_clouds)) - labels = np.concatenate((sphere_labels, torus_labels)) - - return point_clouds, labels diff --git a/examples/images/clique_complex_0_small.png b/examples/images/clique_complex_0_small.png new file mode 100644 index 000000000..06cfa671d Binary files /dev/null and b/examples/images/clique_complex_0_small.png differ diff --git a/examples/images/clique_complex_1_small.png b/examples/images/clique_complex_1_small.png new file mode 100644 index 000000000..0e44f242f Binary files /dev/null and b/examples/images/clique_complex_1_small.png differ diff --git a/examples/images/clique_complex_2_small.png b/examples/images/clique_complex_2_small.png new file mode 100644 index 000000000..28eee3e28 Binary files /dev/null and b/examples/images/clique_complex_2_small.png differ diff --git a/examples/images/clique_complex_3_small.png b/examples/images/clique_complex_3_small.png new file mode 100644 index 000000000..e5aa02338 Binary files /dev/null and b/examples/images/clique_complex_3_small.png differ diff --git a/examples/images/clique_complex_4_small.png b/examples/images/clique_complex_4_small.png new file mode 100644 index 000000000..d3e2bc9a3 Binary files /dev/null and b/examples/images/clique_complex_4_small.png differ diff --git a/examples/images/clique_complex_5_small.png b/examples/images/clique_complex_5_small.png new file mode 100644 index 000000000..955005458 Binary files /dev/null and b/examples/images/clique_complex_5_small.png differ diff --git a/examples/images/clique_complex_6_small.png b/examples/images/clique_complex_6_small.png new file mode 100644 index 000000000..dd51bdefd Binary files /dev/null and b/examples/images/clique_complex_6_small.png differ diff --git a/examples/images/clique_complex_7_small.png b/examples/images/clique_complex_7_small.png new file mode 100644 
index 000000000..bb72f6561 Binary files /dev/null and b/examples/images/clique_complex_7_small.png differ diff --git a/examples/images/clique_complex_8_small.png b/examples/images/clique_complex_8_small.png new file mode 100644 index 000000000..2139455f3 Binary files /dev/null and b/examples/images/clique_complex_8_small.png differ diff --git a/examples/images/clique_complex_9_small.png b/examples/images/clique_complex_9_small.png new file mode 100644 index 000000000..c7d770659 Binary files /dev/null and b/examples/images/clique_complex_9_small.png differ diff --git a/examples/images/diagram_pipeline_images.png b/examples/images/diagram_pipeline_images.png new file mode 100644 index 000000000..c5c58d47d Binary files /dev/null and b/examples/images/diagram_pipeline_images.png differ diff --git a/examples/images/directions_and_centers.png b/examples/images/directions_and_centers.png new file mode 100644 index 000000000..2ac55b1af Binary files /dev/null and b/examples/images/directions_and_centers.png differ diff --git a/examples/images/example_pipeline_images.png b/examples/images/example_pipeline_images.png new file mode 100644 index 000000000..096d0983d Binary files /dev/null and b/examples/images/example_pipeline_images.png differ diff --git a/examples/images/mnist.png b/examples/images/mnist.png new file mode 100644 index 000000000..da333402d Binary files /dev/null and b/examples/images/mnist.png differ diff --git a/examples/images/nontrivial_cycle_directed_flag_complex.svg b/examples/images/nontrivial_cycle_directed_flag_complex.svg new file mode 100644 index 000000000..86170ce5f --- /dev/null +++ b/examples/images/nontrivial_cycle_directed_flag_complex.svg @@ -0,0 +1,170 @@ + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + 1 + + + + + + 2 + + + + + + 3 + + + + diff --git a/examples/images/simplex_directed_flag_complex.svg b/examples/images/simplex_directed_flag_complex.svg new file mode 100644 index 000000000..4d1a9f1bb --- /dev/null +++ b/examples/images/simplex_directed_flag_complex.svg @@ -0,0 +1,175 @@ + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + 1 + + + + + + 2 + + + + + + 3 + + + + + diff --git a/examples/images/time_delay_embedding.gif b/examples/images/time_delay_embedding.gif new file mode 100644 index 000000000..f4d141718 Binary files /dev/null and b/examples/images/time_delay_embedding.gif differ diff --git a/examples/images/vietoris_rips_point_cloud.gif b/examples/images/vietoris_rips_point_cloud.gif new file mode 100644 index 000000000..9dac1bf2d Binary files /dev/null and b/examples/images/vietoris_rips_point_cloud.gif differ diff --git a/examples/images/weighted_graph.png b/examples/images/weighted_graph.png new file mode 100644 index 000000000..b0feb064b Binary files /dev/null and b/examples/images/weighted_graph.png differ diff --git a/examples/lorenz_attractor.ipynb b/examples/lorenz_attractor.ipynb index 15fb5ec7a..2e9b1d600 100644 --- a/examples/lorenz_attractor.ipynb +++ b/examples/lorenz_attractor.ipynb @@ -8,7 +8,7 @@ "\n", "This notebook contains a full TDA pipeline to analyse the transitions of the Lorenz system to a chaotic regime from the stable one and viceversa.\n", "\n", - "If you are looking at a static version of this notebook and would like to run its contents, head over to [github](https://github.com/giotto-ai/giotto-tda/blob/master/examples/lorenz_attractor.ipynb).\n", + "If you are looking at a static version of this notebook and would like to run its contents, head over to 
[GitHub](https://github.com/giotto-ai/giotto-tda/blob/master/examples/lorenz_attractor.ipynb) and download the source.\n", "\n", "**License: AGPLv3**" ] @@ -28,8 +28,9 @@ "outputs": [], "source": [ "# Import the gtda modules\n", - "from gtda.time_series import Resampler, TakensEmbedding, SlidingWindow, PermutationEntropy\n", - "from gtda.homology import VietorisRipsPersistence\n", + "from gtda.time_series import Resampler, SlidingWindow, takens_embedding_optimal_parameters, \\\n", + " TakensEmbedding, PermutationEntropy\n", + "from gtda.homology import WeakAlphaPersistence, VietorisRipsPersistence\n", "from gtda.diagrams import Scaler, Filtering, PersistenceEntropy, BettiCurve, PairwiseDistance\n", "from gtda.graphs import KNeighborsGraph, GraphGeodesicDistance\n", "\n", @@ -56,7 +57,6 @@ "outputs": [], "source": [ "# Plotting functions\n", - "from gtda.plotting import plot_diagram, plot_betti_surfaces\n", "from gtda.plotting import plot_point_cloud" ] }, @@ -89,8 +89,8 @@ "outputs": [], "source": [ "# Selecting the z-axis and the label rho\n", - "X = point_cloud[:,2]\n", - "y = point_cloud[:,3]" + "X = point_cloud[:, 2]\n", + "y = point_cloud[:, 3]" ] }, { @@ -144,37 +144,11 @@ "source": [ "## Takens Embedding\n", "\n", - "In order to obtain meaningful topological features from a time series, we use a delayed-time embedding technique, invented by F. Takens in the late 1960s.\n", - "The idea is simple: given a time series $X(t)$, one can extract a sequence of vectors of the form $X_i := [(X(t_i)), X(t_i + 2 \\tau), ..., X(t_i + M \\tau)]$.\n", - "The difference between $t_i$ and $t_{i-1}$ is called *stride*.\n", + "In order to obtain meaningful topological features from a time series, we use a *time-delay embedding* technique named after F. Takens who used it in the 1960s in his foundational work on dynamical systems.\n", "\n", - "$M$ and $\\tau$ are optimized automatically in this example (they can be set by the user if needed)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "embedding_dimension = 10\n", - "time_delay = 3\n", - "TE = TakensEmbedding(\n", - " parameters_type='search', dimension=embedding_dimension, time_delay=time_delay)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "TE.fit(X_sampled)\n", - "time_delay_ = TE.time_delay_\n", - "embedding_dimension_ = TE.dimension_\n", + "The idea is simple: given a time series $X(t)$, one can extract a sequence of vectors of the form $X_i := [(X(t_i)), X(t_i + 2 \\tau), ..., X(t_i + M \\tau)]$. The difference between $t_i$ and $t_{i-1}$ is called *stride*.\n", "\n", - "print('Optimal embedding time delay based on mutual information: ', time_delay_)\n", - "print('Optimal embedding dimension based on false nearest neighbors: ', embedding_dimension_)" + "$M$ and $\\tau$ are optimized automatically in this example according to known heuristics implemented in ``giotto-tda`` in the ``takens_embedding_optimal_parameters`` function. They can also be set by hand if preferred." 
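Before turning to the library helpers used in the next cells, it may help to see the delay vectors spelled out by hand. The following is a toy NumPy sketch, not part of the original notebook, with arbitrary parameter values.

```python
import numpy as np

X_toy = np.arange(10)
time_delay, dimension, stride = 2, 3, 1

# Each delay vector collects `dimension` values spaced `time_delay` steps apart;
# consecutive vectors start `stride` steps apart.
n_vectors = (len(X_toy) - time_delay * (dimension - 1) - 1) // stride + 1
X_toy_embedded = np.stack([
    X_toy[i * stride : i * stride + time_delay * (dimension - 1) + 1 : time_delay]
    for i in range(n_vectors)
])
print(X_toy_embedded)  # rows [0 2 4], [1 3 5], ..., [5 7 9]
```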
] }, { @@ -183,14 +157,24 @@ "metadata": {}, "outputs": [], "source": [ - "X_embedded, y_embedded = TE.transform_resample(X_sampled, y_sampled)" + "max_time_delay = 3\n", + "max_embedding_dimension = 10\n", + "stride = 1\n", + "optimal_time_delay, optimal_embedding_dimension = takens_embedding_optimal_parameters(\n", + " X_sampled, max_time_delay, max_embedding_dimension, stride=stride\n", + " )\n", + "\n", + "print(f\"Optimal embedding time delay based on mutual information: {optimal_time_delay}\")\n", + "print(f\"Optimal embedding dimension based on false nearest neighbors: {optimal_embedding_dimension}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can also \"localise\" our Takens embedding procedure to multiple sliding windows over the data, rather than over the whole time series as we just did. The result is therefore a \"time series of point clouds\" with possibly interesting topologies, which we will be able to feed directly to our homology transformers." + "Having computed reasonable values for the parameters by looking at the whole time series, we can now perform the embedding procedure (which transforms a single time series into a single point cloud) on local sliding windows over the data. The result of this will be a \"time series of point clouds\" with possibly interesting topologies, which we will be able to feed directly to our homology transformers.\n", + "\n", + "We first construct sliding windows using ``SlidingWindow`` transformer-resampler, and then use the ``TakensEmbedding`` transformer to perform the embedding in parallel on each window, using the parameters ``optimal_time_delay`` and ``optimal_embedding_dimension`` found above." ] }, { @@ -199,18 +183,23 @@ "metadata": {}, "outputs": [], "source": [ - "window_width = 40\n", + "window_size = 41\n", "window_stride = 5\n", - "SW = SlidingWindow(width=window_width, stride=window_stride)\n", + "SW = SlidingWindow(size=window_size, stride=window_stride)\n", + "\n", + "X_windows, y_windows = SW.fit_transform_resample(X_sampled, y_sampled)\n", "\n", - "X_windows, y_windows = SW.fit_transform_resample(X_embedded, y_embedded)" + "TE = TakensEmbedding(time_delay=optimal_time_delay, dimension=optimal_embedding_dimension, stride=stride)\n", + "X_embedded = TE.fit_transform(X_windows)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can plot the Takens embedding of a specific window either by using `plot_point_cloud`, or by using the `plot` method of `SlidingWindow`, as follows (*note*: when `embedding_dimension > 3`, only the first three coordinates are plotted!):" + "We can plot the Takens embedding of a specific window either by using ``plot_point_cloud``, or by using the ``plot`` method of ``SlidingWindow``, as shown below.\n", + "\n", + "*Note*: only the first three coordinates are plotted!" 
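A quick shape check can make the "time series of point clouds" idea above concrete. This is a minimal sketch, not part of the original notebook, assuming the ``X_windows`` and ``X_embedded`` arrays from the cell above; the exact numbers depend on the sampling period and the parameters chosen.

```python
# X_windows: one 1D window of the resampled series per row.
print(X_windows.shape)

# X_embedded: one point cloud (n_points, embedding dimension) per window.
print(X_embedded.shape)
```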
] }, { @@ -220,7 +209,7 @@ "outputs": [], "source": [ "window_number = 3\n", - "SW.plot(X_windows, sample=window_number)" + "TE.plot(X_embedded, sample=window_number)" ] }, { @@ -236,10 +225,10 @@ "metadata": {}, "outputs": [], "source": [ - "embedded_begin, embedded_end = SW._slice_windows(X_embedded)[window_number]\n", - "window_indices = np.arange(embedded_begin, embedded_end + time_delay_ * (embedding_dimension_ - 1))\n", - "fig = px.line(title=f'Resampled Lorenz solution over sliding window {window_number}')\n", - "fig.add_scatter(x=window_indices, y=X_sampled[window_indices], name='X_sampled')\n", + "embedded_begin, embedded_end = SW.slice_windows(X_windows)[window_number]\n", + "window_indices = np.arange(embedded_begin, embedded_end + optimal_time_delay * (optimal_embedding_dimension - 1))\n", + "fig = px.line(title=f\"Resampled Lorenz solution over sliding window {window_number}\")\n", + "fig.add_scatter(x=window_indices, y=X_sampled[window_indices], name=\"X_sampled\")\n", "fig.show()" ] }, @@ -259,17 +248,16 @@ "outputs": [], "source": [ "homology_dimensions = (0, 1, 2)\n", - "VR = VietorisRipsPersistence(\n", - " metric='euclidean', max_edge_length=100, homology_dimensions=homology_dimensions)\n", + "WA = WeakAlphaPersistence(homology_dimensions=homology_dimensions)\n", "\n", - "X_diagrams = VR.fit_transform(X_windows)" + "X_diagrams = WA.fit_transform(X_embedded)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can plot the persistence diagram for the embedding of the same sliding window as before. One way is using the `plot_diagram` function." + "We can plot the persistence diagram for the embedding of the same sliding window as before:" ] }, { @@ -278,17 +266,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_diagram(X_diagrams[window_number])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Alternatively, we could have used the `plot` method of `VietorisRipsPersistence` as follows:\n", - "```\n", - "VR.plot(X_diagrams, sample=window_number)\n", - "```" + "WA.plot(X_diagrams, sample=window_number)" ] }, { @@ -297,7 +275,7 @@ "source": [ "## Scikit-learn–style pipeline\n", "\n", - "One of the advantages of `giotto-tda` is the compatibility with `scikit-learn`. It is possible to set up and run a full pipeline such as the one above in a few lines:" + "One of the advantages of ``giotto-tda`` is the compatibility with ``scikit-learn``. It is possible to set up and run a full pipeline such as the one above in a few lines:" ] }, { @@ -307,12 +285,10 @@ "outputs": [], "source": [ "# Steps of the Pipeline\n", - "steps = [\n", - " ('sampling', periodicSampler),\n", - " ('embedding', TE),\n", - " ('window', SW),\n", - " ('diagrams', VR)\n", - "]\n", + "steps = [('sampling', periodicSampler),\n", + " ('window', SW),\n", + " ('embedding', TE),\n", + " ('diagrams', WA)]\n", "\n", "# Define the Pipeline\n", "pipeline = Pipeline(steps)\n", @@ -334,7 +310,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_diagram(X_diagrams[window_number])" + "pipeline[-1].plot(X_diagrams, sample=window_number)" ] }, { @@ -343,7 +319,7 @@ "source": [ "## Rescaling the diagram\n", "\n", - "Rescaling a diagram means normalizing points such that the maximum \"bottleneck distance\" from the *empty diagram* (by default, across all homology dimensions) is equal to one. Notice that this means the birth and death scales are modified. 
We can use `Scaler` as follows:" + "By default, rescaling a diagram via ``Scaler`` means normalizing points such that the maximum \"bottleneck distance\" from the *empty diagram* (across all homology dimensions) is equal to 1. Notice that this means the birth and death scales are modified. We can do this as follows:" ] }, { @@ -365,7 +341,7 @@ "source": [ "## Filtering diagrams\n", "\n", - "Filtering a diagram means eliminating the homology generators whose lifespan is considererd too short to be significant. We can use `Filtering` as follows:" + "Filtering a diagram means eliminating the homology generators whose lifespan is considered too short to be significant. We can use ``Filtering`` as follows:" ] }, { @@ -404,22 +380,13 @@ "X_filtered = pipeline_filter.fit_transform(X)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plot_diagram(X_filtered[window_number])" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Persistence entropy\n", "\n", - "In this section we show how to compute the *entropy* of persistence diagrams." + "The *entropy* of persistence diagrams can be calculated via ``PersistenceEntropy``:" ] }, { @@ -441,7 +408,7 @@ "source": [ "fig = px.line(title='Persistence entropies, indexed by sliding window number')\n", "for dim in range(X_persistence_entropy.shape[1]):\n", - " fig.add_scatter(y=X_persistence_entropy[:, dim], name=f'PE in homology dimension {dim}')\n", + " fig.add_scatter(y=X_persistence_entropy[:, dim], name=f\"PE in homology dimension {dim}\")\n", "fig.show()" ] }, @@ -451,7 +418,7 @@ "source": [ "## Betti Curves\n", "\n", - "In this section we show how to compute the Betti curves of a persistence diagram. We also show the plot of the Betti surface, i.e. the time-stack of the Betti curves." + "The Betti curves of a persistence diagram can be computed and plotted using ``BettiCurve``:" ] }, { @@ -462,15 +429,8 @@ "source": [ "BC = BettiCurve()\n", "\n", - "X_betti_curves = BC.fit_transform(X_scaled)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "X_betti_curves = BC.fit_transform(X_scaled)\n", + "\n", "BC.plot(X_betti_curves, sample=window_number)" ] }, @@ -484,7 +444,7 @@ "\n", "In each case, we will obtain distance matrices whose i-th row encodes the distance of the i-th diagram from all the others.\n", "\n", - "We start with the so-called \"landscape $L^2$ distance\": when the parameter `order` is `None`, the output is one distance matrix per sample and homology dimension." + "We start with the so-called \"landscape $L^2$ distance\": when ``order`` is ``None``, the output is one distance matrix per sample and homology dimension." ] }, { @@ -495,11 +455,12 @@ "source": [ "p_L = 2\n", "n_layers = 5\n", - "PD = PairwiseDistance(\n", - " metric='landscape', metric_params={'p': p_L, 'n_layers': n_layers, 'n_bins': 1000}, order=None)\n", + "PD = PairwiseDistance(metric='landscape',\n", + " metric_params={'p': p_L, 'n_layers': n_layers, 'n_bins': 1000},\n", + " order=None)\n", "\n", "X_distance_L = PD.fit_transform(X_diagrams)\n", - "print(X_distance_L.shape)" + "X_distance_L.shape" ] }, { @@ -522,7 +483,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We now change metric and compute the \"$2$-Wasserstein distances\" between the diagrams. This one takes longer to compute!" + "We now change metric and compute the \"$2$-Wasserstein distances\" between the diagrams. This one takes longer!" 
] }, { @@ -532,8 +493,9 @@ "outputs": [], "source": [ "p_W = 2\n", - "PD = PairwiseDistance(\n", - " metric='wasserstein', metric_params={'p': p_W, 'delta': 0.1}, order=None)\n", + "PD = PairwiseDistance(metric='wasserstein',\n", + " metric_params={'p': p_W, 'delta': 0.1},\n", + " order=None)\n", "\n", "X_distance_W = PD.fit_transform(X_diagrams)" ] @@ -576,10 +538,10 @@ "metadata": {}, "outputs": [], "source": [ - "n_neighbors = 3\n", + "n_neighbors = 2\n", "kNN = KNeighborsGraph(n_neighbors=n_neighbors)\n", "\n", - "X_kNN = kNN.fit_transform(X_windows)" + "X_kNN = kNN.fit_transform(X_embedded)" ] }, { @@ -613,14 +575,16 @@ "metadata": {}, "outputs": [], "source": [ - "plot_heatmap(pairwise_distances(X_windows[window_number]), colorscale='blues')" + "plot_heatmap(pairwise_distances(X_embedded[window_number]), colorscale='blues')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "This is what the first few steps (before scaling and filtering) of the pipeline described above would be if you'd like persistence diagrams to be obtained using this new distance instead. Notice that we have to pass `metric='precomputed'` to the `VietorisRipsPersistence` constructor this time, because the input already consists of distance matrices:" + "This is what the first few steps (before scaling and filtering) of the pipeline described above would be if you'd like persistence diagrams to be obtained using this new distance instead.\n", + "\n", + "*Note*: ``WeakAlphaPersistence`` cannot be used now as it needs point cloud input. We can use instead an instance of ``VietorisRipsPersistence``, but we have to take care to pass ``metric='precomputed'`` to the constructor!" ] }, { @@ -632,13 +596,13 @@ "# Steps of the Pipeline\n", "steps = [\n", " ('sampling', periodicSampler),\n", - " ('embedding', TE),\n", " ('window', SW),\n", + " ('embedding', TE),\n", " ('kNN_graph', kNN),\n", " ('graph_geo_distance', GGD),\n", - " ('diagrams', VietorisRipsPersistence(\n", - " metric='precomputed', max_edge_length=100, homology_dimensions=homology_dimensions))\n", - "]\n", + " ('diagrams', VietorisRipsPersistence(metric='precomputed',\n", + " homology_dimensions=homology_dimensions))\n", + " ]\n", "\n", "# Define the Pipeline\n", "pipeline = Pipeline(steps)\n", @@ -660,8 +624,15 @@ "metadata": {}, "outputs": [], "source": [ - "plot_diagram(X_diagrams[window_number])" + "pipeline[-1].plot(X_diagrams, sample=window_number)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -680,7 +651,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/examples/mapper_quickstart.ipynb b/examples/mapper_quickstart.ipynb index 67c561b59..f459f8c13 100644 --- a/examples/mapper_quickstart.ipynb +++ b/examples/mapper_quickstart.ipynb @@ -6,9 +6,9 @@ "source": [ "# Getting started with Mapper\n", "\n", - "In this notebook we explore a few of the core features included in `giotto-tda`'s implementation of the [Mapper algorithm](https://research.math.osu.edu/tgda/mapperPBG.pdf). \n", + "In this notebook we explore a few of the core features included in ``giotto-tda``'s implementation of the [Mapper algorithm](https://research.math.osu.edu/tgda/mapperPBG.pdf). 
\n", "\n", - "If you are looking at a static version of this notebook and would like to run its contents, head over to [github](https://github.com/giotto-ai/giotto-tda/blob/master/examples/mapper_quickstart.ipynb).\n", + "If you are looking at a static version of this notebook and would like to run its contents, head over to [GitHub](https://github.com/giotto-ai/giotto-tda/blob/master/examples/mapper_quickstart.ipynb) and download the source.\n", "\n", "## Useful references\n", "\n", @@ -79,7 +79,7 @@ "## Configure the Mapper pipeline\n", "Given a dataset ${\\cal D}$ of points $x \\in \\mathbb{R}^n$, the basic steps behind Mapper are as follows:\n", "\n", - "1. Map ${\\cal D}$ to a lower-dimensional space using a **filter function** $f: \\mathbb{R}^n \\to \\mathbb{R}^m$. Common choices for the filter function include projection onto one or more axes via PCA or density-based methods. In `giotto-tda`, you can import a variety of filter functions as follows:\n", + "1. Map ${\\cal D}$ to a lower-dimensional space using a **filter function** $f: \\mathbb{R}^n \\to \\mathbb{R}^m$. Common choices for the filter function include projection onto one or more axes via PCA or density-based methods. In ``giotto-tda``, you can import a variety of filter functions as follows:\n", "\n", "```python\n", "from gtda.mapper.filter import FilterFunctionName\n", @@ -91,7 +91,7 @@ "from gtda.mapper.cover import CoverName\n", "```\n", "\n", - "3. For each interval $U_i \\in {\\cal U}$ cluster the points in the preimage $f^{-1}(U_i)$ into sets $C_{i,1}, \\ldots , C_{i,k_i}$. The choice of clustering algorithm can be any of `scikit-learn`'s [clustering methods](https://scikit-learn.org/stable/modules/clustering.html) or an implementation of agglomerative clustering in `giotto-tda`:\n", + "3. For each interval $U_i \\in {\\cal U}$ cluster the points in the preimage $f^{-1}(U_i)$ into sets $C_{i,1}, \\ldots , C_{i,k_i}$. The choice of clustering algorithm can be any of ``scikit-learn``'s [clustering methods](https://scikit-learn.org/stable/modules/clustering.html) or an implementation of agglomerative clustering in ``giotto-tda``:\n", "\n", "```python\n", "# scikit-learn method\n", @@ -100,9 +100,9 @@ "from gtda.mapper.cluster import FirstSimpleGap\n", "```\n", "\n", - "4. Construct the topological graph whose vertices are the cluster sets $(C_{i,j})_{i\\in I, j \\in \\{1,\\ldots,k_i\\}}$ and an edge exists between two nodes if they share points in common: $C_{i,j} \\cap C_{k,l} \\neq \\emptyset$. This step is handled automatically by `giotto-tda`.\n", + "4. Construct the topological graph whose vertices are the cluster sets $(C_{i,j})_{i\\in I, j \\in \\{1,\\ldots,k_i\\}}$ and an edge exists between two nodes if they share points in common: $C_{i,j} \\cap C_{k,l} \\neq \\emptyset$. This step is handled automatically by ``giotto-tda``.\n", "\n", - "These four steps are implemented in the `MapperPipeline` object that mimics the `Pipeline` class from `scikit-learn`. We provide a convenience function `make_mapper_pipeline()` that allows you to pass the choice of filter function, cover, and clustering algorithm as arguments. For example, to project our data onto the $x$- and $y$-axes, we could setup the pipeline as follows:" + "These four steps are implemented in the ``MapperPipeline`` object that mimics the ``Pipeline`` class from ``scikit-learn``. We provide a convenience function ``make_mapper_pipeline`` that allows you to pass the choice of filter function, cover, and clustering algorithm as arguments. 
For example, to project our data onto the $x$- and $y$-axes, we could setup the pipeline as follows:" ] }, { @@ -111,11 +111,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Define filter function - can be any scikit-learn transformer\n", + "# Define filter function – can be any scikit-learn transformer\n", "filter_func = Projection(columns=[0, 1])\n", "# Define cover\n", "cover = CubicalCover(n_intervals=10, overlap_frac=0.3)\n", - "# Choose clustering algorithm - default is DBSCAN\n", + "# Choose clustering algorithm – default is DBSCAN\n", "clusterer = DBSCAN()\n", "\n", "# Configure parallelism of clustering step\n", @@ -142,7 +142,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "With the Mapper pipeline at hand, it is now a simple matter to visualise it. To warm up, let's examine the graph in two-dimensions using the default arguments of `giotto-tda`'s plotting function:" + "With the Mapper pipeline at hand, it is now a simple matter to visualise it. To warm up, let's examine the graph in two-dimensions using the default arguments of ``giotto-tda``'s plotting function:" ] }, { @@ -167,7 +167,7 @@ "metadata": {}, "source": [ "### Configure the coloring of the Mapper graph\n", - "By default, the nodes of the Mapper graph are colored by the mean value of the points that belong to a given node. However, in this example it is more instructive to colour by the $x$- and $y$-axes. This can be achieved by toggling the `color_by_columns_dropdown`, which calculates the coloring for each column in the input data array. At the same time, let's configure the choice of colorscale:" + "By default, the nodes of the Mapper graph are colored by the mean value of the points that belong to a given node. However, in this example it is more instructive to colour by the $x$- and $y$-axes. This can be achieved by toggling the ``color_by_columns_dropdown``, which calculates the coloring for each column in the input data array. At the same time, let's configure the choice of colorscale:" ] }, { @@ -187,7 +187,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the dropdown menu, the entry `color_variable` refers to a user-defined quantity to color by - by default it is the average value of the points in each node. In general, one can configure this quantity to be an array, a `scikit-learn` transformer, or a list of indices to select from the data. For example, coloring by a PCA component can be implemented as follows:" + "In the dropdown menu, the entry ``color_variable`` refers to a user-defined quantity to color by – by default it is the average value of the points in each node. In general, one can configure this quantity to be an array, a ``scikit-learn`` transformer, or a list of indices to select from the data. For example, coloring by a PCA component can be implemented as follows:" ] }, { @@ -211,7 +211,7 @@ "source": [ "### Pass a pandas DataFrame as input\n", "\n", - "It is also possible to feed `plot_static_mapper_graph()` a pandas DataFrame:" + "It is also possible to feed ``plot_static_mapper_graph`` a pandas DataFrame:" ] }, { @@ -228,7 +228,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Before plotting we need to update the Mapper pipeline to know about the projection onto the column names. This can be achieved using the `set_params()` method as follows:" + "Before plotting we need to update the Mapper pipeline to know about the projection onto the column names. 
This can be achieved using the ``set_params`` method as follows:" ] }, { @@ -256,7 +256,7 @@ "source": [ "### Change the layout algorithm\n", "\n", - "By default, `plot_static_mapper_graph()` uses the Kamada–Kawai algorithm for the layout; however any of the layout algorithms defined in python-igraph are supported (see [here](https://igraph.org/python/doc/igraph.Graph-class.html) for a list of possible layouts). For example, we can switch to the Fruchterman–Reingold layout as follows:" + "By default, ``plot_static_mapper_graph`` uses the Kamada–Kawai algorithm for the layout; however any of the layout algorithms defined in python-igraph are supported (see [here](https://igraph.org/python/doc/igraph.Graph-class.html) for a list of possible layouts). For example, we can switch to the Fruchterman–Reingold layout as follows:" ] }, { @@ -287,7 +287,7 @@ "source": [ "### Change the layout dimension\n", "\n", - "It is also possible to visualise the Mapper graph in 3-dimensions by configuring the `layout_dim` argument:" + "It is also possible to visualise the Mapper graph in 3 dimensions by configuring the ``layout_dim`` argument:" ] }, { @@ -306,7 +306,7 @@ "source": [ "### Change the node size scale\n", "\n", - "In general, node sizes are proportional to the number of dataset elements contained in the nodes. Sometimes, however, the default scale leads to graphs which are difficult to decipher, due to e.g. excessively small nodes. The `node_scale` parameter can be used to configure this scale. " + "In general, node sizes are proportional to the number of dataset elements contained in the nodes. Sometimes, however, the default scale leads to graphs which are difficult to decipher, due to e.g. excessively small nodes. The ``node_scale`` parameter can be used to configure this scale." ] }, { @@ -326,7 +326,7 @@ "source": [ "## Run the Mapper pipeline\n", "\n", - "Behind the scenes of `plot_static_mapper_graph()` is a `MapperPipeline` object `pipe` that can be used like a typical `scikit-learn` estimator. For example, to extract the underlying graph data structure we can do the following:" + "Behind the scenes of ``plot_static_mapper_graph`` is a ``MapperPipeline`` object ``pipe`` that can be used like a typical ``scikit-learn`` estimator. For example, to extract the underlying graph data structure we can do the following:" ] }, { @@ -342,7 +342,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The resulting graph is a [`python-igraph`](https://igraph.org/python/) object that contains metadata that is stored in the form of dictionaries. We can access this data as follows:" + "The resulting graph is a [python-igraph](https://igraph.org/python/) object which stores node metadata in the form of attributes. We can access this data as follows:" ] }, { @@ -351,14 +351,14 @@ "metadata": {}, "outputs": [], "source": [ - "graph[\"node_metadata\"].keys()" + "graph.vs.attributes()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Here `node_id` is a globally unique node identifier used to construct the graph, while `pullback_set_label` and `partial_cluster_label` refer to the interval and cluster sets described above. The `node_elements` refers to the indices of our original data that belong to each node. For example, to find which points belong to the first node of the graph we can access the desired data as follows:" + "Here ``'pullback_set_label'`` and ``'partial_cluster_label'`` refer to the interval and cluster sets described above. 
``'node_elements'`` refers to the indices of our original data that belong to each node. For example, to find which points belong to the first node of the graph we can access the desired data as follows:" ] }, { @@ -367,23 +367,14 @@ "metadata": {}, "outputs": [], "source": [ - "node_id, node_elements = (\n", - " graph[\"node_metadata\"][\"node_id\"],\n", - " graph[\"node_metadata\"][\"node_elements\"],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\n", - " \"Node ID: {}, \\nNode elements: {}, \\nData points: {}\".format(\n", - " node_id[0], node_elements[0], data[node_elements[0]]\n", - " )\n", - ")" + "node_id = 0\n", + "node_elements = graph.vs[\"node_elements\"]\n", + "\n", + "print(f\"\"\"\n", + "Node ID: {node_id}\n", + "Node elements: {node_elements[node_id]}\n", + "Data points: {data[node_elements[node_id]]}\n", + "\"\"\")" ] }, { @@ -392,7 +383,7 @@ "source": [ "## Creating custom filter functions\n", "\n", - "In some cases, the list of filter functions provided in `gtda.mapper.filter.py` or `scikit-learn` may not be sufficient for the task at hand. In such cases, one can pass any callable to the pipeline that acts **row-wise** on the input data. For example, we can project by taking the sum of the $(x,y)$ coordinates as follows:" + "In some cases, the list of filter functions provided in ``gtda.mapper.filter.py`` or ``scikit-learn`` may not be sufficient for the task at hand. In such cases, one can pass any callable to the pipeline that acts **row-wise** on the input data. For example, we can project by taking the sum of the $(x,y)$ coordinates as follows:" ] }, { @@ -428,11 +419,11 @@ "source": [ "## Visualise the 2D Mapper graph interactively (Live Jupyter session needed)\n", "\n", - "In general, building useful Mapper graphs requires some iteration through the various parameters in the cover and clustering algorithm. To simplify that process, `giotto-tda` provides an interactive figure that can be configured in real time.\n", + "In general, building useful Mapper graphs requires some iteration through the various parameters in the cover and clustering algorithm. To simplify that process, ``giotto-tda`` provides an interactive figure that can be configured in real time.\n", "\n", "If invalid parameters are selected, the _Show logs_ checkbox can be used to see what went wrong.\n", "\n", - "To see the interactive output, please **download** the notebook from [github](https://github.com/giotto-ai/giotto-tda/blob/master/examples/mapper_quickstart.ipynb) and execute it locally." + "To see the interactive output, please **download** the notebook from [GitHub](https://github.com/giotto-ai/giotto-tda/blob/master/examples/mapper_quickstart.ipynb) and execute it locally." 
] }, { diff --git a/examples/persistent_homology_graphs.ipynb b/examples/persistent_homology_graphs.ipynb new file mode 100644 index 000000000..224b76586 --- /dev/null +++ b/examples/persistent_homology_graphs.ipynb @@ -0,0 +1,684 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Topological feature extraction from graphs\n", + "\n", + "``giotto-tda`` can extract topological features from undirected or directed graphs represented as adjacency matrices, via the following transformers:\n", + "\n", + "- [VietorisRipsPersistence](https://giotto-ai.github.io/gtda-docs/latest/modules/generated/homology/gtda.homology.VietorisRipsPersistence.html) and [SparseRipsPersistence](https://giotto-ai.github.io/gtda-docs/latest/modules/generated/homology/gtda.homology.SparseRipsPersistence.html) initialized with ``metric=\"precomputed\"``, for undirected graphs;\n", + "- [FlagserPersistence](https://giotto-ai.github.io/gtda-docs/latest/modules/generated/homology/gtda.homology.FlagserPersistence.html) initialized with ``directed=True``, for directed graphs, and with ``directed=False`` for undirected ones.\n", + "\n", + "In this notebook, we build some basic intuition on how these methods are able to capture structures and patterns from such graphs. We will focus on the multi-scale nature of the information contained in the final outputs (\"persistence diagrams\"), as well as on the differences between the undirected and directed cases. Although adjacency matrices of sparsely connected and even unweighted graphs can be passed directly to these transformers, they are interpreted as *weighted* adjacency matrices according to some non-standard conventions. We will highlight these conventions below.\n", + "\n", + "The mathematical technologies used under the hood are various flavours of \"persistent homology\" (as is also the case for [EuclideanCechPersistence](https://giotto-ai.github.io/gtda-docs/latest/modules/generated/homology/gtda.homology.EuclideanCechPersistence.html) and [CubicalPersistence](https://giotto-ai.github.io/gtda-docs/latest/modules/generated/homology/gtda.homology.CubicalPersistence.html)). If you are interested in the details, you can start from the [theory glossary](https://giotto-ai.github.io/gtda-docs/latest/theory/glossary.html) and references therein.\n", + "\n", + "If you are looking at a static version of this notebook and would like to run its contents, head over to [GitHub](https://github.com/giotto-ai/giotto-tda/blob/master/examples/persistent_homology_graphs.ipynb) and download the source.\n", + "\n", + "\n", + "## See also\n", + "\n", + "- [Topological feature extraction using VietorisRipsPersistence and PersistenceEntropy](https://giotto-ai.github.io/gtda-docs/latest/notebooks/vietoris_rips_quickstart.html) which treats the \"special case\" of point clouds (see below).\n", + "- [Plotting in giotto-tda](https://giotto-ai.github.io/gtda-docs/latest/notebooks/plotting_api.html), particularly Section 1.2 (as above, treats the case of point clouds).\n", + "- [Case study: Classification of shapes](https://giotto-ai.github.io/gtda-docs/latest/notebooks/classifying_shapes.html) (a more advanced example).\n", + "- [Computing persistent homology of directed flag complexes](https://arxiv.org/abs/1906.10458) by Daniel Luetgehetmann, Dejan Govc, Jason Smith, and Ran Levi. 
\n", + "\n", + "**License: AGPLv3**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from numpy.random import default_rng\n", + "rng = default_rng(42) # Create a random number generator\n", + "\n", + "from scipy.spatial.distance import pdist, squareform\n", + "from scipy.sparse import csr_matrix\n", + "\n", + "from gtda.graphs import GraphGeodesicDistance\n", + "from gtda.homology import VietorisRipsPersistence, SparseRipsPersistence, FlagserPersistence\n", + "\n", + "from igraph import Graph\n", + "\n", + "from IPython.display import SVG, display" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Undirected graphs – ``VietorisRipsPersistence`` and ``SparseRipsPersistence``\n", + "\n", + "### General API\n", + "\n", + "If you have a collection ``X`` of adjacency matrices of graphs, you can instantiate transformers of class ``VietorisRipsPersistence`` or ``SparseRipsPersistence`` by setting the parameter ``metric`` as ``\"precomputed\"``, and then call ``fit_transform`` on ``X` to obtain the corresponding collection of *persistence diagrams* (see **Understanding the computation** below for an explanation).\n", + "\n", + "In the case of ``VietorisRipsPersistence``, ``X`` can be a list of sparse or dense matrices, and a basic example of topological feature extraction would look like this:\n", + "```\n", + "# Instantiate topological transformer\n", + "VR = VietorisRipsPersistence(metric=\"precomputed\")\n", + "\n", + "# Compute persistence diagrams corresponding to each graph in X\n", + "diagrams = VR.fit_transform(X)\n", + "```\n", + "\n", + "Each entry in the result can be plotted as follows (where we plot the 0th entry, i.e. `diagrams[0]`):\n", + "```\n", + "VR.plot(diagrams, sample=0)\n", + "```\n", + "\n", + "*Note*: ``SparseRipsPersistence`` implements an approximate scheme for computing the same topological quantities as ``VietorisRipsPersistence``. This can be useful for speeding up the computation on large inputs, but we will not demonstrate its use in this notebook.\n", + "\n", + "### Fully connected and weighted\n", + "\n", + "We now turn to the case of fully connected and weighted (FCW) undirected graphs. In this case, the input should be a list of 2D arrays or a single 3D array. 
Here is a simple example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a single weighted adjacency matrix of a FCW graph\n", + "n_vertices = 10\n", + "x = rng.random((n_vertices, n_vertices))\n", + "# Fill the diagonal with zeros (not always necessary, see below)\n", + "np.fill_diagonal(x, 0)\n", + "\n", + "# Create a trivial collection of weighted adjacency matrices, containing x only\n", + "X = [x]\n", + "\n", + "# Instantiate topological transformer\n", + "VR = VietorisRipsPersistence(metric=\"precomputed\")\n", + "\n", + "# Compute persistence diagrams corresponding to each entry (only one here) in X\n", + "diagrams = VR.fit_transform(X)\n", + "\n", + "print(f\"diagrams.shape: {diagrams.shape} ({diagrams.shape[1]} topological features)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Non-fully connected weighted graphs\n", + "\n", + "In ``giotto-tda``, a non-fully connected weighted graph can be represented by an adjacency matrix in one of two possible forms:\n", + "- a dense square array with ``np.inf`` in position $ij$ if the edge between vertex $i$ and vertex $j$ is absent.\n", + "- a sparse matrix in which the non-stored edge weights represent absent edges.\n", + "\n", + "**Important notes**\n", + "- A ``0`` in a dense array, or an explicitly stored ``0`` in a sparse matrix, does *not* denote an absent edge. It denotes an edge with weight 0, which, in a sense, means the complete opposite! See the section ***Understanding the computation*** below.\n", + "- Dense Boolean arrays are first converted to numerical ones and then interpreted as adjacency matrices of FCW graphs. ``False`` values therefore should not be used to represent absent edges.\n", + "\n", + "### Understanding the computation \n", + "\n", + "To understand what these persistence diagrams are telling us about the input weighted graphs, we briefly explain the **clique complex (or flag complex) filtration** procedure underlying the computations in ``VietorisRipsPersistence`` when ``metric=\"precomputed\"``, via an example.\n", + "\n", + "Let us start with a special case of a weighted graph with adjacency matrix as follows:\n", + "\n", + "- the diagonal entries (\"vertex weights\") are all zero;\n", + "- all off-diagonal entries (edge weights) are non-negative;\n", + "- some edge weights are infinite (or very very large).\n", + "\n", + "We can lay such a graph on the plane to visualise it, drawing only the finite edges:\n", + "\n", + "![A weighted graph](images/weighted_graph.png)\n", + "\n", + "The procedure can be explained as follows: we let a parameter $\\varepsilon$ start at 0, and as we increase it all the way to infinity we keep considering the instantaneous subgraphs made of a) all the vertices in the original graph, and b) those edges whose weight is less than or equal to the current $\\varepsilon$. We also promote these subgraphs to more general structures called **(simplicial) complexes** that, alongside vertices and edges, also possess $k$**-simplices**, i.e. selected subsets of $k + 1$ vertices (a 2-simplex is an abstract \"triangle\", a 3-simplex an abstract \"tetrahedron\", etc). Our criterion is this: for each integer $k \\geq 2$, all $(k + 1)$-cliques in each instantaneous subgraph are declared to be the $k$-simplices of the subgraph's associated complex. 
By definition, the $0$-simplices are the vertices and the $1$-simplices are the available edges.\n", + "\n", + "As $\\varepsilon$ increases from 0 (included) to infinity, we record the following information:\n", + "\n", + "1. How many new **connected components** are created because of the appearance of vertices (in this example, all vertices \"appear\" in one go at $\\varepsilon = 0$, by definition!), or merge because of the appearance of new edges.\n", + "2. How many new 1-dimensional \"holes\", 2-dimensional \"cavities\", or more generally $d$-dimensional **voids** are created in the instantaneous complex. A hole, cavity, or $d$-dimensional void is such only if there is no collection of \"triangles\", \"tetrahedra\", or $(d + 1)$-simplices which the void is the \"boundary\" of. *Note*: Although the edges of a triangle *alone* \"surround a hole\", these cannot occur in our particular construction because the \"filling\" triangle is also declared present in the complex when all its edges are.\n", + "3. How many $d$-dimensional voids which were present at earlier values of $\\epsilon$ are \"filled\" by $(d + 1)$-simplices which just appear.\n", + "\n", + "This process of recording the full topological history of the graph across all edge weights is called (Vietoris-Rips) **persistent homology**.\n", + "\n", + "Let us start at $\\varepsilon = 0$: Some edges had zero weight in our graph, so they already appear!\n", + "\n", + "![$\\varepsilon = 0$](images/clique_complex_0_small.png)\n", + "\n", + "There are 9 connected components, and nothing much else.\n", + "\n", + "Up to and including $\\varepsilon = 2$, a few more edges are added which make some of the connected components merge together but do not create any hole, triangles, or higher cliques. Let us look at $\\varepsilon = 3$:\n", + "\n", + "![$\\varepsilon = 3$](images/clique_complex_3_small.png)\n", + "\n", + "The newly arrived edges reduce the number of connected components further, but more interestingly they create a 1D hole!\n", + "\n", + "As an example of a \"higher\"-simplex, at $\\varepsilon = 4$ we get our first triangle:\n", + "\n", + "![$\\varepsilon = 4$](images/clique_complex_4_small.png)\n", + "\n", + "At $\\varepsilon = 5$, our 1D hole is filled:\n", + "\n", + "![$\\varepsilon = 5$](images/clique_complex_5_small.png)\n", + "\n", + "At $\\varepsilon = 8$, two new 1D holes appear:\n", + "\n", + "![$\\varepsilon = 8$](images/clique_complex_8_small.png)\n", + "\n", + "Finally, at $\\varepsilon = 9$, some more connected components merge, but no new voids are either created or destroyed:\n", + "\n", + "![$\\varepsilon = 9$](images/clique_complex_9_small.png)\n", + "\n", + "We can stop as we have reached the maximum value of $\\varepsilon$, beyond which nothing will change: there is only one connected component left, but there are also two 1D holes which will never get filled.\n", + "\n", + "Fit-transforming via ``VietorisRipsPersistence(metric=\"precomputed\")`` on the original graph's adjacency matrix would return the following 3D array of **persistence diagrams**:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "diagrams = np.array([[[0., 1., 0],\n", + " [0., 2., 0],\n", + " [0., 2., 0],\n", + " [0., 3., 0],\n", + " [0., 4., 0],\n", + " [0., 5., 0],\n", + " [0., 6., 0],\n", + " [0., 7., 0],\n", + " [3., 5., 1],\n", + " [8., np.inf, 1],\n", + " [8., np.inf, 1]]])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The notebook 
[Topological feature extraction using VietorisRipsPersistence and PersistenceEntropy](https://giotto-ai.github.io/gtda-docs/latest/notebooks/vietoris_rips_quickstart.html) explains how to interpret this output and how to make informative 2D scatter plots out of its entries. Here, we only have one entry corresponding to our graph:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gtda.plotting import plot_diagram\n", + "\n", + "plot_diagram(diagrams[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Small aside*: You would be correct to expect an additional row ``[0, np.inf, 0]`` representing one connected component which lives forever. By convention, since such a row would always be present under this construction and hence give no useful information, all transformers discussed in this notebook remove this feature from the output.\n", + "\n", + "#### Advanced discussion: Non-zero vertex weights and negative edge weights\n", + "\n", + "Although we introduced the simplifying assumptions that the diagonal entries of the input adjacency matrix is zero, and that all edge weights are non-negative, for the procedure to make sense we need a lot less. Namely:\n", + "- The diagonal entry corresponding to a vertex is always interpreted as the value of the parameter $\\varepsilon$ at which that vertex \"appears\". Making all these entries equal to zero means, as in the example above, that all vertices appear simultaneously at $\\varepsilon = 0$. Generally however, different vertices can be declared to \"appear\" at different values, and even at negative ones.\n", + "- The only constraint on each edge weight is that it should be no less than the \"vertex weight\" of either of its boundary vertices.\n", + "\n", + "As a simple example, subtracting a constant from *all* entries of an adjacency matrix has the effect of shifting all birth and death values by the same constant.\n", + "\n", + "### The \"special case\" of point clouds\n", + "\n", + "The use of ``VietorisRipsPersistence`` to compute multi-scale topological features of concrete point clouds in Euclidean space is covered briefly in Section 1.2 of [Plotting in giotto-tda](https://giotto-ai.github.io/gtda-docs/latest/notebooks/plotting_api.html), and more in-depth in [Case study: Classification of shapes](https://giotto-ai.github.io/gtda-docs/latest/notebooks/classifying_shapes.html) and in [Can two-dimensional topological voids exist in two dimensions?](https://giotto-ai.github.io/gtda-docs/latest/notebooks/voids_on_the_plane.html)\n", + "\n", + "The Vietoris-Rips procedure for point clouds is often depicted as a process of growing balls of ever increasing radius $r$ around each point, and drawing edges between two points whenever their two respective $r$-balls touch for the first time. Just as in our clique complex construction above, cliques present at radius $r$ are declared to be higher-dimensional simplices in the instantaneous complex:\n", + "\n", + "![Vietoris–Rips filtration of a point cloud](images/vietoris_rips_point_cloud.gif)\n", + "\n", + "And just as in the case of weighted graphs, we record the appearance/disappearance of connected components and voids as we keep increasing $r$.\n", + "\n", + "The case of point clouds can actually be thought of as a special case of the case of FCW graphs. Namely, if:\n", + "\n", + "1. we regard each point in the cloud as an abstract vertex in a graph,\n", + "2. 
we compute the square matrix of pairwise (Euclidean or other) distances between points in the cloud, and\n", + "3. we run the procedure explained above with $\\varepsilon$ defined as $2r$,\n", + "then we compute exactly the \"topological summary\" of the point cloud.\n", + "\n", + "So, in ``giotto-tda``, we can do:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 1 point cloud with 20 points in 5 dimensions\n", + "point_cloud = rng.random((20, 5))\n", + "# Corresponding matrix of Euclidean pairwise distances\n", + "pairwise_distances = squareform(pdist(point_cloud))\n", + "\n", + "# Default parameter for ``metric`` is \"euclidean\"\n", + "X_vr_pc = VietorisRipsPersistence().fit_transform([point_cloud])\n", + "\n", + "X_vr_graph = VietorisRipsPersistence(metric=\"precomputed\").fit_transform([pairwise_distances])\n", + "\n", + "assert np.array_equal(X_vr_pc, X_vr_graph)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Unweighted graphs and chaining with ``GraphGeodesicDistance``\n", + "\n", + "What if, as is the case in many applications, our graphs have sparse connections and are unweighted?\n", + "\n", + "In ``giotto-tda``, there are two possibilities:\n", + "\n", + "1. Encode the graphs as adjacency matrices of non-fully connected weighted graphs, where all weights corresponding to edges which are present are equal to ``1.`` (or any other positive constant). See section ***Non-fully connected weighted graphs*** above for the different encoding conventions for sparse and dense matrices.\n", + "2. Preprocess the unweighted graph via [GraphGeodesicDistance](https://giotto-ai.github.io/gtda-docs/latest/modules/generated/graphs/processing/gtda.graphs.GraphGeodesicDistance.html) to obtain a FCW graph where edge $ij$ has as weight the length of the shortest path from vertex $i$ to vertex $j$ (and ``np.inf`` if no path exists between the two vertices in the original graph).\n", + "\n", + "### Example 1: Circle graph\n", + "\n", + "We now explore the difference between the two approaches in the simple example of a circle graph." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Helper function -- directed circles will be needed later\n", + "def make_circle_adjacency(n_vertices, directed=False):\n", + " weights = np.ones(n_vertices)\n", + " rows = np.arange(n_vertices)\n", + " columns = np.arange(1, n_vertices + 1) % n_vertices\n", + " directed_adjacency = csr_matrix((weights, (rows, columns)))\n", + " if not directed:\n", + " return directed_adjacency + directed_adjacency.T\n", + " return directed_adjacency\n", + "\n", + "n_vertices = 10\n", + "undirected_circle = make_circle_adjacency(n_vertices)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can produce an SVG of the circle using ``python-igraph``, and display it.\n", + "\n", + "*Note*: If running from a live jupyter session, this will dump a file inside your notebook's directory. If ``pycairo`` is installed, you can draw the graph directly in the notebook by instead running\n", + "```\n", + "from igraph import plot\n", + "plot(graph)\n", + "```\n", + "in the cell below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "row, col = undirected_circle.nonzero()\n", + "graph = Graph(n=n_vertices, edges=list(zip(row, col)), directed=False)\n", + "fname = \"undirected_circle.svg\"\n", + "graph.write_svg(fname)\n", + "display(SVG(filename=fname))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Approach 1 means passing the graph as is:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "VietorisRipsPersistence(metric=\"precomputed\").fit_transform_plot([undirected_circle]);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The circular nature has been captured by the single point in homology dimension 1 ($H_1$) which is born at 1 and lives forever.\n", + "\n", + "Compare with what we observe when preprocessing first with ``GraphGeodesicDistance``:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_ggd = GraphGeodesicDistance(directed=False, unweighted=True).fit_transform([undirected_circle])\n", + "VietorisRipsPersistence(metric=\"precomputed\").fit_transform_plot(X_ggd);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There is still a \"long-lived\" topological feature in dimension 1, but this time its death value is finite. This is because, at some point, we have enough triangles to completely fill the 1D hole. Indeed, when the number of vertices/edges in the circle is large, the death value is around one third of this number. So, relative to the procedure without ``GraphGeodesicDistance``, the death value now gives additional information about the *size* of the circle graph!\n", + "\n", + "### Example 2: Two disconnected circles\n", + "\n", + "Suppose our graph contains two disconnected circles of different sizes:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n_vertices_small, n_vertices_large = n_vertices, 2 * n_vertices\n", + "undirected_circle_small = make_circle_adjacency(n_vertices_small)\n", + "undirected_circle_large = make_circle_adjacency(n_vertices_large)\n", + "row_small, col_small = undirected_circle_small.nonzero()\n", + "row_large, col_large = undirected_circle_large.nonzero()\n", + "row = np.concatenate([row_small, row_large + n_vertices])\n", + "col = np.concatenate([col_small, col_large + n_vertices])\n", + "data = np.concatenate([undirected_circle_small.data, undirected_circle_large.data])\n", + "two_undirected_circles = csr_matrix((data, (row, col)))\n", + "\n", + "graph = Graph(n=n_vertices_small + n_vertices_large, edges=list(zip(row, col)), directed=False)\n", + "fname = \"two_undirected_circles.svg\"\n", + "graph.write_svg(fname)\n", + "display(SVG(filename=fname))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, the first procedure just says \"there are two 1D holes\"." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "VietorisRipsPersistence(metric=\"precomputed\").fit_transform_plot([two_undirected_circles]);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The second procedure is again much more informative, yielding a persistence diagram with two points in homology dimension 1 with different finite deaths, each corresponding to one of the two circles:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_ggd = GraphGeodesicDistance(directed=False, unweighted=True).fit_transform([two_undirected_circles])\n", + "VietorisRipsPersistence(metric=\"precomputed\").fit_transform_plot(X_ggd);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Directed graphs – ``FlagserPersistence``\n", + "\n", + "Together with the companion package ``pyflagser`` ([source code](https://github.com/giotto-ai/pyflagser), [API reference](https://docs-pyflagser.giotto.ai/)), ``giotto-tda`` can extract topological features from *directed* graphs via the ``FlagserPersistence`` transformer.\n", + "\n", + "Unlike ``VietorisRipsPersistence`` and ``SparseRipsPersistence``, ``FlagserPersistence`` *only* works on graph data, so there is no ``metric`` parameter to be set. The conventions on input data are the same as in the undirected case, cf. section ***Non-fully connected weighted graphs*** above.\n", + "\n", + "The ideas and constructions underlying the algorithm in this case are very similar to the ones described above for the undirected case. Again, we threshold the graph and its directed edges according to an ever-increasing parameter and the edge weights. And again we look at \"cliques\" of vertices to define simplices and hence a \"complex\" for each value of the parameter. The main difference is that here simplices are **ordered** sets (tuples) of vertices, and that in each instantaneous complex the \"clique\" $(v_0, v_1, \\ldots, v_k)$ is a $k$-simplex if and only if, for each $i < j$, $(v_i, v_j)$ is a currently present directed edge.\n", + "\n", + "| (1, 2, 3) ***is not*** a 2-simplex in the complex | (1, 2, 3) ***is*** a 2-simplex in the complex |\n", + "| :-- | :-- |\n", + "| ![Directed flag complex with a hole](images/nontrivial_cycle_directed_flag_complex.svg) | ![Directed flag complex without a hole](images/simplex_directed_flag_complex.svg) |\n", + "| (1, 2), (2, 3) and (3, 1) **form a 1D hole** | (1, 2), (2, 3) and (1, 3) form the boundary of (1, 2, 3) – **not a 1D hole** |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This has interesting consequences: in the examples above, the left complex, in which the edges of the triangle \"loop around\" in the same direction, contains a 1D hole. 
On the other hand, the right one does not!\n", + "\n", + "### Example 1: Directed circle\n", + "\n", + "Let's try this on a \"directed\" version of the circle from earlier:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n_vertices = 10\n", + "\n", + "directed_circle = make_circle_adjacency(n_vertices, directed=True)\n", + "row, col = directed_circle.nonzero()\n", + "\n", + "graph = Graph(n=n_vertices, edges=list(zip(row, col)), directed=True)\n", + "fname = \"directed_circle.svg\"\n", + "graph.write_svg(fname)\n", + "display(SVG(filename=fname))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Passing this directly to ``FlagserPersistence`` gives an unsurprising result:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "FlagserPersistence().fit_transform_plot([directed_circle]);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, we can chain with an instance of ``GraphGeodesicDistance`` to get more information:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "X_ggd = GraphGeodesicDistance(directed=True, unweighted=True).fit_transform([directed_circle])\n", + "FlagserPersistence().fit_transform_plot(X_ggd);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that this time the death time of the circular feature is circa one half of the number of vertices/edges. Compare this with the one-third factor we observed in the case of ``VietorisRipsPersistence``.\n", + "\n", + "### Example 2: Circle with alternating edge directions\n", + "\n", + "What happens when we make some of the edges flow the other way around the circle?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "row_flipped = np.concatenate([row[::2], col[1::2]])\n", + "column_flipped = np.concatenate([col[::2], row[1::2]])\n", + "\n", + "graph = Graph(n=n_vertices, edges=list(zip(row_flipped, column_flipped)), directed=True)\n", + "fname = \"directed_circle.svg\"\n", + "graph.write_svg(fname)\n", + "display(SVG(filename=fname))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Construct the adjacency matrix\n", + "weights = np.ones(n_vertices)\n", + "directed_circle_flipped = csr_matrix((weights, (row_flipped, column_flipped)),\n", + " shape=(n_vertices, n_vertices))\n", + "\n", + "# Run FlagserPersistence directly on the adjacency matrix\n", + "FlagserPersistence().fit_transform_plot([directed_circle_flipped]);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is identical to the persistence diagram for the directed circle (and for the undirected circle using ``VietorisRipsPersistence``). We cannot tell the difference between the two directed graphs when the adjacency matrices are fed directly to ``FlagserPersistence``. 
Let's try with ``GraphGeodesicDistance``:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_ggd = GraphGeodesicDistance(directed=True, unweighted=True).fit_transform([directed_circle_flipped])\n", + "FlagserPersistence().fit_transform_plot(X_ggd);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As in the case of the directed circle, the one-dimensional feature is born at 1. However, unlike that case, it persists all the way to infinity even after preprocessing with ``GraphGeodesicDistance``!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example 3: Two oppositely-directed semicircles\n", + "\n", + "Our final example consists of a circle one half of which \"flows\" in one direction, and the other half in the other." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "row_two_semicircles = np.concatenate([row[:n_vertices // 2], col[n_vertices // 2:]])\n", + "column_two_semicircles = np.concatenate([col[:n_vertices // 2], row[n_vertices // 2:]])\n", + "\n", + "graph = Graph(n=n_vertices, edges=list(zip(row_two_semicircles, column_two_semicircles)), directed=True)\n", + "fname = \"two_directed_semicircles.svg\"\n", + "graph.write_svg(fname)\n", + "display(SVG(filename=fname))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Construct the adjacency matrix\n", + "weights = np.ones(n_vertices)\n", + "two_semicircles = csr_matrix((weights, (row_two_semicircles, column_two_semicircles)),\n", + " shape=(n_vertices, n_vertices))\n", + "\n", + "# Run FlagserPersistence directly on the adjacency matrix\n", + "FlagserPersistence().fit_transform_plot([two_semicircles]);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, we passed the adjacency matrix directly and obtained the same persistence diagram as for the undirected circle. Let's try preprocessing with ``GraphGeodesicDistance``:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_ggd = GraphGeodesicDistance(directed=True, unweighted=True).fit_transform([two_semicircles])\n", + "FlagserPersistence(directed=True).fit_transform_plot(X_ggd);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is similar to the persistence diagram for the coherently directed circle, but the death time for the topological feature in dimension 1 is slightly lower." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Where to next?\n", + "\n", + "- Persistence diagrams are great for data exploration, but to feed their content to machine learning algorithms one must make sure the algorithm used is **independent of the relative ordering** of the birth-death pairs in each homology dimension. [gtda.diagrams](https://giotto-ai.github.io/gtda-docs/latest/modules/diagrams.html) contains a suite of vector representations, feature extraction methods and kernel methods that convert persistence diagrams into data structures ready for machine learning algorithms. 
Simple examples of their use are contained in [Topological feature extraction using VietorisRipsPersistence and PersistenceEntropy](https://giotto-ai.github.io/gtda-docs/latest/notebooks/vietoris_rips_quickstart.html), [Case study: Classification of shapes](https://giotto-ai.github.io/gtda-docs/latest/notebooks/classifying_shapes.html) and [Case study: Lorenz attractor](https://giotto-ai.github.io/gtda-docs/latest/notebooks/lorenz_attractor.html).\n", + "- In addition to ``GraphGeodesicDistance``, [gtda.graphs](https://giotto-ai.github.io/gtda-docs/latest/modules/graphs.html) also contains transformers for the creation of graphs from point cloud or time series data.\n", + "- Despite the name, [gtda.point_clouds](https://giotto-ai.github.io/gtda-docs/latest/modules/point_clouds.html) contains transformers for the alteration of distance matrices (which are just adjacency matrices of weighted graphs) as a preprocessing step for persistent homology.\n", + "- ``VietorisRipsPersistence`` builds on the [ripser.py](https://ripser.scikit-tda.org/index.html) project. Its website contains two tutorials on additional ways in which graphs can be constructed from [time series data](https://ripser.scikit-tda.org/notebooks/Lower%20Star%20Time%20Series.html) or [image data](https://ripser.scikit-tda.org/notebooks/Lower%20Star%20Image%20Filtrations.html), and fed to the clique complex filtration construction. With a few simple modifications, the code can be adapted to the API of ``VietorisRipsPersistence``." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/plotting_api.ipynb b/examples/plotting_api.ipynb index 22dc13186..a8639fd68 100644 --- a/examples/plotting_api.ipynb +++ b/examples/plotting_api.ipynb @@ -4,13 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Plotting in `giotto-tda`\n", + "# Plotting in ``giotto-tda``\n", "\n", - "`giotto-tda` includes a set of plotting functions and class methods, powered by `plotly`. The library's plotting API is designed to facilitate the exploration of intermediate results in pipelines by harnessing the highly visual nature of topological signatures.\n", + "``giotto-tda`` includes a set of plotting functions and class methods, powered by ``plotly``. The library's plotting API is designed to facilitate the exploration of intermediate results in pipelines by harnessing the highly visual nature of topological signatures.\n", "\n", - "This notebook is a quick tutorial on how to use `giotto-tda`'s plotting functionalities and unified plotting API. The plotting functions in `gtda.mapper` are not covered here as they are somewhat tailored to the Mapper algorithm, see the [dedicated tutorial](https://giotto-ai.github.io/gtda-docs/latest/notebooks/mapper_quickstart.html).\n", + "This notebook is a quick tutorial on how to use ``giotto-tda``'s plotting functionalities and unified plotting API. 
The plotting functions in ``gtda.mapper`` are not covered here as they are somewhat tailored to the Mapper algorithm, see the [dedicated tutorial](https://giotto-ai.github.io/gtda-docs/latest/notebooks/mapper_quickstart.html).\n", "\n", - "If you are looking at a static version of this notebook and would like to run its contents, head over to [github](https://github.com/giotto-ai/giotto-tda/blob/master/examples/plotting_api.ipynb).\n", + "If you are looking at a static version of this notebook and would like to run its contents, head over to [GitHub](https://github.com/giotto-ai/giotto-tda/blob/master/examples/plotting_api.ipynb) and download the source.\n", "\n", "**License: AGPLv3**" ] @@ -19,24 +19,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 1. Basic philosophy and `plot` methods" + "## 1. Basic philosophy and ``plot`` methods" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The computational building blocks of `giotto-tda` are `scikit-learn`–style estimators. Typically, they are also transformers, i.e. they possess a `transform` and/or a `fit-transform` method which:\n", + "The computational building blocks of ``giotto-tda`` are ``scikit-learn``–style estimators. Typically, they are also transformers, i.e. they possess a ``transform`` and/or a ``fit-transform`` method which:\n", "\n", - "- act on an array-like object `X` which collects a certain number of \"samples\" of a given kind;\n", - "- return a transformed array-like object `Xt` which collects a (potentially different) number of \"samples\" of a potentially different kind.\n", + "- act on an array-like object ``X`` which collects a certain number of \"samples\" of a given kind;\n", + "- return a transformed array-like object ``Xt`` which collects a (potentially different) number of \"samples\" of a potentially different kind.\n", "\n", - "The basic philosophy of `giotto-tda`'s class-level plotting API is to equip relevant transformers with `plot` methods taking two main arguments:\n", + "The basic philosophy of ``giotto-tda``'s class-level plotting API is to equip relevant transformers with ``plot`` methods taking two main arguments:\n", "\n", - "- an object such as `Xt` above (i.e. consistent with the *outputs* of `transform` or `fit-transform`);\n", - "- an integer index passed via the `sample` keyword and indicating which sample in `Xt` should be plotted.\n", + "- an object such as ``Xt`` above (i.e. consistent with the *outputs* of ``transform`` or ``fit-transform``);\n", + "- an integer index passed via the ``sample`` keyword and indicating which sample in ``Xt`` should be plotted.\n", "\n", - "In other words, `.plot(Xt, sample=i)` will produce a plot of `Xt[i]` which is tailored to the nature of the samples in `Xt`." + "In other words, ``.plot(Xt, sample=i)`` will produce a plot of ``Xt[i]`` which is tailored to the nature of the samples in ``Xt``." ] }, { @@ -45,16 +45,16 @@ "source": [ "### 1.1 Plotting functions\n", "\n", - "Several `plot` methods in `giotto-tda` actually fall back to specialised functions which can be found in the [plotting subpackage](https://giotto-ai.github.io/gtda-docs/latest/modules/plotting.html) and which can be used directly instead. However, unless the additional degree of control is necessary, `plot` methods should be preferred as they often exploit class parameters and/or attributes (e.g. those computed during `fit`) to automatically fill some parameters in the corresponding functions." 
+ "Several ``plot`` methods in ``giotto-tda`` actually fall back to specialised functions which can be found in the [plotting subpackage](https://giotto-ai.github.io/gtda-docs/latest/modules/plotting.html) and which can be used directly instead. However, unless the additional degree of control is necessary, ``plot`` methods should be preferred as they often exploit class parameters and/or attributes (e.g. those computed during ``fit``) to automatically fill some parameters in the corresponding functions." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 1.2 Example: Plotting persistence diagrams with `VietorisRipsPersistence`\n", + "### 1.2 Example: Plotting persistence diagrams with ``VietorisRipsPersistence``\n", "\n", - "Let's take the example of `VietorisRipsPersistence` – a transformer also covered in [another notebook](https://giotto-ai.github.io/gtda-docs/latest/notebooks/vietoris_rips_quickstart.html). Let's create the input collection `X` for this transformer as a collection of randomly generated point clouds, each containing 100 points positioned along two circles." + "Let's take the example of ``VietorisRipsPersistence`` – a transformer also covered in [another notebook](https://giotto-ai.github.io/gtda-docs/latest/notebooks/vietoris_rips_quickstart.html). Let's create the input collection ``X`` for this transformer as a collection of randomly generated point clouds, each containing 100 points positioned along two circles." ] }, { @@ -78,7 +78,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Incidentally, samples in `X` can be plotted using `gtda.plotting.plot_point_cloud`." + "Incidentally, samples in ``X`` can be plotted using ``gtda.plotting.plot_point_cloud``." ] }, { @@ -96,7 +96,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let us instantiate a `VietorisRipsTransformer` object, and call the `fit-transform` method on `X` to obtain the transformed object `Xt`." + "Let us instantiate a ``VietorisRipsTransformer`` object, and call the ``fit-transform`` method on ``X`` to obtain the transformed object ``Xt``." ] }, { @@ -113,9 +113,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "For any sample index i, `Xt[i]` is a two-dimensional array encoding the multi-scale topological information which can be extracted from the i-th point cloud `X[i]`.\n", + "For any sample index i, ``Xt[i]`` is a two-dimensional array encoding the multi-scale topological information which can be extracted from the i-th point cloud ``X[i]``.\n", "\n", - "It is typically too difficult to get a quick idea of the interesting information contained in `Xt[i]` by looking at the array directly. This information is best displayed as a so-called \"persistence diagram\" in 2D. The `plot` method of our `VietorisRipsPersistence` instance achieves precisely this:" + "It is typically too difficult to get a quick idea of the interesting information contained in ``Xt[i]`` by looking at the array directly. This information is best displayed as a so-called \"persistence diagram\" in 2D. The ``plot`` method of our ``VietorisRipsPersistence`` instance achieves precisely this:" ] }, { @@ -131,7 +131,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the case of `VietorisRipsPersistence`, `plot` is a thin wrapper around the function `gtda.plotting.plot_diagram`, so the same result could have been achieved by importing that function and calling `plot_diagram(Xt[i])`." 
+ "In the case of ``VietorisRipsPersistence``, ``plot`` is a thin wrapper around the function ``gtda.plotting.plot_diagram``, so the same result could have been achieved by importing that function and calling ``plot_diagram(Xt[i])``." ] }, { @@ -145,23 +145,23 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 2 Derived convenience methods: `transform_plot` and `fit_transform_plot`\n", + "## 2 Derived convenience methods: ``transform_plot`` and ``fit_transform_plot``\n", "\n", - "Where appropriate, `giotto-tda` transformers which have a `plot` method can also implement the two derived methods `transform_plot` and `fit_transform_plot`." + "Where appropriate, ``giotto-tda`` transformers which have a ``plot`` method can also implement the two derived methods ``transform_plot`` and ``fit_transform_plot``." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 2.1 `transform_plot`\n", + "### 2.1 ``transform_plot``\n", "\n", "This method takes two main arguments:\n", "\n", - "- an object such as `X` above (i.e. consistent with the *inputs* of `transform` or `fit-transform`);\n", - "- an integer index i passed via the `sample` keyword.\n", + "- an object such as ``X`` above (i.e. consistent with the *inputs* of ``transform`` or ``fit-transform``);\n", + "- an integer index i passed via the ``sample`` keyword.\n", "\n", - "The logic of `transform_plot` can be roughly described as follows: first, the sample `X[i]` is transformed; then, the result is plotted using `plot` and returned. [More technically: we first create a trivial collection `X_sing = [X[i]]`, which contains a single sample from `X`. Then, we compute `Xt_sing = .transform(X_sing)`. Assuming `Xt_sing` contains a single transformed sample, we call `.plot(Xt_sing, sample=0)`, and also return `Xt_sing`.]\n", + "The logic of ``transform_plot`` can be roughly described as follows: first, the sample ``X[i]`` is transformed; then, the result is plotted using ``plot`` and returned. [More technically: we first create a trivial collection ``X_sing = [X[i]]``, which contains a single sample from ``X``. Then, we compute ``Xt_sing = .transform(X_sing)``. 
Assuming ``Xt_sing`` contains a single transformed sample, we call ``.plot(Xt_sing, sample=0)``, and also return ``Xt_sing``.]\n", "\n", "In the example of Section 1.2, we would do:" ] @@ -181,9 +181,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 2.2 `fit_transform_plot`\n", + "### 2.2 ``fit_transform_plot``\n", "\n", - "This method is equivalent to first fitting the transformer using `X` (and, optionally, a target variable `y`), and then calling `transform_plot` on `X` and a given sample index.\n", + "This method is equivalent to first fitting the transformer using ``X`` (and, optionally, a target variable ``y``), and then calling ``transform_plot`` on ``X`` and a given sample index.\n", "\n", "The workflow in the example of Section 1.2 can be simplified even further, turning the entire process into a simple one-liner:" ] diff --git a/examples/time_series_classification.ipynb b/examples/time_series_classification.ipynb new file mode 100644 index 000000000..b10bbcfb1 --- /dev/null +++ b/examples/time_series_classification.ipynb @@ -0,0 +1,895 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Topology of time series\n", + "\n", + "This notebook explores how ``giotto-tda`` can be used to gain insights from time-varying data by using ideas from dynamical systems and persistent homology.\n", + "\n", + "If you are looking at a static version of this notebook and would like to run its contents, head over to [GitHub](https://github.com/giotto-ai/giotto-tda/blob/master/examples/time_series_classification.ipynb) and download the source.\n", + "\n", + "## Useful references\n", + "\n", + "* [Topological Methods for the Analysis of Data](https://youtu.be/DZwK2gT-d8g) by Jose Perea\n", + "* The sliding window notebooks from Chris Tralie's [TDALabs](https://github.com/ctralie/TDALabs)\n", + "* [Detection of gravitational waves using topological data analysis and convolutional neural network: An improved approach](https://arxiv.org/abs/1910.08245) by Christopher Bresten and Jae-Hun Jung. We thank Christopher Bresten for sharing the code and data used in the article.\n", + "\n", + "## See also\n", + "\n", + "- [Topology in time series forecasting](https://giotto-ai.github.io/gtda-docs/latest/notebooks/time_series_forecasting.html), in which the *Takens embedding* technique that we will present here is used in time series forecasting tasks by using sliding windows.\n", + "- [Topological feature extraction using VietorisRipsPersistence and PersistenceEntropy](https://giotto-ai.github.io/gtda-docs/latest/notebooks/vietoris_rips_quickstart.html) for a quick introduction to general topological feature extraction in ``giotto-tda``.\n", + "\n", + "**License: AGPLv3**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## From time series to time delay embeddings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The first step in analysing the topology of time series is to construct a _**time delay embedding**_ or _**Takens embedding**_, named after [Floris Takens](https://en.wikipedia.org/wiki/Floris_Takens) who pioneered its use in the study of [dynamical systems](https://en.wikipedia.org/wiki/Takens's_theorem). A time delay embedding can be thought of as sliding a "window" of fixed size over a signal, with each window represented as a point in a (possibly) higher-dimensional space. 
A simple example is shown in the animation below, where pairs of points in a 1-dimensional signal are mapped to coordinates in a 2-dimensional embedding space. \n", + "\n", + "![A 2-dimensional time delay embedding](images/time_delay_embedding.gif)\n", + "\n", + "More formally, given a time series $f(t)$, one can extract a _**sequence of vectors**_ of the form $f_i = [f(t_i), f(t_i + \\tau), f(t_i + 2 \\tau), \\ldots, f(t_i + (d-1) \\tau)] \\in \\mathbb{R}^{d}$, where $d$ is the _**embedding dimension**_ and $\\tau$ is the _**time delay**_. The quantity $(d-1)\\tau$ is known as the \"window size\" and the difference between $t_{i+1}$ and $t_i$ is called the **_stride_**. In other words, the time delay embedding of $f$ with parameters $(d,\\tau)$ is the function\n", + "\n", + "$$\n", + "TD_{d,\\tau} f : \\mathbb{R} \\to \\mathbb{R}^{d}\\,, \\qquad t \\to \\begin{bmatrix}\n", + " f(t) \\\\\n", + " f(t + \\tau) \\\\\n", + " f(t + 2\\tau) \\\\\n", + " \\vdots \\\\\n", + " f(t + (d-1)\\tau)\n", + " \\end{bmatrix}\n", + "$$\n", + "\n", + "\n", + "and the main idea we will explore in this notebook is that if $f$ has a non-trivial recurrent structure, then the image of $TD_{d,\\tau}f$ will have non-trivial topology for appropriate choices of $(d, \\tau)$." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A periodic example" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a warm-up, recall that a function is periodic with period $T > 0$ if $f(t + T) = f(t)$ for all $t \\in \\mathbb{R}$. For example, consider the function $f(t) = \\cos(5 t)$ which can be visualised as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import plotly.graph_objects as go\n", + "\n", + "x_periodic = np.linspace(0, 10, 1000)\n", + "y_periodic = np.cos(5 * x_periodic)\n", + "\n", + "fig = go.Figure(data=go.Scatter(x=x_periodic, y=y_periodic))\n", + "fig.update_layout(xaxis_title=\"Timestamp\", yaxis_title=\"Amplitude\")\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can show that periodicity implies ellipticity of the time delay embedding. To do that we need to specify the embedding dimension $d$ and the time delay $\\tau$ for the Takens embedding, which in `giotto-tda` can be achieved as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gtda.time_series import SingleTakensEmbedding\n", + "\n", + "embedding_dimension_periodic = 3\n", + "embedding_time_delay_periodic = 8\n", + "stride = 10\n", + "\n", + "embedder_periodic = SingleTakensEmbedding(\n", + " parameters_type=\"fixed\",\n", + " n_jobs=2,\n", + " time_delay=embedding_time_delay_periodic,\n", + " dimension=embedding_dimension_periodic,\n", + " stride=stride,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Tip:** You can use the `stride` parameter to downsample the time delay embedding. This is handy when you want to quickly compute persistence diagrams on a dense signal."
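+    "\n",
+    "To make the definition of $TD_{d,\\tau}$ above concrete, here is a bare-bones NumPy version of the map (with a stride thrown in). This is only an illustrative sketch: the helper name is ours and this is not how ``SingleTakensEmbedding`` is implemented internally, but it shows exactly which samples end up in each embedding vector:\n",
+    "\n",
+    "```python\n",
+    "import numpy as np\n",
+    "\n",
+    "def takens_embedding_sketch(x, time_delay, dimension, stride=1):\n",
+    "    # Row i is the vector [x[i], x[i + time_delay], ..., x[i + (dimension - 1) * time_delay]]\n",
+    "    window_span = (dimension - 1) * time_delay\n",
+    "    starts = np.arange(0, len(x) - window_span, stride)\n",
+    "    return np.stack([x[i : i + window_span + 1 : time_delay] for i in starts])\n",
+    "\n",
+    "takens_embedding_sketch(np.arange(10), time_delay=2, dimension=3)\n",
+    "# array([[0, 2, 4], [1, 3, 5], [2, 4, 6], [3, 5, 7], [4, 6, 8], [5, 7, 9]])\n",
+    "```"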
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's apply this embedding to our one-dimensional time series to get a 3-dimensional _point cloud_:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y_periodic_embedded = embedder_periodic.fit_transform(y_periodic)\n", + "print(f\"Shape of embedded time series: {y_periodic_embedded.shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can then use `giotto-tda`'s plotting API to visualise the result:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gtda.plotting import plot_point_cloud\n", + "\n", + "plot_point_cloud(y_periodic_embedded)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As promised, the periodicity of $f$ is reflected in the ellipticity of the time delay embedding! It turns out that in general, _**periodic functions trace out ellipses**_ in $\\mathbb{R}^{d}$. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A non-periodic example" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is another type of recurrent behaviour: if we let $f(t) = \\cos(t) + \\cos(\\pi t)$ then it follows that $f$ is not periodic since the ratio of the two frequencies is irrational, i.e. we say that $\\cos(t)$ and $\\cos(\\pi t)$ are _incommensurate_. Nevertheless, their sum produces recurrent behaviour:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x_nonperiodic = np.linspace(0, 50, 1000)\n", + "y_nonperiodic = np.cos(x_nonperiodic) + np.cos(np.pi * x_nonperiodic)\n", + "\n", + "fig = go.Figure(data=go.Scatter(x=x_nonperiodic, y=y_nonperiodic))\n", + "fig.update_layout(xaxis_title=\"Timestamp\", yaxis_title=\"Amplitude\")\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As before, let's create a time delay embedding for this signal and visualise the resulting point cloud:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_dimension_nonperiodic = 3\n", + "embedding_time_delay_nonperiodic = 16\n", + "stride = 3\n", + "\n", + "embedder_nonperiodic = SingleTakensEmbedding(\n", + " parameters_type=\"fixed\",\n", + " n_jobs=2,\n", + " time_delay=embedding_time_delay_nonperiodic,\n", + " dimension=embedding_dimension_nonperiodic,\n", + " stride=stride,\n", + ")\n", + "\n", + "y_nonperiodic_embedded = embedder_nonperiodic.fit_transform(y_nonperiodic)\n", + "\n", + "plot_point_cloud(y_nonperiodic_embedded)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## From time delay embeddings to persistence diagrams" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the examples above we saw that the resulting point clouds appear to exhibit distinct topology. We can verify this explicitly using persistent homology! 
First we need to reshape our point cloud arrays in a form suitable for the [VietorisRipsPersistence transformer](https://giotto-ai.github.io/gtda-docs/latest/notebooks/vietoris_rips_quickstart.html), namely `(n_samples, n_points, n_dimensions)`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y_periodic_embedded = y_periodic_embedded[None, :, :]\n", + "y_nonperiodic_embedded = y_nonperiodic_embedded[None, :, :]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next step is to calculate the persistence diagrams associated with each point cloud. In `giotto-tda` we can do this with the Vietoris-Rips construction as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gtda.homology import VietorisRipsPersistence\n", + "\n", + "# 0 - connected components, 1 - loops, 2 - voids\n", + "homology_dimensions = [0, 1, 2]\n", + "\n", + "periodic_persistence = VietorisRipsPersistence(\n", + " homology_dimensions=homology_dimensions, n_jobs=6\n", + ")\n", + "print(\"Persistence diagram for periodic signal\")\n", + "periodic_persistence.fit_transform_plot(y_periodic_embedded)\n", + "\n", + "nonperiodic_persistence = VietorisRipsPersistence(\n", + " homology_dimensions=homology_dimensions, n_jobs=6\n", + ")\n", + "print(\"Persistence diagram for nonperiodic signal\")\n", + "nonperiodic_persistence.fit_transform_plot(y_nonperiodic_embedded);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What can we conclude from these diagrams? The first thing that stands out is the different types of homology dimensions that are most persistent. In the periodic case we see a single point associated with 1-dimensional persistent homology, namely a loop! On the other hand, the non-periodic signal has revealed two points associated with 2-dimensional persistent homology, namely _voids_. These clear differences in topology make the time delay embedding technique especially powerful at classifying different time series." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Picking the embedding dimension and time delay" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the examples above, we manually chose values for the embedding dimension $d$ and time delay $\\tau$. However, it turns out there are two techniques that can be used to determine these parameters _automatically_:\n", + "\n", + "* [Mutual information](https://en.wikipedia.org/wiki/Mutual_information) to determine $\\tau$\n", + "* [False nearest neighbours](https://en.wikipedia.org/wiki/False_nearest_neighbor_algorithm) to determine $d$\n", + "\n", + "In `giotto-tda`, these techniques are applied when we select `parameters_type=\"search\"` in the `SingleTakensEmbedding` transformer, e.g.\n", + "\n", + "```python\n", + "embedder = SingleTakensEmbedding(\n", + " parameters_type=\"search\", time_delay=time_delay, dimension=embedding_dimension,\n", + ")\n", + "```\n", + "\n", + "where the values of `time_delay` and `embedding_dimension` provide _**upper bounds**_ on the search algorithm. Before applying this to our sample signals, let's have a look at how these methods actually work under the hood." 
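+    "\n",
+    "As a preview of the first criterion, the block below sketches a hand-rolled estimate of the mutual information between a signal and a delayed copy of itself, using NumPy histograms. The helper name and the number of bins are our own choices, and ``giotto-tda``'s internal implementation may differ in its details:\n",
+    "\n",
+    "```python\n",
+    "def mutual_information_sketch(x, tau, n_bins=64):\n",
+    "    \"\"\"Rough estimate of the mutual information between x[i] and x[i + tau].\"\"\"\n",
+    "    joint_counts, _, _ = np.histogram2d(x[:-tau], x[tau:], bins=n_bins)\n",
+    "    p_joint = joint_counts / joint_counts.sum()\n",
+    "    p_left = p_joint.sum(axis=1, keepdims=True)\n",
+    "    p_right = p_joint.sum(axis=0, keepdims=True)\n",
+    "    nonzero = p_joint > 0\n",
+    "    return np.sum(p_joint[nonzero] * np.log(p_joint[nonzero] / (p_left * p_right)[nonzero]))\n",
+    "\n",
+    "# The first local minimum of this curve suggests a good time delay for y_periodic\n",
+    "mi_curve = [mutual_information_sketch(y_periodic, tau) for tau in range(1, 30)]\n",
+    "```"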
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Mutual information\n", + "To determine an optimal value for $\\tau$ we first calculate the maximum $x_\\mathrm{max}$ and minimum $x_\\mathrm{min}$ values of the time series, and divide the interval $[x_\\mathrm{min}, x_\\mathrm{max}]$ into a large number of bins. We let $p_k$ be the probability that an element of the time series is in the $k$th bin and let $p_{j,k}$ be the probability that $x_i$ is in the $j$th bin while $x_{i+\\tau}$ is in the $k$th bin. Then the mutual information is defined as:\n", + "\n", + "$$ I(\\tau) = \\sum_{j=1}^{n_\\mathrm{bins}} \\sum_{k=1}^{n_\\mathrm{bins}} p_{j,k}(\\tau) \\log \\frac{p_{j,k}(\\tau)}{p_j p_k} $$\n", + "\n", + "The first minimum of $I(\\tau)$ gives the optimal time delay since there we get the most information by adding $x_{i+\\tau}$." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### False nearest neighbours\n", + "\n", + "The false nearest neighbours algorithm is based on the assumption that \"unfolding\" or embedding a deterministic system into successively higher dimensions is smooth. In other words, points which are close in one embedding dimension should be close in a higher one. More formally, if we have a point $p_i$ and neighbour $p_j$, we check if the normalised distance $R_i$ for the next dimension is greater than some threshold $R_\\mathrm{th}$:\n", + "\n", + "$$ R_i = \\frac{\\mid x_{i+m\\tau} - x_{j+m\\tau} \\mid}{\\lVert p_i - p_j \\rVert} > R_\\mathrm{th}$$\n", + "\n", + "If $R_i > R_\\mathrm{th}$ then we have a \"false nearest neighbour\" and the optimal embedding dimension is obtained by minimising the total number of such neighbours." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Running the search algorithm\n", + "\n", + "Let's now apply these ideas to our original signals to see what the algorithm determines as optimal choices for $d$ and $\\tau$. 
We will allow the search to scan up to relatively large values of $(d, \\tau)$ to ensure we do not get stuck in a sub-optimal minimum.\n", + "\n", + "For the periodic signal, we initialise the Takens embedding as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "max_embedding_dimension = 30\n", + "max_time_delay = 30\n", + "stride = 5\n", + "\n", + "embedder_periodic = SingleTakensEmbedding(\n", + " parameters_type=\"search\",\n", + " time_delay=max_time_delay,\n", + " dimension=max_embedding_dimension,\n", + " stride=stride,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create a helper function to view the optimal values found during the search:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def fit_embedder(embedder: SingleTakensEmbedding, y: np.ndarray, verbose: bool=True) -> np.ndarray:\n", + " \"\"\"Fits a Takens embedder and displays optimal search parameters.\"\"\"\n", + " y_embedded = embedder.fit_transform(y)\n", + "\n", + " if verbose:\n", + " print(f\"Shape of embedded time series: {y_embedded.shape}\")\n", + " print(\n", + " f\"Optimal embedding dimension is {embedder.dimension_} and time delay is {embedder.time_delay_}\"\n", + " )\n", + "\n", + " return y_embedded" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y_periodic_embedded = fit_embedder(embedder_periodic, y_periodic)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Although the resulting embedding is in a high dimensional space, we can apply dimensionality reduction techniques like [principal component analysis](https://en.wikipedia.org/wiki/Principal_component_analysis) to project down to 3-dimensions for visualisation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.decomposition import PCA\n", + "\n", + "pca = PCA(n_components=3)\n", + "y_periodic_embedded_pca = pca.fit_transform(y_periodic_embedded)\n", + "plot_point_cloud(y_periodic_embedded_pca)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now for the non-periodic case we have:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embedder_nonperiodic = SingleTakensEmbedding(\n", + " parameters_type=\"search\",\n", + " n_jobs=2,\n", + " time_delay=max_time_delay,\n", + " dimension=max_embedding_dimension,\n", + " stride=stride,\n", + ")\n", + "\n", + "y_nonperiodic_embedded = fit_embedder(embedder_nonperiodic, y_nonperiodic)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pca = PCA(n_components=3)\n", + "y_nonperiodic_embedded_pca = pca.fit_transform(y_nonperiodic_embedded)\n", + "plot_point_cloud(y_nonperiodic_embedded_pca)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So we have embedding point clouds whose geometry looks clearly distinct; how about the persistence diagrams? 
As we did earlier, we first need to reshape our arrays into the form `(n_samples, n_points, n_dimensions)`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y_periodic_embedded = y_periodic_embedded[None, :, :]\n", + "y_nonperiodic_embedded = y_nonperiodic_embedded[None, :, :]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next step is to calculate the persistence diagrams associated with each point cloud:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "homology_dimensions = [0, 1, 2]\n", + "\n", + "periodic_persistence = VietorisRipsPersistence(homology_dimensions=homology_dimensions)\n", + "print(\"Persistence diagram for periodic signal\")\n", + "periodic_persistence.fit_transform_plot(y_periodic_embedded)\n", + "\n", + "nonperiodic_persistence = VietorisRipsPersistence(\n", + " homology_dimensions=homology_dimensions, n_jobs=6\n", + ")\n", + "print(\"Persistence diagram for nonperiodic signal\")\n", + "nonperiodic_persistence.fit_transform_plot(y_nonperiodic_embedded);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this case the persistence diagram for the periodic signal is essentially unchanged, but the non-periodic signal now reveals two $H_1$ points and one $H_2$ one - the signature of a hypertorus! It turns out that, in general, the image under $TD_{d,\\tau}$ of a sum of periodic signals with incommensurate frequencies is a hypertorus." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Gravitational wave detection\n", + "As an application of the above ideas, let's examine how persistent homology can help detect gravitational waves in noisy signals. The following is adapted from the article by Christopher Bresten and Jae-Hun Jung. As shown in the videos below, we will aim to pick out the \"chirp\" signal of two colliding black holes from a very noisy background." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import YouTubeVideo\n", + "\n", + "YouTubeVideo(\"Y3eR49ogsF0\", width=600, height=400)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "YouTubeVideo(\"QyDcTbR-kEA\", width=600, height=400)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generate the data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the article, the authors create a synthetic training set as follows: \n", + "\n", + "* Generate gravitational wave signals that correspond to non-spinning binary black hole mergers\n", + "* Generate a noisy time series and embed a gravitational wave signal with probability 0.5 at a random time.\n", + "\n", + "The result is a set of time series of the form\n", + "\n", + "$$ s = g + \\epsilon \\frac{1}{R}\\xi $$\n", + "\n", + "where $g$ is a gravitational wave signal from the reference set, $\\xi$ is Gaussian noise, $\\epsilon=10^{-19}$ scales the noise amplitude to the signal, and $R \\in (0.075, 0.65)$ is a parameter that controls the signal-to-noise-ratio (SNR)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Constant signal-to-noise ratio" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a warmup, let's generate some noisy signals with a constant SNR of 17.98. As shown in Table 1 of the article, this corresponds to an $R$ value of 0.65. 
By picking the upper end of the interval, we can gain a sense for what the best possible performance is for our time series classifier. We pick a small number of samples to make the computations run fast, but in practice would scale this by 1-2 orders of magnitude as done in the original article." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from data.generate_datasets import make_gravitational_waves\n", + "from pathlib import Path\n", + "\n", + "R = 0.65\n", + "n_signals = 100\n", + "DATA = Path(\"./data\")\n", + "\n", + "noisy_signals, gw_signals, labels = make_gravitational_waves(\n", + " path_to_data=DATA, n_signals=n_signals, r_min=R, r_max=R, n_snr_values=1\n", + ")\n", + "\n", + "print(f\"Number of noisy signals: {len(noisy_signals)}\")\n", + "print(f\"Number of timesteps per series: {len(noisy_signals[0])}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next let's visualise the two different types of time series that we wish to classify: one that is pure noise vs. one that is composed of noise plus an embedded gravitational wave signal:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from plotly.subplots import make_subplots\n", + "import plotly.graph_objects as go\n", + "\n", + "# get the index corresponding to the first pure noise time series\n", + "background_idx = np.argmin(labels)\n", + "# get the index corresponding to the first noise + gravitational wave time series\n", + "signal_idx = np.argmax(labels)\n", + "\n", + "ts_noise = noisy_signals[background_idx]\n", + "ts_background = noisy_signals[signal_idx]\n", + "ts_signal = gw_signals[signal_idx]\n", + "\n", + "fig = make_subplots(rows=1, cols=2)\n", + "\n", + "fig.add_trace(\n", + " go.Scatter(x=list(range(len(ts_noise))), y=ts_noise, mode=\"lines\", name=\"noise\"),\n", + " row=1,\n", + " col=1,\n", + ")\n", + "\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=list(range(len(ts_background))),\n", + " y=ts_background,\n", + " mode=\"lines\",\n", + " name=\"background\",\n", + " ),\n", + " row=1,\n", + " col=2,\n", + ")\n", + "\n", + "fig.add_trace(\n", + " go.Scatter(x=list(range(len(ts_signal))), y=ts_signal, mode=\"lines\", name=\"signal\"),\n", + " row=1,\n", + " col=2,\n", + ")\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's examine what the time delay embedding of a pure gravitational wave signal looks like:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_dimension = 30\n", + "embedding_time_delay = 30\n", + "stride = 5\n", + "\n", + "embedder = SingleTakensEmbedding(\n", + " parameters_type=\"search\", n_jobs=6, time_delay=embedding_time_delay, dimension=embedding_dimension, stride=stride\n", + ")\n", + "\n", + "y_gw_embedded = fit_embedder(embedder, gw_signals[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we did in our simple examples, we can use PCA to project our high-dimensional space to 3-dimensions for visualisation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pca = PCA(n_components=3)\n", + "y_gw_embedded_pca = pca.fit_transform(y_gw_embedded)\n", + "\n", + "plot_point_cloud(y_gw_embedded_pca)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the plot we can see that the 
decaying periodic signal generated by a black hole merger emerges as a _spiral_ in the time delay embedding space! For contrast, let's compare this to one of the pure noise time series in our sample:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_dimension = 30\n", + "embedding_time_delay = 30\n", + "stride = 5\n", + "\n", + "embedder = SingleTakensEmbedding(\n", + " parameters_type=\"search\", n_jobs=6, time_delay=embedding_time_delay, dimension=embedding_dimension, stride=stride\n", + ")\n", + "\n", + "y_noise_embedded = fit_embedder(embedder, noisy_signals[background_idx])\n", + "\n", + "pca = PCA(n_components=3)\n", + "y_noise_embedded_pca = pca.fit_transform(y_noise_embedded)\n", + "\n", + "plot_point_cloud(y_noise_embedded_pca)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Evidently, pure noise resembles a high-dimensional ball in the time delay embedding space. Let's see if we can use persistent homology to tease apart which time series contain a gravitational wave signal versus those that don't. To do so we will adapt the strategy from the original article:\n", + "\n", + "1. Generate 200-dimensional time delay embeddings of each time series\n", + "2. Use PCA to reduce the time delay embeddings to 3-dimensions\n", + "3. Use the Vietoris-Rips construction to calculate persistence diagrams of $H_0$ and $H_1$ generators\n", + "4. Extract feature vectors using persistence entropy\n", + "5. Train a binary classifier on the topological features" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define the topological feature generation pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can do steps 1 and 2 by using the following ``giotto-tda`` tools:\n", + "\n", + "- The ``TakensEmbedding`` transformer – instead of ``SingleTakensEmbedding`` – which will transform each time series in ``noisy_signals`` separately and return a collection of point clouds;\n", + "- ``CollectionTransformer``, which is a convenience \"meta-estimator\" for applying the same PCA to each point cloud resulting from step 1.\n", + "\n", + "Using the ``Pipeline`` class from ``giotto-tda``, we can chain all operations up to and including step 4 as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gtda.time_series import TakensEmbedding\n", + "from gtda.metaestimators import CollectionTransformer\n", + "from gtda.diagrams import PersistenceEntropy, Scaler\n", + "from gtda.pipeline import Pipeline\n", + "\n", + "embedding_dimension = 200\n", + "embedding_time_delay = 10\n", + "stride = 10\n", + "\n", + "embedder = TakensEmbedding(time_delay=embedding_time_delay,\n", + " dimension=embedding_dimension,\n", + " stride=stride)\n", + "\n", + "batch_pca = CollectionTransformer(PCA(n_components=3), n_jobs=-1)\n", + "\n", + "persistence = VietorisRipsPersistence(homology_dimensions=[0, 1], n_jobs=-1)\n", + "\n", + "scaling = Scaler()\n", + "\n", + "entropy = PersistenceEntropy(normalize=True, nan_fill_value=-10)\n", + "\n", + "\n", + "steps = [(\"embedder\", embedder),\n", + " (\"pca\", batch_pca),\n", + " (\"persistence\", persistence),\n", + " (\"scaling\", scaling),\n", + " (\"entropy\", entropy)]\n", + "topological_transfomer = Pipeline(steps)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features = 
topological_transfomer.fit_transform(noisy_signals)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train and evaluate a model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the final step, let's train a simple classifier on our topological features. As usual we create training and validation sets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_valid, y_train, y_valid = train_test_split(\n", + " features, labels, test_size=0.1, random_state=42\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "and then fit and evaluate our model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import accuracy_score, roc_auc_score\n", + "\n", + "\n", + "def print_scores(fitted_model):\n", + " res = {\n", + " \"Accuracy on train:\": accuracy_score(fitted_model.predict(X_train), y_train),\n", + " \"ROC AUC on train:\": roc_auc_score(\n", + " y_train, fitted_model.predict_proba(X_train)[:, 1]\n", + " ),\n", + " \"Accuracy on valid:\": accuracy_score(fitted_model.predict(X_valid), y_valid),\n", + " \"ROC AUC on valid:\": roc_auc_score(\n", + " y_valid, fitted_model.predict_proba(X_valid)[:, 1]\n", + " ),\n", + " }\n", + "\n", + " for k, v in res.items():\n", + " print(k, round(v, 3))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "model = LogisticRegression()\n", + "model.fit(X_train, y_train)\n", + "print_scores(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a simple baseline, this model is not too bad - it outperforms the deep learning baseline in the article which typically fares little better than random on the raw data. However, the combination of deep learning and persistent homology is where significant performance gains are seen - we leave this as an exercise to the intrepid reader!" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/time_series_forecasting.ipynb b/examples/time_series_forecasting.ipynb new file mode 100644 index 000000000..dc0fa1f7f --- /dev/null +++ b/examples/time_series_forecasting.ipynb @@ -0,0 +1,525 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Topology in time series forecasting\n", + "\n", + "This notebook shows how ``giotto-tda`` can be used to create topological features for time series forecasting tasks, and how to integrate them into ``scikit-learn``–compatible pipelines.\n", + "\n", + "In particular, we will concentrate on topological features which are created from consecutive **sliding windows** over the data. In sliding window models, a single time series array ``X`` of shape ``(n_timestamps, n_features)`` is turned into a time series of windows over the data, with a new shape ``(n_windows, n_samples_per_window, n_features)``. 
There are two main issues that arise when building forecasting models with sliding windows:\n", + "\n", + "1. ``n_windows`` is smaller than ``n_timestamps``. This is because we cannot have more windows than there are timestamps without padding ``X``, and this is not done by ``giotto-tda``. ``n_timestamps - n_windows`` is even larger if we decide to pick a large stride between consecutive windows.\n", + "2. The target variable ``y`` needs to be properly \"aligned\" with each window so that the forecasting problem is meaningful and e.g. we don't \"leak\" information from the future. In particular, ``y`` needs to be \"resampled\" so that it too has length ``n_windows``.\n", + "\n", + "To deal with these issues, ``giotto-tda`` provides a selection of transformers with ``resample``, ``transform_resample`` and ``fit_transform_resample`` methods. These are inherited from a ``TransformerResamplerMixin`` base class. Furthermore, ``giotto-tda`` provides a drop-in replacement for ``scikit-learn``'s ``Pipeline`` which extends it to allow chaining ``TransformerResamplerMixin``s with regular ``scikit-learn`` estimators.\n", + "\n", + "If you are looking at a static version of this notebook and would like to run its contents, head over to [GitHub](https://github.com/giotto-ai/giotto-tda/blob/master/examples/time_series_forecasting.ipynb) and download the source.\n", + "\n", + "## See also\n", + "\n", + "- [Topology of time series](https://giotto-ai.github.io/gtda-docs/latest/notebooks/time_series_classification.html) which explains how transforming time series into point clouds via the *Takens embedding* procedure makes topological feature extraction possible.\n", + "- [Topological feature extraction using VietorisRipsPersistence and PersistenceEntropy](https://giotto-ai.github.io/gtda-docs/latest/notebooks/vietoris_rips_quickstart.html) for a quick introduction to general topological feature extraction in ``giotto-tda``.\n", + "\n", + "**License: AGPLv3**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ``SlidingWindow``\n", + "\n", + "Let us start with a simple example of a \"time series\" ``X`` with a corresponding target ``y`` of the same length." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "n_timestamps = 10\n", + "X, y = np.arange(n_timestamps), np.arange(n_timestamps) - n_timestamps\n", + "X, y" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can instantiate our sliding window transformer-resampler and run it on the pair ``(X, y)``:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gtda.time_series import SlidingWindow\n", + "\n", + "window_size = 3\n", + "stride = 2\n", + "\n", + "SW = SlidingWindow(size=window_size, stride=stride)\n", + "X_sw, yr = SW.fit_transform_resample(X, y)\n", + "X_sw, yr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We note a couple of things:\n", + "- ``fit_transform_resample`` returns a pair: the window-transformed ``X`` and the resampled and aligned ``y``.\n", + "- ``SlidingWindow`` has made a choice for us on how to resample ``y`` and line it up with the windows from ``X``: a window on ``X`` corresponds to the *last* value in a corresponding window over ``y``. 
This is common in time series forecasting where, for example, ``y`` could be a shift of ``X`` by one timestamp.\n", + "- Some of the initial values of ``X`` may not be found in ``X_sw``. This is because ``SlidingWindow`` only ensures the *last* value is represented in the last window, regardless of the stride. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Multivariate time series example: Sliding window + topology ``Pipeline``" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "``giotto-tda``'s topology transformers expect 3D input. But our ``X_sw`` above is 2D. How do we capture interesting properties of the topology of input time series then? For univariate time series, it turns out that a good way is to use the \"time delay embedding\" or \"Takens embedding\" technique explained in the first part of [Topology of time series](https://github.com/giotto-ai/giotto-tda/blob/master/examples/time_series_classification.ipynb). But as this involves an extra layer of complexity, we leave it for later and concentrate for now on an example with a simpler API which also demonstrates the use of a ``giotto-tda`` ``Pipeline``.\n", + "\n", + "Surprisingly, this involves multivariate time series input!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rng = np.random.default_rng(42)\n", + "\n", + "n_features = 2\n", + "\n", + "X = rng.integers(0, high=20, size=(n_timestamps, n_features), dtype=int)\n", + "X" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are interpreting this input as a time series in two variables, of length ``n_timestamps``. The target variable is the same ``y`` as before." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "SW = SlidingWindow(size=window_size, stride=stride)\n", + "X_sw, yr = SW.fit_transform_resample(X, y)\n", + "X_sw, yr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "``X_sw`` is now a complicated-looking array, but it has a simple interpretation. Again, ``X_sw[i]`` is the ``i``-th window on ``X``, and it contains ``window_size`` samples from the original time series. This time, the samples are not scalars but 1D arrays.\n", + "\n", + "What if we suspect that the way in which the **correlations** between the variables evolve over time can help forecast the target ``y``? This is a common situation in neuroscience, where each variable could be data from a single EEG sensor, for instance.\n", + "\n", + "``giotto-tda`` exposes a ``PearsonDissimilarity`` transformer which creates a 2D dissimilarity matrix from each window in ``X_sw``, and stacks them together into a single 3D object. This is the correct format (and information content!) for a typical topological transformer in ``gtda.homology``. See also [Topological feature extraction from graphs](https://github.com/giotto-ai/giotto-tda/blob/master/examples/persistent_homology_graphs.ipynb) for an in-depth look. Finally, we can extract simple scalar features using a selection of transformers in ``gtda.diagrams``." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gtda.time_series import PearsonDissimilarity\n", + "from gtda.homology import VietorisRipsPersistence\n", + "from gtda.diagrams import Amplitude\n", + "\n", + "PD = PearsonDissimilarity()\n", + "X_pd = PD.fit_transform(X_sw)\n", + "VR = VietorisRipsPersistence(metric=\"precomputed\")\n", + "X_vr = VR.fit_transform(X_pd) # \"precomputed\" required on dissimilarity data\n", + "Ampl = Amplitude()\n", + "X_a = Ampl.fit_transform(X_vr)\n", + "X_a" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that we are not acting on ``y`` above. We are simply creating features from each window using topology! *Note*: it's two features per window because we used the default value for ``homology_dimensions`` in ``VietorisRipsPersistence``, not because we had two variables in the time series initially!\n", + "\n", + "We can now put this all together into a ``giotto-tda`` ``Pipeline`` which combines both the sliding window transformation on ``X`` and resampling of ``y`` with the feature extraction from the windows on ``X``.\n", + "\n", + "*Note*: while we could import the ``Pipeline`` class and use its constructor, we use the convenience function ``make_pipeline`` instead, which is a drop-in replacement for [scikit-learn's](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import set_config\n", + "set_config(display='diagram') # For HTML representations of pipelines\n", + "\n", + "from gtda.pipeline import make_pipeline\n", + "\n", + "pipe = make_pipeline(SW, PD, VR, Ampl)\n", + "pipe" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, if we have a *regression* task on ``y`` we can add a final estimator such as scikit-learn's ``RandomForestRegressor`` as a final step in the previous pipeline, and fit it!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestRegressor\n", + "\n", + "RFR = RandomForestRegressor()\n", + "\n", + "pipe = make_pipeline(SW, PD, VR, Ampl, RFR)\n", + "pipe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipe.fit(X, y)\n", + "y_pred = pipe.predict(X)\n", + "score = pipe.score(X, y)\n", + "y_pred, score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Univariate time series – ``TakensEmbedding`` and ``SingleTakensEmbedding``\n", + "\n", + "The first part of [Topology of time series](https://github.com/giotto-ai/giotto-tda/blob/master/examples/time_series_classification.ipynb) explains a commonly used technique for converting a univariate time series into a single **point cloud**. Since topological features can be extracted from any point cloud, this is a gateway to time series analysis using topology. The second part of that notebook shows how to transform a *batch* of time series into a batch of point clouds, and how to extract topological descriptors from each of them independently. 
While in that notebook this is applied to a time series classification task, in this notebook we are concerned with topology-powered *forecasting* from a single time series.\n", + "\n", + "Reasoning by analogy with the multivariate case above, we can look at sliding windows over ``X`` as small time series in their own right and track the evolution of *their* topology against the variable of interest (or against itself, if we are interested in unsupervised tasks such as anomaly detection).\n", + "\n", + "There are two ways in which we can implement this idea in ``giotto-tda``:\n", + "1. We can first apply a ``SlidingWindow``, and then an instance of ``TakensEmbedding``.\n", + "2. We can *first* compute a global Takens embedding of the time series via ``SingleTakensEmbedding``, which takes us from 1D/column data to 2D data, and *then* partition the 2D data of vectors into sliding windows via ``SlidingWindow``.\n", + "\n", + "The first route ensures that we can run our \"topological feature extraction track\" in parallel with other feature-generation pipelines from sliding windows, without experiencing shape mismatches. The second route seems a little upside-down and it is not generally recommended, but it has the advantage that globally \"optimal\" values for the \"time delay\" and \"embedding dimension\" parameters can be computed automatically by ``SingleTakensEmbedding``. \n", + "\n", + "Below is what each route would look like.\n", + "\n", + "*Remark:* In the presence of noise, a small sliding window size is likely to reduce the reliability of the estimate of the time series' local topology." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Option 1: ``SlidingWindow`` + ``TakensEmbedding``\n", + "\n", + "``TakensEmbedding`` is not a ``TransformerResamplerMixin``, but this is not a problem in the context of a ``Pipeline`` when we order things in this way." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gtda.time_series import TakensEmbedding\n", + "\n", + "X = np.arange(n_timestamps)\n", + "\n", + "window_size = 5\n", + "stride = 2\n", + "\n", + "SW = SlidingWindow(size=window_size, stride=stride)\n", + "X_sw, yr = SW.fit_transform_resample(X, y)\n", + "X_sw, yr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "time_delay = 1\n", + "dimension = 2\n", + "\n", + "TE = TakensEmbedding(time_delay=time_delay, dimension=dimension)\n", + "X_te = TE.fit_transform(X_sw)\n", + "X_te" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "VR = VietorisRipsPersistence() # No \"precomputed\" for point clouds\n", + "Ampl = Amplitude()\n", + "RFR = RandomForestRegressor()\n", + "\n", + "pipe = make_pipeline(SW, TE, VR, Ampl, RFR)\n", + "pipe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipe.fit(X, y)\n", + "y_pred = pipe.predict(X)\n", + "score = pipe.score(X, y)\n", + "y_pred, score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Option 2: ``SingleTakensEmbedding`` + ``SlidingWindow``\n", + "\n", + "Note that ``SingleTakensEmbedding`` is also a ``TransformerResamplerMixin``, and that the logic for resampling/aligning ``y`` is the same as in ``SlidingWindow``."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gtda.time_series import SingleTakensEmbedding\n", + "\n", + "X = np.arange(n_timestamps)\n", + "\n", + "STE = SingleTakensEmbedding(parameters_type=\"search\", time_delay=2, dimension=3)\n", + "X_ste, yr = STE.fit_transform_resample(X, y)\n", + "X_ste, yr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "window_size = 5\n", + "stride = 2\n", + "\n", + "SW = SlidingWindow(size=window_size, stride=stride)\n", + "X_sw, yr = SW.fit_transform_resample(X_ste, yr)\n", + "X_sw, yr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From here on, it is easy to push a very similar pipeline through as in the multivariate case:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "VR = VietorisRipsPersistence() # No \"precomputed\" for point clouds\n", + "Ampl = Amplitude()\n", + "RFR = RandomForestRegressor()\n", + "\n", + "pipe = make_pipeline(STE, SW, VR, Ampl, RFR)\n", + "pipe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipe.fit(X, y)\n", + "y_pred = pipe.predict(X)\n", + "score = pipe.score(X, y)\n", + "y_pred, score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Integrating non-topological features\n", + "\n", + "The best results are obtained when topological methods are used not in isolation but in **combination** with other methods. Here's an example where, in parallel with the topological feature extraction from local sliding windows using **Option 1** above, we also compute the mean and variance in each sliding window. A ``scikit-learn`` ``FeatureUnion`` is used to combine these very different sets of features into a single pipeline object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from functools import partial\n", + "from sklearn.preprocessing import FunctionTransformer\n", + "from sklearn.pipeline import FeatureUnion\n", + "from sklearn.base import clone\n", + "\n", + "mean = FunctionTransformer(partial(np.mean, axis=1, keepdims=True))\n", + "var = FunctionTransformer(partial(np.var, axis=1, keepdims=True))\n", + "\n", + "pipe_topology = make_pipeline(TE, VR, Ampl)\n", + "\n", + "feature_union = FeatureUnion([(\"window_mean\", mean),\n", + " (\"window_variance\", var),\n", + " (\"window_topology\", pipe_topology)])\n", + " \n", + "pipe = make_pipeline(SW, feature_union, RFR)\n", + "pipe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipe.fit(X, y)\n", + "y_pred = pipe.predict(X)\n", + "score = pipe.score(X, y)\n", + "y_pred, score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Endogenous target preparation with ``Labeller``\n", + "\n", + "Let us say that we simply wish to predict the future of a time series from itself. This is very common in the study of financial markets for example. ``giotto-tda`` provides convenience classes for target preparation from a time series. 
This notebook only shows a very simple example: many more options are described in ``Labeller``'s documentation.\n", + "\n", + "If we wished to create a target ``y`` from ``X`` such that ``y[i]`` is equal to ``X[i + 1]``, while also modifying ``X`` and ``y`` so that they still have the same length, we could proceed as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gtda.time_series import Labeller\n", + "\n", + "X = np.arange(10)\n", + "\n", + "Lab = Labeller(size=1, func=np.max)\n", + "Xl, yl = Lab.fit_transform_resample(X, X)\n", + "Xl, yl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that we are feeding two copies of ``X`` to ``fit_transform_resample`` in this case!\n", + "\n", + "This is what fitting an end-to-end pipeline for future prediction using topology could look like. Again, you are encouraged to include your own non-topological features in the mix!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "SW = SlidingWindow(size=5)\n", + "TE = TakensEmbedding(time_delay=1, dimension=2)\n", + "VR = VietorisRipsPersistence()\n", + "Ampl = Amplitude()\n", + "RFR = RandomForestRegressor()\n", + "\n", + "# Full pipeline including the regressor\n", + "pipe = make_pipeline(Lab, SW, TE, VR, Ampl, RFR)\n", + "pipe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipe.fit(X, X)\n", + "y_pred = pipe.predict(X)\n", + "y_pred" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Where to next?\n", + "\n", + "1. There are two additional simple ``TransformerResamplerMixin``s in ``gtda.time_series``: ``Resampler`` and ``Stationarizer``.\n", + "2. The sort of pipeline for topological feature extraction using Takens embedding is a bit crude. More sophisticated methods exist for extracting robust topological summaries from (windows on) time series. A good source of inspiration is the following paper:\n", + "\n", + " > [Persistent Homology of Complex Networks for Dynamic State Detection](https://arxiv.org/abs/1904.07403), by A. Myers, E. Munch, and F. A. Khasawneh.\n", + " \n", + " The module ``gtda.graphs`` contains several transformers implementing the main algorithms proposed there.\n", + "3. Advanced users may be interested in ``ConsecutiveRescaling``, which can be found in ``gtda.point_clouds``.\n", + "4. The notebook [Case study: Lorenz attractor](https://github.com/giotto-ai/giotto-tda/blob/master/examples/lorenz_attractor.ipynb) is an advanced use-case for ``TakensEmbedding`` and other time series forecasting techniques inspired by topology." 
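+    "\n",
+    "As a quick illustration of point 1 above, here is what chaining the two extra ``TransformerResamplerMixin``s might look like. This is only a sketch, under the assumption that ``Resampler`` takes a ``period`` parameter (keeping one sample out of every ``period``) and that ``Stationarizer`` accepts ``operation=\"return\"``; check the API reference for the exact signatures:\n",
+    "\n",
+    "```python\n",
+    "from gtda.time_series import Resampler, Stationarizer\n",
+    "\n",
+    "X = np.arange(1, 21, dtype=float)\n",
+    "\n",
+    "Res = Resampler(period=2)  # downsample the time series\n",
+    "Stat = Stationarizer(operation=\"return\")  # replace values by relative returns\n",
+    "\n",
+    "X_res, y_res = Res.fit_transform_resample(X, X)\n",
+    "X_stat, y_stat = Stat.fit_transform_resample(X_res, y_res)\n",
+    "X_stat, y_stat\n",
+    "```"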
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/vietoris_rips_quickstart.ipynb b/examples/vietoris_rips_quickstart.ipynb index 50359bfa4..0f7431a88 100644 --- a/examples/vietoris_rips_quickstart.ipynb +++ b/examples/vietoris_rips_quickstart.ipynb @@ -4,11 +4,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Topological feature extraction using `VietorisRipsPersistence` and `PersistenceEntropy`\n", + "# Topological feature extraction using ``VietorisRipsPersistence`` and ``PersistenceEntropy``\n", "\n", - "In this notebook, we showcase the ease of use of one of the core components of `giotto-tda`: `VietorisRipsPersistence`, along with vectorisation methods. We first list steps in a typical, topological-feature extraction routine and then show to encapsulate them with a standard `scikit-learn`–like pipeline.\n", + "In this notebook, we showcase the ease of use of one of the core components of ``giotto-tda``: ``VietorisRipsPersistence``, along with vectorization methods. We first list steps in a typical topological-feature extraction routine and then show how to encapsulate them with a standard ``scikit-learn``–like pipeline.\n", "\n", - "If you are looking at a static version of this notebook and would like to run its contents, head over to [github](https://github.com/giotto-ai/giotto-tda/blob/master/examples/vietoris_rips_quickstart.ipynb).\n", + "If you are looking at a static version of this notebook and would like to run its contents, head over to [GitHub](https://github.com/giotto-ai/giotto-tda/blob/master/examples/vietoris_rips_quickstart.ipynb) and download the source.\n", "\n", "**License: AGPLv3**" ] @@ -17,7 +17,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Import libraries" + "## Generate data\n", + "\n", + "Let's begin by generating 3D point clouds of spheres and tori, along with a label of 0 (1) for each sphere (torus). We also add noise to each point cloud, whose effect is to displace the points sampling the surfaces by a random amount in a random direction. **Note**: You will need the auxiliary module [generate_datasets.py](https://github.com/giotto-ai/giotto-tda/blob/master/examples/data/generate_datasets.py) to run this cell. You can change the second argument of ``generate_point_clouds`` to obtain a finer or coarser sampling, or tune the level of noise via the third."
] }, { @@ -26,19 +28,21 @@ "metadata": {}, "outputs": [], "source": [ - "from gtda.diagrams import PersistenceEntropy\n", - "from gtda.homology import VietorisRipsPersistence\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "from sklearn.model_selection import train_test_split" + "from data.generate_datasets import generate_point_clouds\n", + "n_samples_per_class = 10\n", + "point_clouds, labels = generate_point_clouds(n_samples_per_class, 10, 0.1)\n", + "point_clouds.shape\n", + "print(f\"There are {point_clouds.shape[0]} point clouds in {point_clouds.shape[2]} dimensions, \"\n", + " f\"each with {point_clouds.shape[1]} points.\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Generate data\n", + "## Calculate persistent homology\n", "\n", - "Let's begin by generating 3D point clouds of spheres and tori, along with a label of 0 (1) for each sphere (torus). We also add noise to each point cloud, whose effect is to displace the points sampling the surfaces by a random amount in a random direction. **Note**: You will need the auxiliary module [datasets.py](https://github.com/giotto-ai/giotto-tda/blob/master/examples/datasets.py) to run this cell." + "Instantiate a ``VietorisRipsPersistence`` transformer and calculate so-called **persistence diagrams** for this collection of point clouds." ] }, { @@ -47,17 +51,23 @@ "metadata": {}, "outputs": [], "source": [ - "from datasets import generate_point_clouds\n", - "point_clouds, labels = generate_point_clouds(100, 10, 0.1)" + "from gtda.homology import VietorisRipsPersistence\n", + "\n", + "VR = VietorisRipsPersistence(homology_dimensions=[0, 1, 2]) # Parameter explained in the text\n", + "diagrams = VR.fit_transform(point_clouds)\n", + "diagrams.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Calculate persistent homology\n", + "**Important note**: ``VietorisRipsPersistence``, and all other \"persistent homology\" transformers in ``gtda.homology``, expect input in the form of a 3D array or, in some cases, a list of 2D arrays. For each entry in the input (here, for each point cloud in ``point_clouds``) they compute a topological summary which is also a 2D array, and then stack all these summaries into a single output 3D array. So, in our case, ``diagrams[i]`` represents the topology of ``point_clouds[i]``. ``diagrams[i]`` is interpreted as follows:\n", + "- Each row is a triplet describing a single topological feature found in ``point_clouds[i]``.\n", + "- The first and second entries (respectively) in the triplet denote the values of the \"filtration parameter\" at which the feature appears or disappears respectively. They are referred to as the \"birth\" and \"death\" values of the feature (respectively). The meaning of \"filtration parameter\" depends on the specific transformer, but in the case of ``VietorisRipsPersistence`` on point clouds it has the interpretation of a length scale.\n", + "- A topological feature can be a connected component, 1D hole/loop, 2D cavity, or more generally $d$-dimensional \"void\" which exists in the data at scales between its birth and death values. The integer $d$ is the *homology dimension* (or degree) of the feature and is stored as the third entry in the triplet. 
In this example, the shapes should have 2D cavities so we explicitly tell ``VietorisRipsPersistence`` to look for these by using the ``homology_dimensions`` parameter!\n", "\n", - "Instantiate a `VietorisRipsPersistence` transformer and calculate persistence diagrams for this collection of point clouds." + "If we make one scatter plot per available homology dimension, and plot births and deaths as x- and y-coordinates of points in 2D, we end up with a 2D representation of ``diagrams[i]``, and the reason why it is called a persistence *diagram*:" ] }, { @@ -66,8 +76,17 @@ "metadata": {}, "outputs": [], "source": [ - "vietorisrips_tr = VietorisRipsPersistence()\n", - "diagrams = vietorisrips_tr.fit_transform(point_clouds)" + "from gtda.plotting import plot_diagram\n", + "\n", + "i = 0\n", + "plot_diagram(diagrams[i])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The notebook [Plotting in giotto-tda](https://giotto-ai.github.io/gtda-docs/latest/notebooks/plotting_api.html) delves deeper into the plotting functions and class methods available in ``giotto-tda``." ] }, { @@ -76,7 +95,7 @@ "source": [ "## Extract features\n", "\n", - "Instantiate a `PersistenceEntropy` transformer and extract features from the persistence diagrams." + "Instantiate a ``PersistenceEntropy`` transformer and extract scalar features from the persistence diagrams." ] }, { @@ -85,8 +104,10 @@ "metadata": {}, "outputs": [], "source": [ - "entropy_tr = PersistenceEntropy()\n", - "features = entropy_tr.fit_transform(diagrams)" + "from gtda.diagrams import PersistenceEntropy\n", + "\n", + "PE = PersistenceEntropy()\n", + "features = PE.fit_transform(diagrams)" ] }, { @@ -95,7 +116,7 @@ "source": [ "## Use the new features in a standard classifier\n", "\n", - "Leverage the compatibility with `scikit-learn` to perform a train-test split and score the features." + "Leverage the compatibility with ``scikit-learn`` to perform a train-test split and score the features." ] }, { @@ -104,6 +125,9 @@ "metadata": {}, "outputs": [], "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "\n", "X_train, X_valid, y_train, y_valid = train_test_split(features, labels)\n", "model = RandomForestClassifier()\n", "model.fit(X_train, y_train)\n", @@ -114,9 +138,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Encapsulates the steps above in a pipeline\n", + "## Encapsulate the steps above in a pipeline\n", "\n", - "Subdivide into train-validation first, and use the pipeline." + "1. Define an end-to-end pipeline by chaining transformers from ``giotto-tda`` with ``scikit-learn`` ones.\n", + "2. Train-test split the input point cloud data and labels.\n", + "3. Fit the pipeline on the training data.\n", + "4. Score the fitted pipeline on the test data." ] }, { @@ -125,71 +152,19 @@ "metadata": {}, "outputs": [], "source": [ - "from gtda.pipeline import make_pipeline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define the pipeline\n", + "from sklearn.pipeline import make_pipeline\n", "\n", - "Chain transformers from `giotto-tda` with `scikit-learn` ones."
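A minimal sketch, assuming ``diagrams`` is the 3D array produced by ``VietorisRipsPersistence.fit_transform`` in the cell above, of how the birth-death-dimension triples described in the note can be split by homology dimension:

import numpy as np

i = 0  # index of the point cloud whose diagram we inspect
for dim in [0, 1, 2]:
    # Rows of diagrams[i] whose third entry equals the homology dimension
    mask = diagrams[i][:, 2] == dim
    births, deaths = diagrams[i][mask, 0], diagrams[i][mask, 1]
    print(f"H{dim}: {mask.sum()} features, "
          f"longest persistence {np.max(deaths - births):.3f}")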
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "steps = [VietorisRipsPersistence(),\n", + "steps = [VietorisRipsPersistence(homology_dimensions=[0, 1, 2]),\n", " PersistenceEntropy(),\n", " RandomForestClassifier()]\n", - "pipeline = make_pipeline(*steps)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prepare the data\n", - "Train-test split on the point-cloud data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pcs_train, pcs_valid, labels_train, labels_valid = train_test_split(\n", - " point_clouds, labels)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Train and score" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "pipeline = make_pipeline(*steps)\n", + "\n", + "pcs_train, pcs_valid, labels_train, labels_valid = train_test_split(point_clouds, labels)\n", + "\n", "pipeline.fit(pcs_train, labels_train)\n", + "\n", "pipeline.score(pcs_valid, labels_valid)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -208,7 +183,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/examples/voids_on_the_plane.ipynb b/examples/voids_on_the_plane.ipynb index f67fbed3b..d8640d80a 100644 --- a/examples/voids_on_the_plane.ipynb +++ b/examples/voids_on_the_plane.ipynb @@ -10,7 +10,7 @@ "Challenge question: **Can two-dimensional topological voids arise from point clouds in two-dimensional space?**\n", "We will answer this question programmatically by computing Vietoris–Rips persistent homology of random point clouds in the square $[0, 1] \\times [0, 1] \\subset \\mathbb{R}^2$.\n", "\n", - "If you are looking at a static version of this notebook and would like to run its contents, head over to [github](https://github.com/giotto-ai/giotto-tda/blob/master/examples/voids_on_the_plane.ipynb).\n", + "If you are looking at a static version of this notebook and would like to run its contents, head over to [GitHub](https://github.com/giotto-ai/giotto-tda/blob/master/examples/voids_on_the_plane.ipynb) and download the source.\n", "\n", "**License: AGPLv3**" ] diff --git a/gtda/__init__.py b/gtda/__init__.py index c738ddf0c..d79bbd4ee 100644 --- a/gtda/__init__.py +++ b/gtda/__init__.py @@ -1,4 +1,17 @@ from ._version import __version__ -__all__ = ['mapper', 'homology', 'time_series', 'graphs', 'diagrams', 'images', - 'utils', 'point_clouds', 'externals', 'plotting', '__version__'] +__all__ = [ + 'mapper', + 'time_series', + 'graphs', + 'images', + 'point_clouds', + 'homology', + 'diagrams', + 'curves', + 'plotting', + 'externals', + 'utils', + 'metaestimators', + '__version__' + ] diff --git a/gtda/_version.py b/gtda/_version.py index 589f7abfa..b6700d318 100644 --- a/gtda/_version.py +++ b/gtda/_version.py @@ -19,4 +19,4 @@ # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = '0.2.2' +__version__ = "0.3.0" diff --git a/gtda/base.py b/gtda/base.py index 25a981e63..78f56bac8 100644 --- a/gtda/base.py +++ b/gtda/base.py @@ -20,8 +20,8 @@ def fit_transform(self, X, y=None, **fit_params): Input data. y : None - There is no need for a target in a transformer, yet the pipeline - API requires this parameter. 
+ There is no need for a target, yet the pipeline API requires this + parameter. Returns ------- @@ -130,7 +130,7 @@ def transform_plot(self, X, sample=0, **plot_params): sample : int Sample to be plotted. - plot_params : dict + **plot_params Optional plotting parameters. Returns @@ -140,6 +140,5 @@ def transform_plot(self, X, sample=0, **plot_params): """ Xt = self.transform(X[sample:sample+1]) - self.plot(Xt, sample=0, **plot_params) - + self.plot({sample: Xt[0]}, sample=sample, **plot_params).show() return Xt diff --git a/gtda/curves/__init__.py b/gtda/curves/__init__.py new file mode 100644 index 000000000..0b11e0a1c --- /dev/null +++ b/gtda/curves/__init__.py @@ -0,0 +1,10 @@ +"""The module :mod:`gtda.curves` implements transformers to postprocess +curves.""" + +from .preprocessing import Derivative +from .features import StandardFeatures + +__all__ = [ + "Derivative", + "StandardFeatures" + ] diff --git a/gtda/curves/_functions.py b/gtda/curves/_functions.py new file mode 100644 index 000000000..c29996bb6 --- /dev/null +++ b/gtda/curves/_functions.py @@ -0,0 +1,52 @@ +# License: GNU AGPLv3 + +import warnings +from itertools import product + +import numpy as np +from joblib import Parallel, delayed + +_AVAILABLE_FUNCTIONS = { + "identity": {}, + "argmax": {}, + "argmin": {}, + "min": {}, + "max": {}, + "mean": {}, + "std": {}, + "median": {}, + "average": {"weights": {"type": np.ndarray}} + } + +_implemented_function_recipes = { + "identity": lambda X, axis: X.reshape(len(X), -1), + "argmax": np.argmax, + "argmin": np.argmin, + "min": np.min, + "max": np.max, + "mean": np.mean, + "std": np.std, + "median": np.median, + "average": np.average + } + + +def _parallel_featurization(Xt, function, function_params, n_jobs): + if callable(function): + return function(Xt, axis=-1, **function_params) + else: # Assume function is a list or tuple of functions or None + channel_idx = [j for j, f in enumerate(function) if f is not None] + n_samples = len(Xt) + index_pairs = product(range(n_samples), channel_idx) + Xt = Parallel(n_jobs=n_jobs)( + delayed(function[j])(Xt[i, j], **function_params[j]) + for i, j in index_pairs + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", + category=np.VisibleDeprecationWarning) + Xt = np.array(Xt) + if Xt.dtype == np.dtype('object'): + Xt = np.concatenate(list(map(np.ravel, Xt))) + + return Xt.reshape(n_samples, -1) diff --git a/gtda/curves/features.py b/gtda/curves/features.py new file mode 100644 index 000000000..403ef33fc --- /dev/null +++ b/gtda/curves/features.py @@ -0,0 +1,243 @@ +"""Feature extraction from curves.""" +# License: GNU AGPLv3 + +from copy import deepcopy +from types import FunctionType + +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_is_fitted, check_array + +from ._functions import _AVAILABLE_FUNCTIONS, _implemented_function_recipes, \ + _parallel_featurization +from ..utils._docs import adapt_fit_transform_docs +from ..utils.validation import validate_params + + +@adapt_fit_transform_docs +class StandardFeatures(BaseEstimator, TransformerMixin): + """Standard features from multi-channel curves. + + A multi-channel (integer sampled) curve is a 2D array of shape + ``(n_channels, n_bins)``, where each row represents the y-values in one of + the channels. This transformer applies scalar or vector-valued functions + channel-wise to extract features from each multi-channel curve in a + collection. 
The output is always a 2D array such that row ``i`` is the + concatenation of the outputs of the chosen functions on the channels in the + ``i``-th (multi-)curve in the collection. + + Parameters + ---------- + function : string, callable, list or tuple, optional, default: ``"max"`` + Function or list/tuple of functions to apply to each channel of each + multi-channel curve. Functions can map to scalars or to 1D arrays. If a + string (see below) or a callable, then the same function is applied to + all channels. Otherwise, `function` is a list/tuple of the same length + as the number of entries along axis 1 in the collection passed to + :meth:`fit`. Lists/tuples may contain allowed strings (see below), + callables, and ``None`` in some positions to indicate that no feature + should be extracted from the corresponding channel. Available strings + are ``"identity"``, ``"argmin"``, ``"argmax"``, ``"min"``, ``"max"``, + ``"mean"``, ``"std"``, ``"median"`` and ``"average"``. + + function_params : dict, None, list or tuple, optional, default: ``None`` + Additional keyword arguments for the function or functions in + `function`. Passing ``None`` is equivalent to passing no arguments. + Otherwise, if `function` is a single string or callable then + `function_params` must be a dictionary. For functions encoded by + allowed strings, the dictionary keys are as follows: + + - If ``function == "average"``, the only key is ``"weights"`` + (np.ndarray or None, default: ``None``). + - Otherwise, there are no allowed keys. + + If `function` is a list or tuple, `function_params` must be a list or + tuple of dictionaries (or ``None``) as above, of the same length as + `function`. + + n_jobs : int or None, optional, default: ``None`` + The number of jobs to use for the computation. ``None`` means 1 unless + in a :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. Ignored if `function` is one of the allowed string options. + + Attributes + ---------- + n_channels_ : int + Number of channels present in the 3D array passed to :meth:`fit`. Must + match the number of channels in the 3D array passed to + :meth:`transform`. + + effective_function_ : callable or tuple + Callable, or tuple of callables or ``None``, describing the function(s) + used to compute features in each available channel. It is a single + callable only when `function` was passed as a string. + + effective_function_params_ : dict or tuple + Dictionary or tuple of dictionaries containing all information present + in `function_params` as well as relevant quantities computed in + :meth:`fit`. It is a single dict only when `function` was passed as a + string. ``None``s are converted to empty dictionaries. 
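A minimal usage sketch for ``StandardFeatures`` on toy data (the toy array is illustrative only; all class and parameter names are those defined in this module):

import numpy as np
from gtda.curves import StandardFeatures

# Two samples, each a 2-channel curve sampled at 6 bins
X = np.arange(24, dtype=float).reshape(2, 2, 6)

# A single string: the same function is applied to every channel
print(StandardFeatures(function="max").fit_transform(X))
# [[ 5. 11.]
#  [17. 23.]]

# A list: one function (or None) per channel
print(StandardFeatures(function=[np.mean, None]).fit_transform(X))
# [[ 2.5]
#  [14.5]]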
+ + """ + _hyperparameters = { + "function": {"type": (str, FunctionType, list, tuple), + "in": tuple(_AVAILABLE_FUNCTIONS.keys()), + "of": {"type": (str, FunctionType, type(None)), + "in": tuple(_AVAILABLE_FUNCTIONS.keys())}}, + "function_params": {"type": (dict, type(None), list, tuple)}, + } + + def __init__(self, function="max", function_params=None, n_jobs=None): + self.function = function + self.function_params = function_params + self.n_jobs = n_jobs + + def _validate_params(self): + params = self.get_params().copy() + _hyperparameters = deepcopy(self._hyperparameters) + if not isinstance(self.function, str): + _hyperparameters["function"].pop("in") + try: + validate_params(params, _hyperparameters, exclude=["n_jobs"]) + # Another go if we fail because function is a list/tuple containing + # instances of FunctionType and the "in" key checks fail + except ValueError as ve: + end_string = f"which is not in " \ + f"{tuple(_AVAILABLE_FUNCTIONS.keys())}." + function = params["function"] + if ve.args[0].endswith(end_string) \ + and isinstance(function, (list, tuple)): + params["function"] = [f for f in function + if isinstance(f, str)] + validate_params(params, _hyperparameters, exclude=["n_jobs"]) + else: + raise ve + + if isinstance(self.function, (list, tuple)) \ + and isinstance(self.function_params, dict): + raise TypeError("If `function` is a list/tuple then " + "`function_params` must be a list/tuple of dict, " + "or None.") + elif isinstance(self.function, (str, FunctionType)) \ + and isinstance(self.function_params, (list, tuple)): + raise TypeError("If `function` is a string or a callable " + "function then `function_params` must be a dict " + "or None.") + + def fit(self, X, y=None): + """Compute :attr:`n_channels_` and :attr:`effective_function_params_`. + Then, return the estimator. + + This function is here to implement the usual scikit-learn API and hence + work in pipelines. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_channels, n_bins) + Input data. Collection of multi-channel curves. + + y : None + There is no need for a target in a transformer, yet the pipeline + API requires this parameter. + + Returns + ------- + self : object + + """ + check_array(X, ensure_2d=False, allow_nd=True) + if X.ndim != 3: + raise ValueError("Input must be 3-dimensional.") + self._validate_params() + + self.n_channels_ = X.shape[1] + + if isinstance(self.function, str): + self.effective_function_ = \ + _implemented_function_recipes[self.function] + + if self.function_params is None: + self.effective_function_params_ = {} + else: + validate_params(self.function_params, + _AVAILABLE_FUNCTIONS[self.function]) + self.effective_function_params_ = self.function_params.copy() + + elif isinstance(self.function, FunctionType): + self.effective_function_ = \ + tuple([self.function] * self.n_channels_) + + if self.function_params is None: + self.effective_function_params_ = \ + tuple([{}] * self.n_channels_) + else: + self.effective_function_params_ = \ + tuple([self.function_params.copy()] * self.n_channels_) + else: + n_functions = len(self.function) + if len(self.function) != self.n_channels_: + raise ValueError( + f"`function` has length {n_functions} while curves in `X` " + f"have {self.n_channels_} channels." 
+ ) + + if self.function_params is None: + self._effective_function_params = [{}] * self.n_channels_ + else: + self._effective_function_params = self.function_params + n_function_params = len(self._effective_function_params) + if n_function_params != self.n_channels_: + raise ValueError(f"`function_params` has length " + f"{n_function_params} while curves in " + f"`X` have {self.n_channels_} channels.") + + self.effective_function_ = [] + self.effective_function_params_ = [] + for f, p in zip(self.function, self._effective_function_params): + if isinstance(f, str): + validate_params(p, _AVAILABLE_FUNCTIONS[f]) + self.effective_function_.\ + append(_implemented_function_recipes[f]) + else: + self.effective_function_.append(f) + self.effective_function_params_.append({} if p is None + else p.copy()) + self.effective_function_ = tuple(self.effective_function_) + self.effective_function_params_ = \ + tuple(self.effective_function_params_) + + return self + + def transform(self, X, y=None): + """Compute features of multi-channel curves. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_channels, n_bins) + Input collection of multi-channel curves. + + y : None + There is no need for a target in a transformer, yet the pipeline + API requires this parameter. + + Returns + ------- + Xt : ndarray of shape (n_samples, n_features) + Output collection of features of multi-channel curves. + ``n_features`` is the sum of the number of features output by the + (non-``None``) functions on their respective channels. + + """ + check_is_fitted(self) + Xt = check_array(X, ensure_2d=False, allow_nd=True) + if Xt.ndim != 3: + raise ValueError("Input must be 3-dimensional.") + if Xt.shape[1] != self.n_channels_: + raise ValueError(f"Number of channels must be the same as in " + f"`fit`. Passed {Xt.shape[1]}, expected " + f"{self.n_channels_}.") + + Xt = _parallel_featurization(Xt, self.effective_function_, + self.effective_function_params_, + self.n_jobs) + + return Xt diff --git a/gtda/curves/preprocessing.py b/gtda/curves/preprocessing.py new file mode 100644 index 000000000..b22332a3e --- /dev/null +++ b/gtda/curves/preprocessing.py @@ -0,0 +1,199 @@ +"""Preprocessing transformers for curves.""" +# License: GNU AGPLv3 + +import numpy as np +from joblib import Parallel, delayed, effective_n_jobs +from plotly.graph_objs import Figure, Scatter +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils import gen_even_slices +from sklearn.utils.validation import check_is_fitted, check_array + +from ..base import PlotterMixin +from ..utils._docs import adapt_fit_transform_docs +from ..utils.intervals import Interval +from ..utils.validation import validate_params + + +@adapt_fit_transform_docs +class Derivative(BaseEstimator, TransformerMixin, PlotterMixin): + """Derivatives of multi-channel curves. + + A multi-channel (integer sampled) curve is a 2D array of shape + ``(n_channels, n_bins)``, where each row represents the y-values in one of + the channels. This transformer computes the n-th order derivative of each + channel in each multi-channel curve in a collection, by discrete + differences. The output is another collection of multi-channel curves. + + Parameters + ---------- + order : int, optional, default: ``1`` + Order of the derivative to be taken. + + n_jobs : int or None, optional, default: ``None`` + The number of jobs to use for the computation. ``None`` means 1 unless + in a :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. 
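A minimal usage sketch for ``Derivative`` on toy data (the array below is illustrative only): discrete differences of order ``order`` are taken along the last axis, so the number of bins shrinks by ``order``.

import numpy as np
from gtda.curves import Derivative

# One sample with 2 channels and 5 bins
X = np.array([[[0., 1., 4., 9., 16.],
               [5., 4., 3., 2., 1.]]])

print(Derivative(order=1).fit_transform(X))
# [[[ 1.  3.  5.  7.]
#   [-1. -1. -1. -1.]]]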
+ + Attributes + ---------- + n_channels_ : int + Number of channels present in the 3D array passed to :meth:`fit`. + + """ + _hyperparameters = { + 'order': {'type': int, 'in': Interval(1, np.inf, closed='left')}, + } + + def __init__(self, order=1, n_jobs=None): + self.order = order + self.n_jobs = n_jobs + + def fit(self, X, y=None): + """Compute :attr:`n_channels_`. Then, return the estimator. + + This function is here to implement the usual scikit-learn API and hence + work in pipelines. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_channels, n_bins) + Input data. Collection of multi-channel curves. + + y : None + There is no need for a target in a transformer, yet the pipeline + API requires this parameter. + + Returns + ------- + self : object + + """ + check_array(X, ensure_2d=False, allow_nd=True) + if X.ndim != 3: + raise ValueError("Input must be 3-dimensional.") + validate_params( + self.get_params(), self._hyperparameters, exclude=['n_jobs']) + + n_bins = X.shape[2] + if self.order >= n_bins: + raise ValueError( + f"Input channels have length {n_bins} but they must have at " + f"least length {self.order + 1} to calculate derivatives of " + f"order {self.order}." + ) + + self.n_channels_ = X.shape[1] + + return self + + def transform(self, X, y=None): + """Compute derivatives of multi-channel curves. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_channels, n_bins) + Input collection of multi-channel curves. + + y : None + There is no need for a target in a transformer, yet the pipeline + API requires this parameter. + + Returns + ------- + Xt : ndarray of shape (n_samples, n_channels, n_bins - order) + Output collection of multi-channel curves given by taking discrete + differences of order `order` in each channel in the curves in `X`. + + """ + check_is_fitted(self) + Xt = check_array(X, ensure_2d=False, allow_nd=True) + if Xt.ndim != 3: + raise ValueError("Input must be 3-dimensional.") + + Xt = Parallel(n_jobs=self.n_jobs)( + delayed(np.diff)(Xt[s], n=self.order, axis=-1) + for s in gen_even_slices(len(Xt), effective_n_jobs(self.n_jobs)) + ) + Xt = np.concatenate(Xt) + + return Xt + + def plot(self, Xt, sample=0, channels=None, plotly_params=None): + """Plot a sample from a collection of derivatives of multi-channel + curves arranged as in the output of :meth:`transform`. + + Parameters + ---------- + Xt : ndarray of shape (n_samples, n_channels, n_bins) + Collection of multi-channel curves, such as returned by + :meth:`transform`. + + sample : int, optional, default: ``0`` + Index of the sample in `Xt` to be plotted. + + channels : list, tuple or None, optional, default: ``None`` + Which channels to include in the plot. ``None`` means plotting the + first :attr:`n_channels_` channels. + + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"traces"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. 
+ + """ + check_is_fitted(self) + + layout_axes_common = { + "type": "linear", + "ticks": "outside", + "showline": True, + "zeroline": True, + "linewidth": 1, + "linecolor": "black", + "mirror": False, + "showexponent": "all", + "exponentformat": "e" + } + layout = { + "xaxis1": { + "title": "Sample", + "side": "bottom", + "anchor": "y1", + **layout_axes_common + }, + "yaxis1": { + "title": "Derivative", + "side": "left", + "anchor": "x1", + **layout_axes_common + }, + "plot_bgcolor": "white", + "title": f"Derivative of sample {sample}" + } + + fig = Figure(layout=layout) + + if channels is None: + channels = range(self.n_channels_) + + samplings = np.arange(Xt[sample].shape[0]) + for ix, channel in enumerate(channels): + fig.add_trace(Scatter(x=samplings, + y=Xt[sample][ix], + mode="lines", + showlegend=True, + name=f"Channel {channel}")) + + # Update traces and layout according to user input + if plotly_params: + fig.update_traces(plotly_params.get("traces", None)) + fig.update_layout(plotly_params.get("layout", None)) + + return fig diff --git a/gtda/curves/tests/__init__.py b/gtda/curves/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/gtda/curves/tests/test_features.py b/gtda/curves/tests/test_features.py new file mode 100644 index 000000000..f8ef8d99a --- /dev/null +++ b/gtda/curves/tests/test_features.py @@ -0,0 +1,236 @@ +"""Testing for feature extraction from curves.""" + +import numpy as np +import pytest +from numpy.testing import assert_almost_equal +from sklearn.exceptions import NotFittedError + +from gtda.curves import StandardFeatures + +# Generated on 30/09/2020 by +# np.random.seed(0); X = np.random.rand(3, 2, 20) +X = np.array([[[0.5488135, 0.71518937, 0.60276338, 0.54488318, 0.4236548, + 0.64589411, 0.43758721, 0.891773, 0.96366276, 0.38344152, + 0.79172504, 0.52889492, 0.56804456, 0.92559664, 0.07103606, + 0.0871293, 0.0202184, 0.83261985, 0.77815675, 0.87001215], + [0.97861834, 0.79915856, 0.46147936, 0.78052918, 0.11827443, + 0.63992102, 0.14335329, 0.94466892, 0.52184832, 0.41466194, + 0.26455561, 0.77423369, 0.45615033, 0.56843395, 0.0187898, + 0.6176355, 0.61209572, 0.616934, 0.94374808, 0.6818203]], + [[0.3595079, 0.43703195, 0.6976312, 0.06022547, 0.66676672, + 0.67063787, 0.21038256, 0.1289263, 0.31542835, 0.36371077, + 0.57019677, 0.43860151, 0.98837384, 0.10204481, 0.20887676, + 0.16130952, 0.65310833, 0.2532916, 0.46631077, 0.24442559], + [0.15896958, 0.11037514, 0.65632959, 0.13818295, 0.19658236, + 0.36872517, 0.82099323, 0.09710128, 0.83794491, 0.09609841, + 0.97645947, 0.4686512, 0.97676109, 0.60484552, 0.73926358, + 0.03918779, 0.28280696, 0.12019656, 0.2961402, 0.11872772]], + [[0.31798318, 0.41426299, 0.0641475, 0.69247212, 0.56660145, + 0.26538949, 0.52324805, 0.09394051, 0.5759465, 0.9292962, + 0.31856895, 0.66741038, 0.13179786, 0.7163272, 0.28940609, + 0.18319136, 0.58651293, 0.02010755, 0.82894003, 0.00469548], + [0.67781654, 0.27000797, 0.73519402, 0.96218855, 0.24875314, + 0.57615733, 0.59204193, 0.57225191, 0.22308163, 0.95274901, + 0.44712538, 0.84640867, 0.69947928, 0.29743695, 0.81379782, + 0.39650574, 0.8811032, 0.58127287, 0.88173536, 0.69253159]]]) + + +def scalar_fn(x): + return x[0] + + +def vector_fn(x): + return x + + +def vector_fn_2(x): + return x[:-1] + + +def test_standard_not_fitted(): + sf = StandardFeatures() + with pytest.raises(NotFittedError): + sf.transform(X) + + +@pytest.mark.parametrize("shape", [(2,), (2, 3), (2, 3, 4, 5)]) +def test_standard_invalid_shape(shape): + sf = StandardFeatures() 
+ + with pytest.raises(ValueError, match="Input must be 3-dimensional."): + sf.fit(np.ones(shape)) + + with pytest.raises(ValueError, match="Input must be 3-dimensional."): + sf.fit(X).transform(np.ones(shape)) + + +def test_standard_transform_channels_different_from_fit_channels(): + sf = StandardFeatures() + + with pytest.raises(ValueError, match="Number of channels must be the " + "same as in `fit`"): + sf.fit(X).transform(X[:, :-1, :]) + + +def test_standard_invalid_function_function_params(): + sf = StandardFeatures(function="wrong") + with pytest.raises(ValueError): + sf.fit(X) + + sf.set_params(function=0) + with pytest.raises(TypeError): + sf.fit(X) + + sf.set_params(function="max", function_params={"param": 2}) + with pytest.raises(KeyError): + sf.fit(X) + + sf.set_params(function_params=[]) + with pytest.raises(TypeError, match="If `function` is a string or a " + "callable function"): + sf.fit(X) + + sf.set_params(function=["wrong", "max"]) + with pytest.raises(ValueError, match="which is not in"): + sf.fit(X) + + sf.set_params(function=["max", "min"], function_params={}) + with pytest.raises(TypeError, match="If `function` is a list/tuple"): + sf.fit(X) + + sf.set_params(function_params=[{}]) + with pytest.raises(ValueError, match="`function_params` has length"): + sf.fit(X) + + sf.set_params(function=["max"], function_params=None) + with pytest.raises(ValueError, match="`function` has length"): + sf.fit(X) + + +@pytest.mark.parametrize("function, function_params, effective_function, " + "effective_function_params", + [('max', None, np.max, {}), ('max', {}, np.max, {}), + (np.max, None, (np.max, np.max), ({}, {})), + (np.max, {}, (np.max, np.max), ({}, {})), + ([np.max, np.min], [{}, None], + (np.max, np.min), ({}, {})), + ([np.max, None], [{}, None], + (np.max, None), ({}, {}))]) +def test_standard_fit_attrs(function, function_params, + effective_function, effective_function_params): + sf = StandardFeatures(function=function, function_params=function_params) + sf.fit(X) + + assert sf.n_channels_ == X.shape[1] + + assert sf.effective_function_ == effective_function \ + and sf.effective_function_params_ == effective_function_params + + +@pytest.mark.parametrize("function", ["argmax", "argmin", "min", "max", "mean", + "std", "median", "average", np.max, + scalar_fn, [scalar_fn, "max"], + [scalar_fn, np.max]]) +def test_standard_shape_scalar_function(function): + sf = StandardFeatures(function=function) + Xt = sf.fit_transform(X) + + assert Xt.shape == X.shape[:2] + + +def test_standard_shape_function_list_with_none(): + sf = StandardFeatures(function=[None, np.max]) + Xt = sf.fit_transform(X) + sf.set_params(function="max") + + assert_almost_equal(Xt, sf.fit_transform(X)[:, [1]]) + + +X_res = { + "identity": X.reshape(X.shape[0], -1), + vector_fn: X.reshape(X.shape[0], -1), + (vector_fn, vector_fn): X.reshape(X.shape[0], -1), + "argmax": np.array([[8, 0], + [12, 12], + [9, 3]]), + "argmin": np.array([[16, 14], + [3, 15], + [19, 8]]), + "min": np.array([[0.0202184, 0.0187898], + [0.06022547, 0.03918779], + [0.00469548, 0.22308163]]), + "max": np.array([[0.96366276, 0.97861834], + [0.98837384, 0.97676109], + [0.9292962, 0.96218855]]), + "mean": np.array([[0.58155482, 0.56784552], + [0.39983943, 0.40521714], + [0.40951229, 0.61738194]]), + "std": np.array([[0.27591522, 0.26865653], + [0.23900448, 0.31701912], + [0.27368227, 0.23340901]]), + "median": np.array([[0.58540397, 0.61451486], + [0.36160934, 0.28947358], + [0.36641597, 0.63492923]]), + "average": np.array([[0.58155482, 
0.56784552], + [0.39983943, 0.40521714], + [0.40951229, 0.61738194]]), + np.max: np.array([[0.96366276, 0.97861834], + [0.98837384, 0.97676109], + [0.9292962, 0.96218855]]), + (np.max, np.max): np.array([[0.96366276, 0.97861834], + [0.98837384, 0.97676109], + [0.9292962, 0.96218855]]) + } + + +@pytest.mark.parametrize("function", X_res.keys()) +@pytest.mark.parametrize("n_jobs", [1, 2]) +def test_standard_transform(function, n_jobs): + sf = StandardFeatures(function=function, n_jobs=n_jobs) + + assert_almost_equal(sf.fit_transform(X), X_res[function]) + + +@pytest.mark.parametrize("n_jobs", [1, 2]) +def test_standard_transform_mixed_vector(n_jobs): + sf = StandardFeatures(function=[vector_fn, vector_fn_2], n_jobs=n_jobs) + Xt = sf.fit_transform(X) + + assert Xt.shape == (len(X), 2 * X.shape[-1] - 1) + assert_almost_equal(Xt[:, :X.shape[2]], X[:, 0, :]) + assert_almost_equal(Xt[:, X.shape[2]:], X[:, 1, :-1]) + + +@pytest.mark.parametrize("n_jobs", [1, 2]) +def test_standard_transform_mixed_vector_scalar(n_jobs): + sf = StandardFeatures(function=[vector_fn, scalar_fn], n_jobs=n_jobs) + Xt = sf.fit_transform(X) + + assert Xt.shape == (len(X), X.shape[-1] + 1) + assert_almost_equal(Xt[:, :X.shape[2]], X[:, 0, :]) + + sf.set_params(function=[None, vector_fn]) + Xt = sf.fit_transform(X) + + assert_almost_equal(Xt, X[:, 1, :]) + + +def test_standard_transform_function_params(): + weights = np.zeros(X.shape[-1]) + weights[0] = 1 + sf = StandardFeatures(function="average", + function_params={"weights": weights}) + Xt = sf.fit_transform(X) + + assert_almost_equal(Xt, X[:, :, 0]) + + sf.set_params(function=np.average) + Xt = sf.fit_transform(X) + + assert_almost_equal(Xt, X[:, :, 0]) + + sf.set_params(function=[np.average, np.average], + function_params=[{"weights": weights}, {"weights": weights}]) + Xt = sf.fit_transform(X) + + assert_almost_equal(Xt, X[:, :, 0]) diff --git a/gtda/curves/tests/test_preprocessing.py b/gtda/curves/tests/test_preprocessing.py new file mode 100644 index 000000000..7c28eaf36 --- /dev/null +++ b/gtda/curves/tests/test_preprocessing.py @@ -0,0 +1,65 @@ +"""Testing for curves preprocessing.""" + +import pytest +import numpy as np +import plotly.io as pio +from numpy.testing import assert_almost_equal +from sklearn.exceptions import NotFittedError +from gtda.curves import Derivative + +pio.renderers.default = 'plotly_mimetype' +line_plots_traces_params = {"mode": "lines+markers"} +layout_params = {"title": "New title"} +plotly_params = \ + {"traces": line_plots_traces_params, "layout": layout_params} + + +np.random.seed(0) +X = np.random.rand(1, 2, 5) + + +def test_derivative_not_fitted(): + d = Derivative() + + with pytest.raises(NotFittedError): + d.transform(X) + + +def test_derivative_big_order(): + d = Derivative(order=5) + + with pytest.raises(ValueError): + d.fit(X) + + +@pytest.mark.parametrize("shape", [(2,), (2, 3), (2, 3, 4, 5)]) +def test_standard_invalid_shape(shape): + sf = Derivative() + + with pytest.raises(ValueError, match="Input must be 3-dimensional."): + sf.fit(np.ones(shape)) + + with pytest.raises(ValueError, match="Input must be 3-dimensional."): + sf.fit(X).transform(np.ones(shape)) + + +X_res = { + 1: np.array([[[0.16637586, -0.11242599, -0.05788019, -0.12122838], + [-0.2083069, 0.45418579, 0.07188976, -0.58022124]]]), + 2: np.array([[[-0.27880185, 0.0545458, -0.06334819], + [0.66249269, -0.38229603, -0.652111]]]), + } + + +@pytest.mark.parametrize('order', [1, 2]) +def test_derivative_transform(order): + d = Derivative(order) + + 
assert_almost_equal(d.fit_transform(X), X_res[order]) + + +@pytest.mark.parametrize("channels", [None, [1], [0, 1]]) +def test_consistent_fit_transform_plot(channels): + d = Derivative() + Xt = d.fit_transform(X) + d.plot(Xt, channels=channels, plotly_params=plotly_params) diff --git a/gtda/diagrams/__init__.py b/gtda/diagrams/__init__.py index 3f716f890..c62e6fa6e 100644 --- a/gtda/diagrams/__init__.py +++ b/gtda/diagrams/__init__.py @@ -4,7 +4,8 @@ from .preprocessing import ForgetDimension, Scaler, Filtering from .distance import PairwiseDistance -from .features import PersistenceEntropy, Amplitude +from .features import PersistenceEntropy, Amplitude, NumberOfPoints, \ + ComplexPolynomial from .representations import BettiCurve, PersistenceLandscape, HeatKernel, \ Silhouette, PersistenceImage @@ -13,11 +14,13 @@ 'Scaler', 'Filtering', 'PairwiseDistance', + 'PersistenceEntropy', 'Amplitude', + 'NumberOfPoints', + 'ComplexPolynomial', 'BettiCurve', 'PersistenceLandscape', 'HeatKernel', - 'PersistenceEntropy', 'Silhouette', 'PersistenceImage' -] + ] diff --git a/gtda/diagrams/_features.py b/gtda/diagrams/_features.py new file mode 100644 index 000000000..a4f94dd80 --- /dev/null +++ b/gtda/diagrams/_features.py @@ -0,0 +1,42 @@ +# License: GNU AGPLv3 + +import numpy as np + + +_AVAILABLE_POLYNOMIALS = {'R': {}, + 'S': {}, + 'T': {}} + + +def R_polynomial(Xd): + roots = Xd[:, 0] + 1j * Xd[:, 1] + + return roots + + +def S_polynomial(Xd): + alpha = np.linalg.norm(Xd, axis=1) + alpha = np.where(alpha == 0, np.ones(Xd.shape[0]), alpha) + roots = np.multiply( + np.multiply( + (Xd[:, 0] + 1j * Xd[:, 1]), (Xd[:, 1] - Xd[:, 0]) + ), + 1. / (np.sqrt(2) * alpha) + ) + + return roots + + +def T_polynomial(Xd): + alpha = np.linalg.norm(Xd, axis=1) + roots = np.multiply( + (Xd[:, 1] - Xd[:, 0]) / 2, np.cos(alpha) - np.sin(alpha) + + 1j * (np.cos(alpha) + np.sin(alpha)) + ) + + return roots + + +_implemented_polynomial_recipes = {'R': R_polynomial, + 'S': S_polynomial, + 'T': T_polynomial} diff --git a/gtda/diagrams/_metrics.py b/gtda/diagrams/_metrics.py index 049108647..6cbdf1d94 100644 --- a/gtda/diagrams/_metrics.py +++ b/gtda/diagrams/_metrics.py @@ -17,38 +17,46 @@ _AVAILABLE_METRICS = { 'bottleneck': { - 'delta': {'type': Real, 'in': Interval(0, 1, closed='both')}}, + 'delta': {'type': Real, 'in': Interval(0, 1, closed='both')} + }, 'wasserstein': { 'p': {'type': Real, 'in': Interval(1, np.inf, closed='left')}, - 'delta': {'type': Real, 'in': Interval(0, 1, closed='right')}}, + 'delta': {'type': Real, 'in': Interval(0, 1, closed='right')} + }, 'betti': { 'p': {'type': Real, 'in': Interval(1, np.inf, closed='both')}, - 'n_bins': {'type': int, 'in': Interval(1, np.inf, closed='left')}}, + 'n_bins': {'type': int, 'in': Interval(1, np.inf, closed='left')} + }, 'landscape': { 'p': {'type': Real, 'in': Interval(1, np.inf, closed='both')}, 'n_bins': {'type': int, 'in': Interval(1, np.inf, closed='left')}, - 'n_layers': {'type': int, 'in': Interval(1, np.inf, closed='left')}}, + 'n_layers': {'type': int, 'in': Interval(1, np.inf, closed='left')} + }, 'heat': { 'p': {'type': Real, 'in': Interval(1, np.inf, closed='both')}, 'n_bins': {'type': int, 'in': Interval(1, np.inf, closed='left')}, - 'sigma': {'type': Real, 'in': Interval(0, np.inf, closed='neither')}}, + 'sigma': {'type': Real, 'in': Interval(0, np.inf, closed='neither')} + }, 'persistence_image': { 'p': {'type': Real, 'in': Interval(1, np.inf, closed='both')}, 'n_bins': {'type': int, 'in': Interval(1, np.inf, closed='left')}, 'sigma': {'type': 
Real, 'in': Interval(0, np.inf, closed='neither')}, - 'weight_function': {'type': FunctionType, 'in': None}}, + 'weight_function': {'type': (FunctionType, type(None))} + }, 'silhouette': { 'power': {'type': Real, 'in': Interval(0, np.inf, closed='right')}, 'p': {'type': Real, 'in': Interval(1, np.inf, closed='both')}, - 'n_bins': {'type': int, 'in': Interval(1, np.inf, closed='left')}}} - -_AVAILABLE_AMPLITUDE_METRICS = dict() -for metric, metric_params in _AVAILABLE_METRICS.items(): - if metric not in ['bottleneck', 'wasserstein']: - _AVAILABLE_AMPLITUDE_METRICS[metric] = metric_params.copy() + 'n_bins': {'type': int, 'in': Interval(1, np.inf, closed='left')} + } + } + +_AVAILABLE_AMPLITUDE_METRICS = {} +for _metric, _metric_params in _AVAILABLE_METRICS.items(): + if _metric not in ['bottleneck', 'wasserstein']: + _AVAILABLE_AMPLITUDE_METRICS[_metric] = _metric_params.copy() else: - _AVAILABLE_AMPLITUDE_METRICS[metric] = \ - {name: descr for name, descr in metric_params.items() + _AVAILABLE_AMPLITUDE_METRICS[_metric] = \ + {name: descr for name, descr in _metric_params.items() if name != 'delta'} @@ -74,55 +82,75 @@ def landscapes(diagrams, sampling, n_layers): return fibers -def _heat(image, sampled_diag, sigma): - _sample_image(image, sampled_diag) # modifies `heat` inplace - image[:] = gaussian_filter(image, sigma, mode="reflect") - - def heats(diagrams, sampling, step_size, sigma): - heats_ = np.zeros((diagrams.shape[0], - sampling.shape[0], sampling.shape[0])) - - diagrams[diagrams < sampling[0, 0]] = sampling[0, 0] - diagrams[diagrams > sampling[-1, 0]] = sampling[-1, 0] - diagrams = np.array((diagrams - sampling[0, 0]) / step_size, dtype=int) - - [_heat(heats_[i], sampled_diag, sigma) - for i, sampled_diag in enumerate(diagrams)] - - heats_ = heats_ - np.transpose(heats_, (0, 2, 1)) + # WARNING: modifies `diagrams` in place + heats_ = \ + np.zeros((len(diagrams), len(sampling), len(sampling)), dtype=float) + # If the step size is zero, we return a trivial image + if step_size == 0: + return heats_ + + # Set the values outside of the sampling range + first_sampling, last_sampling = sampling[0, 0, 0], sampling[-1, 0, 0] + diagrams[diagrams < first_sampling] = first_sampling + diagrams[diagrams > last_sampling] = last_sampling + + # Calculate the value of `sigma` in pixel units + sigma_pixel = sigma / step_size + + for i, diagram in enumerate(diagrams): + nontrivial_points_idx = np.flatnonzero(diagram[:, 1] != diagram[:, 0]) + diagram_nontrivial_pixel_coords = np.array( + (diagram - first_sampling) / step_size, dtype=int + )[nontrivial_points_idx] + image = heats_[i] + _sample_image(image, diagram_nontrivial_pixel_coords) + gaussian_filter(image, sigma_pixel, mode="constant", output=image) + + heats_ -= np.transpose(heats_, (0, 2, 1)) + heats_ /= (step_size ** 2) heats_ = np.rot90(heats_, k=1, axes=(1, 2)) return heats_ -def persistence_images(diagrams, sampling, step_size, weights, sigma): - persistence_images_ = np.zeros( - (diagrams.shape[0], sampling.shape[0], sampling.shape[0])) +def persistence_images(diagrams, sampling, step_size, sigma, weights): + # For persistence images, `sampling` is a tall matrix with two columns + # (the first for birth and the second for persistence), and `step_size` is + # a 2d array + # WARNING: modifies `diagrams` in place + persistence_images_ = \ + np.zeros((len(diagrams), len(sampling), len(sampling)), dtype=float) + # If either step size is zero, we return a trivial image + if (step_size == 0).any(): + return persistence_images_ + # Transform 
diagrams from (birth, death, dim) to (birth, persistence, dim) - diagrams[:, :, 1] = diagrams[:, :, 1] - diagrams[:, :, 0] - - for axis in [0, 1]: - # Set the values outside of the sampling range to the sampling range. - diagrams[:, :, axis][diagrams[:, :, axis] < sampling[0, axis]] = \ - sampling[0, axis] - diagrams[:, :, axis][diagrams[:, :, axis] > sampling[-1, axis]] = \ - sampling[-1, axis] - # Convert into pixel - diagrams[:, :, axis] = np.array( - (diagrams[:, :, axis] - sampling[0, axis]) / step_size[axis], - dtype=int) - # Sample the image - [_sample_image(persistence_images_[i], sampled_diag) - for i, sampled_diag in enumerate(diagrams)] - - # Apply the weights - persistence_images_ *= weights / np.max(weights) - - # Smoothen the weighted-image - for i, image in enumerate(persistence_images_): - persistence_images_[i] = gaussian_filter(image, sigma, mode="reflect") + diagrams[:, :, 1] -= diagrams[:, :, 0] + + sigma_pixel = [] + first_samplings = sampling[0] + last_samplings = sampling[-1] + for ax in [0, 1]: + diagrams_ax = diagrams[:, :, ax] + # Set the values outside of the sampling range + diagrams_ax[diagrams_ax < first_samplings[ax]] = first_samplings[ax] + diagrams_ax[diagrams_ax > last_samplings[ax]] = last_samplings[ax] + # Calculate the value of the component of `sigma` in pixel units + sigma_pixel.append(sigma / step_size[ax]) + + # Sample the image, apply the weights, smoothen + for i, diagram in enumerate(diagrams): + nontrivial_points_idx = np.flatnonzero(diagram[:, 1]) + diagram_nontrivial_pixel_coords = np.array( + (diagram - first_samplings) / step_size, dtype=int + )[nontrivial_points_idx] + image = persistence_images_[i] + _sample_image(image, diagram_nontrivial_pixel_coords) + image *= weights + gaussian_filter(image, sigma_pixel, mode="constant", output=image) persistence_images_ = np.rot90(persistence_images_, k=1, axes=(1, 2)) + persistence_images_ /= np.product(step_size) return persistence_images_ @@ -131,15 +159,17 @@ def silhouettes(diagrams, sampling, power, **kwargs): returned by _bin) of a one-dimensional range. """ sampling = np.transpose(sampling, axes=(1, 2, 0)) - weights = np.diff(diagrams, axis=2)[:, :, [0]] + weights = np.diff(diagrams, axis=2) if power > 8.: - weights = weights/np.max(weights, axis=1, keepdims=True) - weights = weights**power + weights = weights / np.max(weights, axis=1, keepdims=True) + weights = weights ** power total_weights = np.sum(weights, axis=1) + # Next line is a trick to avoid NaNs when computing `fibers_weighted_sum` + total_weights[total_weights == 0.] = np.inf midpoints = (diagrams[:, :, [1]] + diagrams[:, :, [0]]) / 2. heights = (diagrams[:, :, [1]] - diagrams[:, :, [0]]) / 2. 
fibers = np.maximum(-np.abs(sampling - midpoints) + heights, 0) - fibers_weighted_sum = np.sum(weights*fibers, axis=1)/total_weights + fibers_weighted_sum = np.sum(weights * fibers, axis=1) / total_weights return fibers_weighted_sum @@ -157,78 +187,108 @@ def wasserstein_distances(diagrams_1, diagrams_2, p=2, delta=0.01, **kwargs): p, delta,) for diagram_2 in diagrams_2] for diagram_1 in diagrams_1]) -def betti_distances(diagrams_1, diagrams_2, sampling, - step_size, p=2., **kwargs): +def betti_distances( + diagrams_1, diagrams_2, sampling, step_size, p=2., **kwargs + ): + step_size_factor = step_size ** (1 / p) + are_arrays_equal = np.array_equal(diagrams_1, diagrams_2) betti_curves_1 = betti_curves(diagrams_1, sampling) - if np.array_equal(diagrams_1, diagrams_2): - unnorm_dist = squareform(pdist(betti_curves_1, "minkowski", p=p)) - return (step_size ** (1 / p)) * unnorm_dist + if are_arrays_equal: + distances = pdist(betti_curves_1, "minkowski", p=p) + distances *= step_size_factor + return squareform(distances) betti_curves_2 = betti_curves(diagrams_2, sampling) - unnorm_dist = cdist(betti_curves_1, betti_curves_2, "minkowski", p=p) - return (step_size ** (1 / p)) * unnorm_dist + distances = cdist(betti_curves_1, betti_curves_2, "minkowski", p=p) + distances *= step_size_factor + return distances -def landscape_distances(diagrams_1, diagrams_2, sampling, step_size, - p=2., n_layers=1, **kwargs): +def landscape_distances( + diagrams_1, diagrams_2, sampling, step_size, p=2., n_layers=1, + **kwargs + ): + step_size_factor = step_size ** (1 / p) n_samples_1, n_points_1 = diagrams_1.shape[:2] n_layers_1 = min(n_layers, n_points_1) if np.array_equal(diagrams_1, diagrams_2): - ls_1 = landscapes(diagrams_1, sampling, - n_layers_1).reshape(n_samples_1, -1) - unnorm_dist = squareform(pdist(ls_1, "minkowski", p=p)) - return (step_size ** (1 / p)) * unnorm_dist + ls_1 = landscapes(diagrams_1, sampling, n_layers_1).\ + reshape(n_samples_1, -1) + distances = pdist(ls_1, "minkowski", p=p) + distances *= step_size_factor + return squareform(distances) n_samples_2, n_points_2 = diagrams_2.shape[:2] n_layers_2 = min(n_layers, n_points_2) n_layers = max(n_layers_1, n_layers_2) - ls_1 = landscapes(diagrams_1, sampling, - n_layers).reshape(n_samples_1, -1) - ls_2 = landscapes(diagrams_2, sampling, - n_layers).reshape(n_samples_2, -1) - unnorm_dist = cdist(ls_1, ls_2, "minkowski", p=p) - return (step_size ** (1 / p)) * unnorm_dist - - -def heat_distances(diagrams_1, diagrams_2, sampling, step_size, - sigma=1., p=2., **kwargs): - heat_1 = heats(diagrams_1, sampling, step_size, sigma).reshape( - diagrams_1.shape[0], -1) - if np.array_equal(diagrams_1, diagrams_2): - unnorm_dist = squareform(pdist(heat_1, "minkowski", p=p)) - return (step_size ** (1 / p)) * unnorm_dist - heat_2 = heats(diagrams_2, sampling, step_size, sigma).\ - reshape(diagrams_2.shape[0], -1) - unnorm_dist = cdist(heat_1, heat_2, "minkowski", p=p) - return (step_size ** (1 / p)) * unnorm_dist - - -def persistence_image_distances(diagrams_1, diagrams_2, sampling, step_size, - weight_function=lambda x: x, sigma=1., p=2., - **kwargs): - sampling_ = np.copy(sampling.reshape((-1,))) - weights = weight_function(sampling_ - sampling_[0]) - persistence_image_1 = persistence_images(diagrams_1, sampling_, step_size, - weights, sigma).reshape( - diagrams_1.shape[0], -1) - if np.array_equal(diagrams_1, diagrams_2): - unnorm_dist = squareform(pdist(persistence_image_1, "minkowski", p=p)) - return (step_size ** (1 / p)) * unnorm_dist - persistence_image_2 
= persistence_images(diagrams_2, sampling_, step_size, - weights, sigma,).reshape( - diagrams_2.shape[0], -1) - unnorm_dist = cdist(persistence_image_1, persistence_image_2, - "minkowski", p=p) - return (step_size ** (1 / p)) * unnorm_dist - - -def silhouette_distances(diagrams_1, diagrams_2, sampling, step_size, - power=2., p=2., **kwargs): - silhouette_1 = silhouettes(diagrams_1, sampling, power) - if np.array_equal(diagrams_1, diagrams_2): - unnorm_dist = squareform(pdist(silhouette_1, 'minkowski', p=p)) - else: - silhouette_2 = silhouettes(diagrams_2, sampling, power) - unnorm_dist = cdist(silhouette_1, silhouette_2, 'minkowski', p=p) - return (step_size ** (1 / p)) * unnorm_dist + ls_1 = landscapes(diagrams_1, sampling, n_layers).\ + reshape(n_samples_1, -1) + ls_2 = landscapes(diagrams_2, sampling, n_layers).\ + reshape(n_samples_2, -1) + distances = cdist(ls_1, ls_2, "minkowski", p=p) + distances *= step_size_factor + return distances + + +def heat_distances( + diagrams_1, diagrams_2, sampling, step_size, sigma=0.1, p=2., **kwargs + ): + # WARNING: `heats` modifies `diagrams` in place + step_size_factor = step_size ** (2 / p) + are_arrays_equal = np.array_equal(diagrams_1, diagrams_2) + heats_1 = heats(diagrams_1, sampling, step_size, sigma).\ + reshape(len(diagrams_1), -1) + if are_arrays_equal: + distances = pdist(heats_1, "minkowski", p=p) + distances *= step_size_factor + return squareform(distances) + heats_2 = heats(diagrams_2, sampling, step_size, sigma).\ + reshape(len(diagrams_2), -1) + distances = cdist(heats_1, heats_2, "minkowski", p=p) + distances *= step_size_factor + return distances + + +def persistence_image_distances( + diagrams_1, diagrams_2, sampling, step_size, sigma=0.1, + weight_function=np.ones_like, p=2., **kwargs + ): + # For persistence images, `sampling` is a tall matrix with two columns + # (the first for birth and the second for persistence), and `step_size` is + # a 2d array + weights = weight_function(sampling[:, 1]) + step_sizes_factor = np.product(step_size) ** (1 / p) + # WARNING: `persistence_images` modifies `diagrams` in place + are_arrays_equal = np.array_equal(diagrams_1, diagrams_2) + persistence_images_1 = \ + persistence_images(diagrams_1, sampling, step_size, sigma, weights).\ + reshape(len(diagrams_1), -1) + if are_arrays_equal: + distances = pdist(persistence_images_1, "minkowski", p=p) + distances *= step_sizes_factor + return squareform(distances) + persistence_images_2 = persistence_images( + diagrams_2, sampling, step_size, sigma, weights + ).reshape(len(diagrams_2), -1) + distances = cdist( + persistence_images_1, persistence_images_2, "minkowski", p=p + ) + distances *= step_sizes_factor + return distances + + +def silhouette_distances( + diagrams_1, diagrams_2, sampling, step_size, power=1., p=2., **kwargs + ): + step_size_factor = step_size ** (1 / p) + are_arrays_equal = np.array_equal(diagrams_1, diagrams_2) + silhouettes_1 = silhouettes(diagrams_1, sampling, power) + if are_arrays_equal: + distances = pdist(silhouettes_1, 'minkowski', p=p) + distances *= step_size_factor + return squareform(distances) + silhouettes_2 = silhouettes(diagrams_2, sampling, power) + distances = cdist(silhouettes_1, silhouettes_2, 'minkowski', p=p) + distances *= step_size_factor + return distances implemented_metric_recipes = { @@ -238,38 +298,39 @@ def silhouette_distances(diagrams_1, diagrams_2, sampling, step_size, "betti": betti_distances, "heat": heat_distances, "persistence_image": persistence_image_distances, - 'silhouette': 
silhouette_distances, -} - + 'silhouette': silhouette_distances + } -def _matrix_wrapper(distance_func, distance_matrices, slice_, dim, - *args, **kwargs): - distance_matrices[:, slice_, int(dim)] = distance_func(*args, **kwargs) - -def _parallel_pairwise(X1, X2, metric, metric_params, - homology_dimensions, n_jobs): +def _parallel_pairwise( + X1, X2, metric, metric_params, homology_dimensions, n_jobs + ): metric_func = implemented_metric_recipes[metric] effective_metric_params = metric_params.copy() none_dict = {dim: None for dim in homology_dimensions} samplings = effective_metric_params.pop("samplings", none_dict) step_sizes = effective_metric_params.pop("step_sizes", none_dict) - - if X2 is None: - X2 = X1 - - distance_matrices = Parallel(n_jobs=n_jobs)( - delayed(metric_func)(_subdiagrams(X1, [dim], remove_dim=True), - _subdiagrams(X2[s], [dim], remove_dim=True), - sampling=samplings[dim], - step_size=step_sizes[dim], - **effective_metric_params) + if metric in ["heat", "persistence_image"]: + parallel_kwargs = {"mmap_mode": "c"} + else: + parallel_kwargs = {} + + n_columns = len(X2) + distance_matrices = Parallel(n_jobs=n_jobs, **parallel_kwargs)( + delayed(metric_func)( + _subdiagrams(X1, [dim], remove_dim=True), + _subdiagrams(X2[s], [dim], remove_dim=True), + sampling=samplings[dim], + step_size=step_sizes[dim], + **effective_metric_params + ) for dim in homology_dimensions - for s in gen_even_slices(X2.shape[0], effective_n_jobs(n_jobs))) + for s in gen_even_slices(n_columns, effective_n_jobs(n_jobs)) + ) distance_matrices = np.concatenate(distance_matrices, axis=1) distance_matrices = np.stack( - [distance_matrices[:, i * X2.shape[0]:(i + 1) * X2.shape[0]] + [distance_matrices[:, i * n_columns:(i + 1) * n_columns] for i in range(len(homology_dimensions))], axis=2) return distance_matrices @@ -286,34 +347,60 @@ def wasserstein_amplitudes(diagrams, p=2., **kwargs): def betti_amplitudes(diagrams, sampling, step_size, p=2., **kwargs): + step_size_factor = step_size ** (1 / p) bcs = betti_curves(diagrams, sampling) - return (step_size ** (1 / p)) * np.linalg.norm(bcs, axis=1, ord=p) + amplitudes = np.linalg.norm(bcs, axis=1, ord=p) + amplitudes *= step_size_factor + return amplitudes -def landscape_amplitudes(diagrams, sampling, step_size, p=2., n_layers=1, - **kwargs): +def landscape_amplitudes( + diagrams, sampling, step_size, p=2., n_layers=1, **kwargs + ): + step_size_factor = step_size ** (1 / p) ls = landscapes(diagrams, sampling, n_layers).\ reshape(len(diagrams), -1) - return (step_size ** (1 / p)) * np.linalg.norm(ls, axis=1, ord=p) - - -def heat_amplitudes(diagrams, sampling, step_size, sigma=1., p=2., **kwargs): - heat = heats(diagrams, sampling, step_size, sigma) - return np.linalg.norm(heat, axis=(1, 2), ord=p) + amplitudes = np.linalg.norm(ls, axis=1, ord=p) + amplitudes *= step_size_factor + return amplitudes -def persistence_image_amplitudes(diagrams, sampling, step_size, - weight_function=lambda x: x, sigma=1., p=2., - **kwargs): - persistence_image = persistence_images(diagrams, sampling, step_size, - weight_function, sigma) - return np.linalg.norm(persistence_image, axis=(1, 2), ord=p) - - -def silhouette_amplitudes(diagrams, sampling, step_size, power=2., p=2., - **kwargs): - sht = silhouettes(diagrams, sampling, power) - return (step_size ** (1 / p)) * np.linalg.norm(sht, axis=1, ord=p) +def heat_amplitudes(diagrams, sampling, step_size, sigma=0.1, p=2., **kwargs): + # WARNING: `heats` modifies `diagrams` in place + step_size_factor = step_size ** (2 / p) + 
heats_ = heats(diagrams, sampling, step_size, sigma).\ + reshape(len(diagrams), -1) + amplitudes = np.linalg.norm(heats_, axis=1, ord=p) + amplitudes *= step_size_factor + return amplitudes + + +def persistence_image_amplitudes( + diagrams, sampling, step_size, sigma=0.1, weight_function=np.ones_like, + p=2., **kwargs + ): + # For persistence images, `sampling` is a tall matrix with two columns + # (the first for birth and the second for persistence), and `step_size` is + # a 2d array + weights = weight_function(sampling[:, 1]) + step_sizes_factor = np.product(step_size) ** (1 / p) + # WARNING: `persistence_images` modifies `diagrams` in place + persistence_images_ = persistence_images( + diagrams, sampling, step_size, sigma, weights + ).reshape(len(diagrams), -1) + amplitudes = np.linalg.norm(persistence_images_, axis=1, ord=p) + amplitudes *= step_sizes_factor + return amplitudes + + +def silhouette_amplitudes( + diagrams, sampling, step_size, power=1., p=2., **kwargs + ): + step_size_factor = step_size ** (1 / p) + silhouettes_ = silhouettes(diagrams, sampling, power) + amplitudes = np.linalg.norm(silhouettes_, axis=1, ord=p) + amplitudes *= step_size_factor + return amplitudes implemented_amplitude_recipes = { @@ -322,14 +409,9 @@ def silhouette_amplitudes(diagrams, sampling, step_size, power=2., p=2., "landscape": landscape_amplitudes, "betti": betti_amplitudes, "heat": heat_amplitudes, - "persistence_image": persistence_images, - 'silhouette': silhouette_amplitudes, -} - - -def _arrays_wrapper(amplitude_func, amplitude_arrays, slice_, dim, - *args, **kwargs): - amplitude_arrays[slice_, int(dim)] = amplitude_func(*args, **kwargs) + "persistence_image": persistence_image_amplitudes, + 'silhouette': silhouette_amplitudes + } def _parallel_amplitude(X, metric, metric_params, homology_dimensions, n_jobs): @@ -338,16 +420,23 @@ def _parallel_amplitude(X, metric, metric_params, homology_dimensions, n_jobs): none_dict = {dim: None for dim in homology_dimensions} samplings = effective_metric_params.pop("samplings", none_dict) step_sizes = effective_metric_params.pop("step_sizes", none_dict) + if metric in ["heat", "persistence_image"]: + parallel_kwargs = {"mmap_mode": "c"} + else: + parallel_kwargs = {} - amplitude_arrays = Parallel(n_jobs=n_jobs)( + amplitude_arrays = Parallel(n_jobs=n_jobs, **parallel_kwargs)( delayed(amplitude_func)( - _subdiagrams(X, [dim], remove_dim=True)[s], - sampling=samplings[dim], step_size=step_sizes[dim], - **effective_metric_params) + _subdiagrams(X[s], [dim], remove_dim=True), + sampling=samplings[dim], + step_size=step_sizes[dim], + **effective_metric_params + ) for dim in homology_dimensions - for s in gen_even_slices(_num_samples(X), effective_n_jobs(n_jobs))) + for s in gen_even_slices(_num_samples(X), effective_n_jobs(n_jobs)) + ) - amplitude_arrays = (np.concatenate(amplitude_arrays).reshape( - len(homology_dimensions), X.shape[0]).T) + amplitude_arrays = np.concatenate(amplitude_arrays).\ + reshape(len(homology_dimensions), len(X)).T return amplitude_arrays diff --git a/gtda/diagrams/_utils.py b/gtda/diagrams/_utils.py index 1f417a867..3274e468f 100644 --- a/gtda/diagrams/_utils.py +++ b/gtda/diagrams/_utils.py @@ -4,59 +4,127 @@ import numpy as np +def _homology_dimensions_to_sorted_ints(homology_dimensions): + return tuple( + sorted([int(dim) if dim != np.inf else dim + for dim in homology_dimensions]) + ) + + def _subdiagrams(X, homology_dimensions, remove_dim=False): - for dim in homology_dimensions: - Xs = X[X[:, :, 2] == dim] - Xs = 
Xs.reshape(X.shape[0], -1, 3) + """For each diagram in a collection, extract the subdiagrams in a given + list of homology dimensions. It is assumed that all diagrams in X contain + the same number of points in each homology dimension.""" + n_samples = len(X) + X_0 = X[0] + + def _subdiagrams_single_homology_dimension(homology_dimension): + n_features_in_dim = np.sum(X_0[:, 2] == homology_dimension) + try: + # In this case, reshape ensures copy + Xs = X[X[:, :, 2] == homology_dimension].\ + reshape(n_samples, n_features_in_dim, 3) + return Xs + except ValueError as e: + if e.args[0].lower().startswith("cannot reshape array"): + raise ValueError( + f"All persistence diagrams in the collection must have " + f"the same number of birth-death-dimension triples in any " + f"given homology dimension. This is not true in homology " + f"dimension {homology_dimension}. Trivial triples for " + f"which birth = death may be added or removed to fulfill " + f"this requirement." + ) + else: + raise e + + if len(homology_dimensions) == 1: + Xs = _subdiagrams_single_homology_dimension(homology_dimensions[0]) + else: + # np.concatenate will also create a copy + Xs = np.concatenate( + [_subdiagrams_single_homology_dimension(dim) + for dim in homology_dimensions], + axis=1 + ) if remove_dim: Xs = Xs[:, :, :2] return Xs -def _pad(X, max_diagram_sizes): - X_padded = {dim: np.pad( - X[dim], - ((0, 0), (0, max_diagram_sizes[dim] - X[dim].shape[1]), - (0, 0)), 'constant') for dim in X.keys()} - return X_padded +def _sample_image(image, diagram_pixel_coords): + # WARNING: Modifies `image` in-place + unique, counts = \ + np.unique(diagram_pixel_coords, axis=0, return_counts=True) + unique = tuple(tuple(row) for row in unique.astype(np.int).T) + image[unique] = counts -def _sort(Xs): - indices = np.argsort(Xs[:, :, 1] - Xs[:, :, 0], axis=1) - indices = np.stack([indices, indices, indices], axis=2) - Xs = np.flip(np.take_along_axis(Xs, indices, axis=1), axis=1) - return Xs +def _multirange(counts): + """Given a 1D array of positive integers, generate an array equal to + np.concatenate([np.arange(c) for c in counts]), but in a faster and more + memory-efficient way.""" + cumsum = np.cumsum(counts) + reset_index = cumsum[:-1] + incr = np.ones(cumsum[-1], dtype=np.int32) + incr[0] = 0 + # For each index in reset_index, we insert the negative value necessary + # to offset the cumsum in the last line + incr[reset_index] = 1 - counts[:-1] + incr.cumsum(out=incr) -def _sample_image(image, sampled_diag): - unique, counts = np.unique(sampled_diag, axis=0, return_counts=True) - unique = tuple(tuple(row) for row in unique.astype(np.int).T) - image[unique] = counts + return incr -def _filter(Xs, filtered_homology_dimensions, cutoff): - homology_dimensions = sorted(list(set(Xs[0, :, 2]))) - unfiltered_homology_dimensions = sorted(list( - set(homology_dimensions) - set(filtered_homology_dimensions))) +def _filter(X, filtered_homology_dimensions, cutoff): + n = len(X) + homology_dimensions = sorted(np.unique(X[0, :, 2])) + unfiltered_homology_dimensions = [dim for dim in homology_dimensions if + dim not in filtered_homology_dimensions] if len(unfiltered_homology_dimensions) == 0: - Xf = np.empty((Xs.shape[0], 0, 3), dtype=Xs.dtype) + Xuf = np.empty((n, 0, 3), dtype=X.dtype) else: - Xf = _subdiagrams(Xs, unfiltered_homology_dimensions) + Xuf = _subdiagrams(X, unfiltered_homology_dimensions) + # Compute a global 2D cutoff mask once + cutoff_mask = X[:, :, 1] - X[:, :, 0] > cutoff + Xf = [] for dim in filtered_homology_dimensions: - 
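# --- Illustrative check (standalone, NumPy only) of the `_multirange` recipe
# --- documented above: the cumulative-sum trick reproduces
# --- np.concatenate([np.arange(c) for c in counts]) without building the
# --- intermediate list. The logic is restated inline so the snippet runs on
# --- its own; `counts` is a hypothetical example.
import numpy as np

counts = np.array([3, 1, 2])
naive = np.concatenate([np.arange(c) for c in counts])  # [0 1 2 0 0 1]

cumsum = np.cumsum(counts)
incr = np.ones(cumsum[-1], dtype=np.int32)
incr[0] = 0
incr[cumsum[:-1]] = 1 - counts[:-1]  # offset the running sum at each reset
incr.cumsum(out=incr)
assert np.array_equal(incr, naive)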
Xdim = _subdiagrams(Xs, [dim]) - min_value = np.min(Xdim[:, :, 0]) - mask = (Xdim[:, :, 1] - Xdim[:, :, 0]) <= cutoff - Xdim[mask, :] = [min_value, min_value, dim] - max_points = np.max(np.sum(Xs[:, :, 1] != 0, axis=1)) - Xdim = Xdim[:, :max_points, :] - Xf = np.concatenate([Xf, Xdim], axis=1) + # Compute a 2D mask for persistence pairs in dimension dim + dim_mask = X[:, :, 2] == dim + # Need the indices relative to X of persistence triples in dimension + # dim surviving the cutoff + indices = np.nonzero(np.logical_and(dim_mask, cutoff_mask)) + if not indices[0].size: + Xdim = np.tile([0., 0., dim], (n, 1, 1)) + else: + # A unique element k is repeated N times *consecutively* in + # indices[0] iff there are exactly N valid persistence triples + # in the k-th diagram + unique, counts = np.unique(indices[0], return_counts=True) + max_n_points = np.max(counts) + # Make a global 2D array of all valid triples + X_indices = X[indices] + min_value = np.min(X_indices[:, 0]) # For padding + # Initialise the array of filtered subdiagrams in dimension m + Xdim = np.tile([min_value, min_value, dim], (n, max_n_points, 1)) + # Since repeated indices in indices[0] are consecutive and we know + # the counts per unique index, we can fill the top portion of + # each 2D array entry of Xdim with the filtered triples from the + # corresponding entry of X + Xdim[indices[0], _multirange(counts)] = X_indices + Xf.append(Xdim) + + Xf.append(Xuf) + Xf = np.concatenate(Xf, axis=1) return Xf -def _bin(X, metric, n_bins=100, **kw_args): - homology_dimensions = sorted(list(set(X[0, :, 2]))) +def _bin(X, metric, n_bins=100, homology_dimensions=None, **kw_args): + if homology_dimensions is None: + homology_dimensions = sorted(np.unique(X[0, :, 2])) # For some vectorizations, we force the values to be the same + widest sub_diags = {dim: _subdiagrams(X, [dim], remove_dim=True) for dim in homology_dimensions} @@ -89,10 +157,9 @@ def _bin(X, metric, n_bins=100, **kw_args): samplings = {} step_sizes = {} for dim in homology_dimensions: - samplings[dim], step_sizes[dim] = np.linspace(min_vals[dim], - max_vals[dim], - retstep=True, - num=n_bins) + samplings[dim], step_sizes[dim] = np.linspace( + min_vals[dim], max_vals[dim], retstep=True, num=n_bins + ) if metric in ['landscape', 'betti', 'heat', 'silhouette']: for dim in homology_dimensions: samplings[dim] = samplings[dim][:, [0], None] @@ -100,7 +167,20 @@ def _bin(X, metric, n_bins=100, **kw_args): return samplings, step_sizes -def _calculate_weights(X, weight_function, samplings, **kw_args): - weights = {dim: weight_function(samplings[dim][:, 1]) - for dim in samplings.keys()} - return weights +def _make_homology_dimensions_mapping(homology_dimensions, + homology_dimensions_ref): + """`homology_dimensions_ref` is assumed to be a sorted tuple as is e.g. 
+ :attr:`homology_dimensions_` for several transformers.""" + if homology_dimensions is None: + homology_dimensions_mapping = list(enumerate(homology_dimensions_ref)) + else: + homology_dimensions_mapping = [] + for dim in homology_dimensions: + if dim not in homology_dimensions_ref: + raise ValueError(f"All homology dimensions must be in " + f"{homology_dimensions_ref}; {dim} is not.") + else: + homology_dimensions_arr = np.array(homology_dimensions_ref) + inv_idx = np.flatnonzero(homology_dimensions_arr == dim)[0] + homology_dimensions_mapping.append((inv_idx, dim)) + return homology_dimensions_mapping diff --git a/gtda/diagrams/distance.py b/gtda/diagrams/distance.py index ae37f50b7..1ff2c3eaf 100644 --- a/gtda/diagrams/distance.py +++ b/gtda/diagrams/distance.py @@ -8,7 +8,7 @@ from sklearn.utils.validation import check_is_fitted from ._metrics import _AVAILABLE_METRICS, _parallel_pairwise -from ._utils import _bin, _calculate_weights +from ._utils import _bin, _homology_dimensions_to_sorted_ints from ..utils._docs import adapt_fit_transform_docs from ..utils.intervals import Interval from ..utils.validation import check_diagrams, validate_params @@ -34,22 +34,29 @@ class PairwiseDistance(BaseEstimator, TransformerMixin): three-dimensional array, or a single distance matrix constructed by taking norms of the vectors of distances between diagram pairs. + **Important notes**: + + - Input collections of persistence diagrams for this transformer must + satisfy certain requirements, see e.g. :meth:`fit`. + - The shape of outputs of :meth:`transform` depends on the value of the + `order` parameter. + Parameters ---------- - metric : ``'bottleneck'`` | ``'wasserstein'`` | ``'landscape'`` | \ - ``'betti'`` | ``'heat'`` | ``'persistence_image'``, | \ - ``'silhouette'``, optional, default: ``'landscape'`` + metric : ``'bottleneck'`` | ``'wasserstein'`` | ``'betti'`` | \ + ``'landscape'`` | ``'silhouette'`` | ``'heat'`` | \ + ``'persistence_image'``, optional, default: ``'landscape'`` Distance or dissimilarity function between subdiagrams: - ``'bottleneck'`` and ``'wasserstein'`` refer to the identically named perfect-matching--based notions of distance. + - ``'betti'`` refers to the :math:`L^p` distance between Betti curves. - ``'landscape'`` refers to the :math:`L^p` distance between persistence landscapes. - - ``'betti'`` refers to the :math:`L^p` distance between Betti curves. - - ``'heat'`` refers to the :math:`L^p` distance between - Gaussian-smoothed diagrams. - ``'silhouette'`` refers to the :math:`L^p` distance between silhouettes. + - ``'heat'`` refers to the :math:`L^p` distance between + Gaussian-smoothed diagrams. - ``'persistence_image'`` refers to the :math:`L^p` distance between Gaussian-smoothed diagrams represented on birth-persistence axes. @@ -58,27 +65,27 @@ class PairwiseDistance(BaseEstimator, TransformerMixin): ``None`` is equivalent to passing the defaults described below): - If ``metric == 'bottleneck'`` the only argument is `delta` (float, - default: ``0.01``). When equal to ``0.``, an exact algorithm is - used; otherwise, a faster approximate algorithm is used. + default: ``0.01``). When equal to ``0.``, an exact algorithm is used; + otherwise, a faster approximate algorithm is used. - If ``metric == 'wasserstein'`` the available arguments are `p` (float, default: ``2.``) and `delta` (float, default: ``0.01``). - Unlike the case of ``'bottleneck'``, `delta` cannot be set to - ``0.`` and an exact algorithm is not available. 
+ Unlike the case of ``'bottleneck'``, `delta` cannot be set to ``0.`` + and an exact algorithm is not available. - If ``metric == 'betti'`` the available arguments are `p` (float, default: ``2.``) and `n_bins` (int, default: ``100``). - - If ``metric == 'landscape'`` the available arguments are `p` - (float, default: ``2.``), `n_bins` (int, default: ``100``) and - `n_layers` (int, default: ``1``). - - If ``metric == 'heat'`` the available arguments are `p` - (float, default: ``2.``), `sigma` (float, default: ``1.``) and - `n_bins` (int, default: ``100``). - - If ``metric == 'silhouette'`` the available arguments are `p` - (float, default: ``2.``), `order` (float, default: ``1.``) and - `n_bins` (int, default: ``100``). + - If ``metric == 'landscape'`` the available arguments are `p` (float, + default: ``2.``), `n_bins` (int, default: ``100``) and `n_layers` + (int, default: ``1``). + - If ``metric == 'silhouette'`` the available arguments are `p` (float, + default: ``2.``), `power` (float, default: ``1.``) and `n_bins` (int, + default: ``100``). + - If ``metric == 'heat'`` the available arguments are `p` (float, + default: ``2.``), `sigma` (float, default: ``0.1``) and `n_bins` + (int, default: ``100``). - If ``metric == 'persistence_image'`` the available arguments are `p` - (float, default: ``2.``), `sigma` (float, default: ``1.``), - `n_bins` (int, default: ``100``) and `weight_function` - (callable or None, default: ``None``). + (float, default: ``2.``), `sigma` (float, default: ``0.1``), `n_bins` + (int, default: ``100``) and `weight_function` (callable or None, + default: ``None``). order : float or None, optional, default: ``2.`` If ``None``, :meth:`transform` returns for each pair of diagrams a @@ -95,9 +102,9 @@ class PairwiseDistance(BaseEstimator, TransformerMixin): ---------- effective_metric_params_ : dict Dictionary containing all information present in `metric_params` as - well as on any relevant quantities computed in :meth:`fit`. + well as relevant quantities computed in :meth:`fit`. - homology_dimensions_ : list + homology_dimensions_ : tuple Homology dimensions seen in :meth:`fit`, sorted in ascending order. See also @@ -123,7 +130,8 @@ class PairwiseDistance(BaseEstimator, TransformerMixin): 'metric': {'type': str, 'in': _AVAILABLE_METRICS.keys()}, 'order': {'type': (Real, type(None)), 'in': Interval(0, np.inf, closed='right')}, - 'metric_params': {'type': (dict, type(None))}} + 'metric_params': {'type': (dict, type(None))} + } def __init__(self, metric='landscape', metric_params=None, order=2., n_jobs=None): @@ -146,6 +154,9 @@ def fit(self, X, y=None): Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of `X`. 
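# --- Illustrative sketch of the documented API on a hypothetical toy input:
# --- every sample below has exactly one triple in homology dimension 0 and one
# --- in dimension 1, as required. With `order=None` one distance matrix per
# --- dimension is returned; with a float `order`, the per-dimension distances
# --- are collapsed into a single matrix.
import numpy as np
from gtda.diagrams import PairwiseDistance

X_fit = np.array([[[0., 1., 0.], [0., 2., 1.]],
                  [[0., 3., 0.], [1., 2., 1.]]])
X_new = np.array([[[0., 2., 0.], [0., 1., 1.]]])

pd_vector = PairwiseDistance(metric='wasserstein', order=None).fit(X_fit)
print(pd_vector.transform(X_new).shape)  # (1, 2, 2)
pd_scalar = PairwiseDistance(metric='wasserstein', order=2.).fit(X_fit)
print(pd_scalar.transform(X_new).shape)  # (1, 2)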
y : None There is no need for a target in a transformer, yet the pipeline @@ -167,15 +178,23 @@ def fit(self, X, y=None): validate_params( self.effective_metric_params_, _AVAILABLE_METRICS[self.metric]) - self.homology_dimensions_ = sorted(set(X[0, :, 2])) + # Find the unique homology dimensions in the 3D array X passed to `fit` + # assuming that they can all be found in its zero-th entry + homology_dimensions_fit = np.unique(X[0, :, 2]) + self.homology_dimensions_ = \ + _homology_dimensions_to_sorted_ints(homology_dimensions_fit) self.effective_metric_params_['samplings'], \ self.effective_metric_params_['step_sizes'] = \ - _bin(X, metric=self.metric, **self.effective_metric_params_) + _bin(X, self.metric, **self.effective_metric_params_) if self.metric == 'persistence_image': - self.effective_metric_params_['weights'] = \ - _calculate_weights(X, **self.effective_metric_params_) + weight_function = self.effective_metric_params_.get( + 'weight_function', None + ) + weight_function = \ + np.ones_like if weight_function is None else weight_function + self.effective_metric_params_['weight_function'] = weight_function self._X = X return self @@ -190,6 +209,9 @@ def transform(self, X, y=None): Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of `X`. y : None There is no need for a target in a transformer, yet the pipeline @@ -197,9 +219,9 @@ def transform(self, X, y=None): Returns ------- - Xt : ndarray of shape (n_samples_fit, n_samples, \ + Xt : ndarray of shape (n_samples, n_samples_fit, \ n_homology_dimensions) if `order` is ``None``, else \ - (n_samples_fit, n_samples) + (n_samples, n_samples_fit) Distance matrix or collection of distance matrices between diagrams in `X` and diagrams seen in :meth:`fit`. In the second case, index i along axis 2 corresponds to the i-th @@ -207,14 +229,9 @@ def transform(self, X, y=None): """ check_is_fitted(self) - X = check_diagrams(X, copy=True) - - if np.array_equal(X, self._X): - X2 = None - else: - X2 = X + Xt = check_diagrams(X, copy=True) - Xt = _parallel_pairwise(self._X, X2, self.metric, + Xt = _parallel_pairwise(Xt, self._X, self.metric, self.effective_metric_params_, self.homology_dimensions_, self.n_jobs) diff --git a/gtda/diagrams/features.py b/gtda/diagrams/features.py index 3d10db805..7ce885a56 100644 --- a/gtda/diagrams/features.py +++ b/gtda/diagrams/features.py @@ -5,12 +5,14 @@ import numpy as np from joblib import Parallel, delayed, effective_n_jobs +from scipy.stats import entropy from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils import gen_even_slices from sklearn.utils.validation import check_is_fitted from ._metrics import _AVAILABLE_AMPLITUDE_METRICS, _parallel_amplitude -from ._utils import _subdiagrams, _bin, _calculate_weights +from ._features import _AVAILABLE_POLYNOMIALS, _implemented_polynomial_recipes +from ._utils import _subdiagrams, _bin, _homology_dimensions_to_sorted_ints from ..utils._docs import adapt_fit_transform_docs from ..utils.intervals import Interval from ..utils.validation import validate_params, check_diagrams @@ -21,14 +23,36 @@ class PersistenceEntropy(BaseEstimator, TransformerMixin): """:ref:`Persistence entropies ` of persistence diagrams. 
- Given a persistence diagrams consisting of birth-death-dimension triples + Given a persistence diagram consisting of birth-death-dimension triples [b, d, q], subdiagrams corresponding to distinct homology dimensions are considered separately, and their respective persistence entropies are - calculated as the (base e) entropies of the collections of differences - d - b, normalized by the sum of all such differences. + calculated as the (base 2) Shannon entropies of the collections of + differences d - b ("lifetimes"), normalized by the sum of all such + differences. Optionally, these entropies can be normalized according to a + simple heuristic, see `normalize`. + + **Important notes**: + + - Input collections of persistence diagrams for this transformer must + satisfy certain requirements, see e.g. :meth:`fit`. + - By default, persistence subdiagrams containing only triples with zero + lifetime will have corresponding (normalized) entropies computed as + ``numpy.nan``. To avoid this, set a value of `nan_fill_value` + different from ``None``. Parameters ---------- + normalize : bool, optional, default: ``False`` + When ``True``, the persistence entropy of each diagram is normalized by + the logarithm of the sum of lifetimes of all points in the diagram. + Can aid comparison between diagrams in an input collection when these + have different numbers of (non-trivial) points. [1]_ + + nan_fill_value : float or None, optional, default: ``-1.`` + If a float, (normalized) persistence entropies initially computed as + ``numpy.nan`` are replaced with this value. If ``None``, these values + are left as ``numpy.nan``. + n_jobs : int or None, optional, default: ``None`` The number of jobs to use for the computation. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all @@ -36,25 +60,44 @@ class PersistenceEntropy(BaseEstimator, TransformerMixin): Attributes ---------- - homology_dimensions_ : list + homology_dimensions_ : tuple Homology dimensions seen in :meth:`fit`, sorted in ascending order. See also -------- - BettiCurve, PersistenceLandscape, HeatKernel, Amplitude, \ - PersistenceImage, PairwiseDistance, Silhouette, \ - gtda.homology.VietorisRipsPersistence + NumberOfPoints, Amplitude, BettiCurve, PersistenceLandscape, HeatKernel, \ + Silhouette, PersistenceImage + + References + ---------- + .. [1] A. Myers, E. Munch, and F. A. Khasawneh, "Persistent Homology of + Complex Networks for Dynamic State Detection"; *Phys. Rev. E* + **100**, 022314, 2019; `DOI: 10.1103/PhysRevE.100.022314 + `_. 
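# --- Illustrative numbers for the definition above (standalone, NumPy/SciPy):
# --- the persistence entropy of one subdiagram is the base-2 Shannon entropy
# --- of its lifetimes after normalizing them to sum to 1; `normalize=True`
# --- further divides by log2 of the total lifetime. The lifetimes below are
# --- hypothetical.
import numpy as np
from scipy.stats import entropy

lifetimes = np.array([1., 1., 2.])  # d - b for one toy subdiagram
h = entropy(lifetimes, base=2)      # scipy rescales the input to sum to 1
probs = lifetimes / lifetimes.sum()
assert np.isclose(h, -np.sum(probs * np.log2(probs)))  # h == 1.5
h_normalized = h / np.log2(lifetimes.sum())            # == 0.75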
""" - def __init__(self, n_jobs=None): + _hyperparameters = { + 'normalize': {'type': bool}, + 'nan_fill_value': {'type': (Real, type(None))} + } + + def __init__(self, normalize=False, nan_fill_value=-1., n_jobs=None): + self.normalize = normalize + self.nan_fill_value = nan_fill_value self.n_jobs = n_jobs - def _persistence_entropy(self, X): + @staticmethod + def _persistence_entropy(X, normalize=False, nan_fill_value=None): X_lifespan = X[:, :, 1] - X[:, :, 0] - X_normalized = X_lifespan / np.sum(X_lifespan, axis=1).reshape(-1, 1) - return - np.sum(np.nan_to_num( - X_normalized * np.log(X_normalized)), axis=1).reshape(-1, 1) + X_entropy = entropy(X_lifespan, base=2, axis=1) + if normalize: + lifespan_sums = np.sum(X_lifespan, axis=1) + X_entropy /= np.log2(lifespan_sums) + if nan_fill_value is not None: + np.nan_to_num(X_entropy, nan=nan_fill_value, copy=False) + X_entropy = X_entropy[:, None] + return X_entropy def fit(self, X, y=None): """Store all observed homology dimensions in @@ -69,6 +112,9 @@ def fit(self, X, y=None): Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of `X`. y : None There is no need for a target in a transformer, yet the pipeline @@ -80,8 +126,14 @@ def fit(self, X, y=None): """ X = check_diagrams(X) + validate_params( + self.get_params(), self._hyperparameters, exclude=['n_jobs']) - self.homology_dimensions_ = sorted(set(X[0, :, 2])) + # Find the unique homology dimensions in the 3D array X passed to `fit` + # assuming that they can all be found in its zero-th entry + homology_dimensions_fit = np.unique(X[0, :, 2]) + self.homology_dimensions_ = \ + _homology_dimensions_to_sorted_ints(homology_dimensions_fit) self._n_dimensions = len(self.homology_dimensions_) return self @@ -95,6 +147,9 @@ def transform(self, X, y=None): Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of `X`. y : None There is no need for a target in a transformer, yet the pipeline @@ -104,8 +159,8 @@ def transform(self, X, y=None): ------- Xt : ndarray of shape (n_samples, n_homology_dimensions) Persistence entropies: one value per sample and per homology - dimension seen in :meth:`fit`. Index i along axis 1 corresponds - to the i-th homology dimension in :attr:`homology_dimensions_`. + dimension seen in :meth:`fit`. Index i along axis 1 corresponds to + the i-th homology dimension in :attr:`homology_dimensions_`. 
""" check_is_fitted(self) @@ -113,18 +168,23 @@ def transform(self, X, y=None): with np.errstate(divide='ignore', invalid='ignore'): Xt = Parallel(n_jobs=self.n_jobs)( - delayed(self._persistence_entropy)(_subdiagrams(X, [dim])[s]) + delayed(self._persistence_entropy)( + _subdiagrams(X[s], [dim]), + normalize=self.normalize, + nan_fill_value=self.nan_fill_value + ) for dim in self.homology_dimensions_ - for s in gen_even_slices( - X.shape[0], effective_n_jobs(self.n_jobs)) - ) - Xt = np.concatenate(Xt).reshape(self._n_dimensions, X.shape[0]).T + for s in gen_even_slices(len(X), effective_n_jobs(self.n_jobs)) + ) + Xt = np.concatenate(Xt).reshape(self._n_dimensions, len(X)).T + return Xt @adapt_fit_transform_docs class Amplitude(BaseEstimator, TransformerMixin): - """:ref:`Amplitudes ` of persistence diagrams. + """:ref:`Amplitudes ` of persistence + diagrams. For each persistence diagram in a collection, a vector of amplitudes or a single scalar amplitude is calculated according to the following steps: @@ -135,57 +195,64 @@ class Amplitude(BaseEstimator, TransformerMixin): parameters `metric` and `metric_params`. This gives a vector of amplitudes, :math:`\\mathbf{a} = (a_{q_1}, \\ldots, a_{q_n})` where the :math:`q_i` range over the available homology dimensions. - 3. The final result is either :math:`\\mathbf{a}` itself or - a norm of :math:`\\mathbf{a}`, specified by the parameter `order`. + 3. The final result is either :math:`\\mathbf{a}` itself or a norm of + :math:`\\mathbf{a}`, specified by the parameter `order`. + + **Important notes**: + + - Input collections of persistence diagrams for this transformer must + satisfy certain requirements, see e.g. :meth:`fit`. + - The shape of outputs of :meth:`transform` depends on the value of the + `order` parameter. Parameters ---------- - metric : ``'bottleneck'`` | ``'wasserstein'`` | ``'landscape'`` | \ - ``'betti'`` | ``'heat'`` | ``'silhouette'`` | \ + metric : ``'bottleneck'`` | ``'wasserstein'`` | ``'betti'`` | \ + ``'landscape'`` | ``'silhouette'`` | ``'heat'`` | \ ``'persistence_image'``, optional, default: ``'landscape'`` - Distance or dissimilarity function used to define the amplitude of - a subdiagram as its distance from the (trivial) diagonal diagram: + Distance or dissimilarity function used to define the amplitude of a + subdiagram as its distance from the (trivial) diagonal diagram: - ``'bottleneck'`` and ``'wasserstein'`` refer to the identically named perfect-matching--based notions of distance. + - ``'betti'`` refers to the :math:`L^p` distance between Betti curves. - ``'landscape'`` refers to the :math:`L^p` distance between persistence landscapes. - - ``'betti'`` refers to the :math:`L^p` distance between Betti curves. - - ``'heat'`` refers to the :math:`L^p` distance between - Gaussian-smoothed diagrams. - ``'silhouette'`` refers to the :math:`L^p` distance between silhouettes. + - ``'heat'`` refers to the :math:`L^p` distance between + Gaussian-smoothed diagrams. - ``'persistence_image'`` refers to the :math:`L^p` distance between Gaussian-smoothed diagrams represented on birth-persistence axes. metric_params : dict or None, optional, default: ``None`` - Additional keyword arguments for the metric function (passing - ``None`` is equivalent to passing the defaults described below): + Additional keyword arguments for the metric function (passing ``None`` + is equivalent to passing the defaults described below): - If ``metric == 'bottleneck'`` there are no available arguments. 
- If ``metric == 'wasserstein'`` the only argument is `p` (float, default: ``2.``). - - If ``metric == 'landscape'`` the available arguments are `p` - (float, default: ``2.``), `n_bins` (int, default: ``100``) and - `n_layers` (int, default: ``1``). - If ``metric == 'betti'`` the available arguments are `p` (float, default: ``2.``) and `n_bins` (int, default: ``100``). + - If ``metric == 'landscape'`` the available arguments are `p` (float, + default: ``2.``), `n_bins` (int, default: ``100``) and `n_layers` + (int, default: ``1``). + - If ``metric == 'silhouette'`` the available arguments are `p` (float, + default: ``2.``), `power` (float, default: ``1.``) and `n_bins` (int, + default: ``100``). - If ``metric == 'heat'`` the available arguments are `p` (float, - default: ``2.``), `sigma` (float, default: ``1.``) and `n_bins` + default: ``2.``), `sigma` (float, default: ``0.1``) and `n_bins` (int, default: ``100``). - - If ``metric == 'silhouette'`` the available arguments are `p` - (float, default: ``2.``), `order` (float, default: ``1.``) and - `n_bins` (int, default: ``100``). - If ``metric == 'persistence_image'`` the available arguments are `p` - (float, default: ``2.``), `sigma` (float, default: ``1.``), - `n_bins` (int, default: ``100``) and `weight_function` - (callable or None, default: ``None``). + (float, default: ``2.``), `sigma` (float, default: ``0.1``), `n_bins` + (int, default: ``100``) and `weight_function` (callable or None, + default: ``None``). - order : float or None, optional, default: ``2.`` + order : float or None, optional, default: ``None`` If ``None``, :meth:`transform` returns for each diagram a vector of amplitudes corresponding to the dimensions in - :attr:`homology_dimensions_`. Otherwise, the :math:`p`-norm of - these vectors with :math:`p` equal to `order` is taken. + :attr:`homology_dimensions_`. Otherwise, the :math:`p`-norm of these + vectors with :math:`p` equal to `order` is taken. n_jobs : int or None, optional, default: ``None`` The number of jobs to use for the computation. ``None`` means 1 unless @@ -196,17 +263,15 @@ class Amplitude(BaseEstimator, TransformerMixin): ---------- effective_metric_params_ : dict Dictionary containing all information present in `metric_params` as - well as on any relevant quantities computed in :meth:`fit`. + well as relevant quantities computed in :meth:`fit`. - homology_dimensions_ : list + homology_dimensions_ : tuple Homology dimensions seen in :meth:`fit`, sorted in ascending order. See also -------- - PairwiseDistance, Scaler, Filtering, \ - BettiCurve, PersistenceLandscape, \ - HeatKernel, Silhouette, \ - gtda.homology.VietorisRipsPersistence + NumberOfPoints, PersistenceEntropy, PairwiseDistance, Scaler, Filtering, \ + BettiCurve, PersistenceLandscape, HeatKernel, Silhouette, PersistenceImage Notes ----- @@ -220,9 +285,10 @@ class Amplitude(BaseEstimator, TransformerMixin): 'metric': {'type': str, 'in': _AVAILABLE_AMPLITUDE_METRICS.keys()}, 'order': {'type': (Real, type(None)), 'in': Interval(0, np.inf, closed='right')}, - 'metric_params': {'type': (dict, type(None))}} + 'metric_params': {'type': (dict, type(None))} + } - def __init__(self, metric='landscape', metric_params=None, order=2., + def __init__(self, metric='landscape', metric_params=None, order=None, n_jobs=None): self.metric = metric self.metric_params = metric_params @@ -243,6 +309,9 @@ def fit(self, X, y=None): Input data. 
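# --- Illustrative sketch of the documented behaviour on a hypothetical toy
# --- input: `order=None` yields one amplitude per homology dimension, while a
# --- float `order` collapses that vector to a single norm per diagram.
import numpy as np
from gtda.diagrams import Amplitude

X = np.array([[[0., 1., 0.], [0., 2., 1.]],
              [[0., 3., 0.], [1., 2., 1.]]])

amplitude_vectors = Amplitude(metric='betti', order=None).fit_transform(X)  # (2, 2)
amplitude_scalars = Amplitude(metric='betti', order=2.).fit_transform(X)    # (2, 1)
assert np.allclose(np.linalg.norm(amplitude_vectors, axis=1),
                   amplitude_scalars.ravel())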
Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of X. y : None There is no need for a target in a transformer, yet the pipeline @@ -264,15 +333,23 @@ def fit(self, X, y=None): validate_params(self.effective_metric_params_, _AVAILABLE_AMPLITUDE_METRICS[self.metric]) - self.homology_dimensions_ = sorted(set(X[0, :, 2])) + # Find the unique homology dimensions in the 3D array X passed to `fit` + # assuming that they can all be found in its zero-th entry + homology_dimensions_fit = np.unique(X[0, :, 2]) + self.homology_dimensions_ = \ + _homology_dimensions_to_sorted_ints(homology_dimensions_fit) self.effective_metric_params_['samplings'], \ self.effective_metric_params_['step_sizes'] = \ - _bin(X, metric=self.metric, **self.effective_metric_params_) + _bin(X, self.metric, **self.effective_metric_params_) if self.metric == 'persistence_image': - self.effective_metric_params_['weights'] = \ - _calculate_weights(X, **self.effective_metric_params_) + weight_function = self.effective_metric_params_.get( + 'weight_function', None + ) + weight_function = \ + np.ones_like if weight_function is None else weight_function + self.effective_metric_params_['weight_function'] = weight_function return self @@ -285,6 +362,9 @@ def transform(self, X, y=None): Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of X. y : None There is no need for a target in a transformer, yet the pipeline @@ -295,8 +375,8 @@ def transform(self, X, y=None): Xt : ndarray of shape (n_samples, n_homology_dimensions) if `order` \ is ``None``, else (n_samples, 1) Amplitudes or amplitude vectors of the diagrams in `X`. In the - second case, index i along axis 1 corresponds to the i-th - homology dimension in :attr:`homology_dimensions_`. + second case, index i along axis 1 corresponds to the i-th homology + dimension in :attr:`homology_dimensions_`. """ check_is_fitted(self) @@ -306,7 +386,333 @@ def transform(self, X, y=None): self.effective_metric_params_, self.homology_dimensions_, self.n_jobs) - if self.order is None: - return Xt - Xt = np.linalg.norm(Xt, axis=1, ord=self.order).reshape(-1, 1) + if self.order is not None: + Xt = np.linalg.norm(Xt, axis=1, ord=self.order).reshape(-1, 1) + + return Xt + + +@adapt_fit_transform_docs +class NumberOfPoints(BaseEstimator, TransformerMixin): + """Number of off-diagonal points in persistence diagrams, per homology + dimension. + + Given a persistence diagram consisting of birth-death-dimension triples + [b, d, q], subdiagrams corresponding to distinct homology dimensions are + considered separately, and their respective numbers of off-diagonal points + are calculated. + + **Important note**: + + - Input collections of persistence diagrams for this transformer must + satisfy certain requirements, see e.g. :meth:`fit`. + + Parameters + ---------- + n_jobs : int or None, optional, default: ``None`` + The number of jobs to use for the computation. 
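# --- Illustrative sketch (standalone, NumPy only) of the counting rule used by
# --- this transformer: zero-lifetime triples, i.e. diagonal padding, are not
# --- counted. The toy array is hypothetical; inside the class the count is
# --- taken per homology dimension after extracting subdiagrams.
import numpy as np

diagrams = np.array([[[0., 1., 0.], [0., 0., 0.], [1., 3., 1.]],
                     [[0., 2., 0.], [0., 5., 0.], [2., 2., 1.]]])
n_points = np.count_nonzero(diagrams[:, :, 1] - diagrams[:, :, 0], axis=1)
assert n_points.tolist() == [2, 2]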
``None`` means 1 unless + in a :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. + + Attributes + ---------- + homology_dimensions_ : list + Homology dimensions seen in :meth:`fit`, sorted in ascending order. + + See also + -------- + PersistenceEntropy, Amplitude, BettiCurve, PersistenceLandscape, + HeatKernel, Silhouette, PersistenceImage + + """ + + def __init__(self, n_jobs=None): + self.n_jobs = n_jobs + + @staticmethod + def _number_points(X): + return np.count_nonzero(X[:, :, 1] - X[:, :, 0], axis=1) + + def fit(self, X, y=None): + """Store all observed homology dimensions in + :attr:`homology_dimensions_`. Then, return the estimator. + + This method is here to implement the usual scikit-learn API and hence + work in pipelines. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features, 3) + Input data. Array of persistence diagrams, each a collection of + triples [b, d, q] representing persistent topological features + through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of `X`. + + y : None + There is no need for a target in a transformer, yet the pipeline + API requires this parameter. + + Returns + ------- + self : object + + """ + X = check_diagrams(X) + + # Find the unique homology dimensions in the 3D array X passed to `fit` + # assuming that they can all be found in its zero-th entry + homology_dimensions_fit = np.unique(X[0, :, 2]) + self.homology_dimensions_ = \ + _homology_dimensions_to_sorted_ints(homology_dimensions_fit) + self._n_dimensions = len(self.homology_dimensions_) + + return self + + def transform(self, X, y=None): + """Compute a vector of numbers of off-diagonal points for each diagram + in `X`. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features, 3) + Input data. Array of persistence diagrams, each a collection of + triples [b, d, q] representing persistent topological features + through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of `X`. + + y : None + There is no need for a target in a transformer, yet the pipeline + API requires this parameter. + + Returns + ------- + Xt : ndarray of shape (n_samples, n_homology_dimensions) + Number of points: one value per sample and per homology dimension + seen in :meth:`fit`. Index i along axis 1 corresponds to the i-th + homology dimension in :attr:`homology_dimensions_`. + + """ + check_is_fitted(self) + X = check_diagrams(X) + + Xt = Parallel(n_jobs=self.n_jobs)( + delayed(self._number_points)(_subdiagrams(X, [dim])[s]) + for dim in self.homology_dimensions_ + for s in gen_even_slices(len(X), effective_n_jobs(self.n_jobs)) + ) + Xt = np.concatenate(Xt).reshape(self._n_dimensions, len(X)).T + + return Xt + + +@adapt_fit_transform_docs +class ComplexPolynomial(BaseEstimator, TransformerMixin): + """Coefficients of complex polynomials whose roots are obtained from points + in persistence diagrams. + + Given a persistence diagram consisting of birth-death-dimension triples + [b, d, q], subdiagrams corresponding to distinct homology dimensions are + first considered separately. 
For each subdiagram, the polynomial whose + roots are complex numbers obtained from its birth-death pairs is + computed, and its :attr:`n_coefficients_` highest-degree complex + coefficients excluding the top one are stored into a single real vector + by concatenating the vector of all real parts with the vector of all + imaginary parts [1]_ (if not enough coefficients are available to form a + vector of the required length, padding with zeros is performed). Finally, + all such vectors coming from different subdiagrams are concatenated to + yield a single vector for the diagram. + + There are three possibilities for mapping birth-death pairs :math:`(b, d)` + to complex polynomial roots. They are: + + .. math:: + :nowrap: + + \\begin{gather*} + R(b, d) = b + \\mathrm{i} d, \\\\ + S(b, d) = \\frac{d - b}{\\sqrt{2} r} (b + \\mathrm{i} d), \\\\ + T(b, d) = \\frac{d - b}{2} [\\cos{r} - \\sin{r} + \ + \\mathrm{i}(\\cos{r} + \\sin{r})], + \\end{gather*} + + where :math:`r = \\sqrt{b^2 + d^2}`. + + **Important note**: + + - Input collections of persistence diagrams for this transformer must + satisfy certain requirements, see e.g. :meth:`fit`. + + Parameters + ---------- + polynomial_type : ``'R'`` | ``'S'`` | ``'T'``, optional, default: ``'R'`` + Type of complex polynomial to compute. + + n_coefficients : list, int or None, optional, default: ``10`` + Number of complex coefficients per homology dimension. If an int then + the number of coefficients will be equal to that value for each + homology dimension. If ``None`` then, for each homology dimension in + the collection of persistence diagrams seen in :meth:`fit`, the number + of complex coefficients is defined to be the largest number of + off-diagonal points seen among all subdiagrams in that homology + dimension, minus one. + + n_jobs : int or None, optional, default: ``None`` + The number of jobs to use for the computation. ``None`` means 1 unless + in a :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. + + Attributes + ---------- + homology_dimensions_ : list + Homology dimensions seen in :meth:`fit`, sorted in ascending order. + + n_coefficients_ : list + Effective number of complex coefficients per homology dimension. Set in + :meth:`fit`. + + See also + -------- + Amplitude, PersistenceEntropy + + References + ---------- + .. [1] B. Di Fabio and M. Ferri, "Comparing Persistence Diagrams Through + Complex Vectors"; in *Image Analysis and Processing — ICIAP 2015*, + 2015; `DOI: 10.1007/978-3-319-23231-7_27 + _. + + """ + _hyperparameters = { + 'n_coefficients': {'type': (int, type(None), list), + 'in': Interval(1, np.inf, closed='left'), + 'of': {'type': int, + 'in': Interval(1, np.inf, closed='left')}}, + 'polynomial_type': {'type': str, + 'in': _AVAILABLE_POLYNOMIALS.keys()} + } + + def __init__(self, n_coefficients=10, polynomial_type='R', n_jobs=None): + self.n_coefficients = n_coefficients + self.polynomial_type = polynomial_type + self.n_jobs = n_jobs + + def fit(self, X, y=None): + """Store all observed homology dimensions in + :attr:`homology_dimensions_` and compute :attr:`n_coefficients_`. Then, + return the estimator. + + This method is here to implement the usual scikit-learn API and hence + work in pipelines. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features, 3) + Input data. Array of persistence diagrams, each a collection of + triples [b, d, q] representing persistent topological features + through their birth (b), death (d) and homology dimension (q). 
+ It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of `X`. + + y : None + There is no need for a target in a transformer, yet the pipeline + API requires this parameter. + + Returns + ------- + self : object + + """ + validate_params( + self.get_params(), self._hyperparameters, exclude=['n_jobs']) + X = check_diagrams(X) + + # Find the unique homology dimensions in the 3D array X passed to `fit` + # assuming that they can all be found in its zero-th entry + homology_dimensions_fit, counts = np.unique(X[0, :, 2], + return_counts=True) + self.homology_dimensions_ = \ + _homology_dimensions_to_sorted_ints(homology_dimensions_fit) + + _n_homology_dimensions = len(self.homology_dimensions_) + _homology_dimensions_counts = dict(zip(homology_dimensions_fit, + counts)) + if self.n_coefficients is None: + self.n_coefficients_ = [_homology_dimensions_counts[dim] + for dim in self.homology_dimensions_] + elif type(self.n_coefficients) == list: + if len(self.n_coefficients) != _n_homology_dimensions: + raise ValueError( + f'`n_coefficients` has been passed as a list of length ' + f'{len(self.n_coefficients)} while diagrams in `X` have ' + f'{_n_homology_dimensions} homology dimensions.' + ) + self.n_coefficients_ = self.n_coefficients + else: + self.n_coefficients_ = \ + [self.n_coefficients] * _n_homology_dimensions + + self._polynomial_function = \ + _implemented_polynomial_recipes[self.polynomial_type] + + return self + + def _complex_polynomial(self, X, n_coefficients): + Xt = np.zeros(2 * n_coefficients,) + X = X[X[:, 0] != X[:, 1]] + + roots = self._polynomial_function(X) + coefficients = np.poly(roots) + + coefficients = np.array(coefficients[1:]) + dimension = min(n_coefficients, coefficients.shape[0]) + Xt[:dimension] = coefficients[:dimension].real + Xt[n_coefficients:n_coefficients + dimension] = \ + coefficients[:dimension].imag + + return Xt + + def transform(self, X, y=None): + """Compute vectors of real and imaginary parts of coefficients of + complex polynomials obtained from each diagram in `X`. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features, 3) + Input data. Array of persistence diagrams, each a collection of + triples [b, d, q] representing persistent topological features + through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of `X`. + + y : None + There is no need for a target in a transformer, yet the pipeline + API requires this parameter. + + Returns + ------- + Xt : ndarray of shape (n_samples, n_homology_dimensions * 2 \ + * n_coefficients_) + Polynomial coefficients: real and imaginary parts of the complex + polynomials obtained in each homology dimension from each diagram + in `X`. 
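# --- Illustrative sketch (standalone, NumPy only) of the 'R' recipe described
# --- above: off-diagonal pairs (b, d) become roots b + i*d, `np.poly` gives
# --- the monic polynomial with those roots, and the sub-leading coefficients
# --- are split into real and imaginary parts, zero-padded to a fixed length.
# --- The pairs and `n_coefficients` below are hypothetical.
import numpy as np

pairs = np.array([[0., 1.], [0.5, 2.]])  # (b, d) pairs in one dimension
n_coefficients = 3
roots = pairs[:, 0] + 1j * pairs[:, 1]   # the 'R' mapping
coefficients = np.poly(roots)[1:]        # drop the leading 1
features = np.zeros(2 * n_coefficients)
k = min(n_coefficients, len(coefficients))
features[:k] = coefficients[:k].real
features[n_coefficients:n_coefficients + k] = coefficients[:k].imag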
+ + """ + check_is_fitted(self) + Xt = check_diagrams(X, copy=True) + + Xt = Parallel(n_jobs=self.n_jobs)( + delayed(self._complex_polynomial)( + _subdiagrams(Xt[[s]], [dim], remove_dim=True)[0], + self.n_coefficients_[d]) + for s in range(len(X)) + for d, dim in enumerate(self.homology_dimensions_) + ) + Xt = np.concatenate(Xt).reshape(len(X), -1) + return Xt diff --git a/gtda/diagrams/preprocessing.py b/gtda/diagrams/preprocessing.py index 56276ac89..1f1d0464e 100644 --- a/gtda/diagrams/preprocessing.py +++ b/gtda/diagrams/preprocessing.py @@ -9,7 +9,7 @@ from sklearn.utils.validation import check_is_fitted from ._metrics import _AVAILABLE_AMPLITUDE_METRICS, _parallel_amplitude -from ._utils import _sort, _filter, _bin, _calculate_weights +from ._utils import _filter, _bin, _homology_dimensions_to_sorted_ints from ..base import PlotterMixin from ..plotting.persistence_diagrams import plot_diagram from ..utils._docs import adapt_fit_transform_docs @@ -89,7 +89,7 @@ def transform(self, X, y=None): return Xt @staticmethod - def plot(Xt, sample=0): + def plot(Xt, sample=0, plotly_params=None): """Plot a sample from a collection of persistence diagrams. Parameters @@ -101,9 +101,23 @@ def plot(Xt, sample=0): sample : int, optional, default: ``0`` Index of the sample in `Xt` to be plotted. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"traces"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. + """ return plot_diagram( - Xt[sample], homology_dimensions=[np.inf]) + Xt[sample], homology_dimensions=[np.inf], + plotly_params=plotly_params + ) @adapt_fit_transform_docs @@ -118,18 +132,24 @@ class Scaler(BaseEstimator, TransformerMixin, PlotterMixin): The value of :attr:`scale_` depends on two things: - A way of computing, for each homology dimension, the :ref:`amplitude - ` in that dimension of a persistence diagram consisting - of birth-death-dimension triples [b, d, q]. Together, `metric` and - `metric_params` define this in the same way as in :class:`Amplitude`. + ` in that dimension of a + persistence diagram consisting of birth-death-dimension triples + [b, d, q]. Together, `metric` and `metric_params` define this in the + same way as in :class:`Amplitude`. - A scalar-valued function which is applied to the resulting two-dimensional array of amplitudes (one per diagram and homology dimension) to obtain :attr:`scale_`. + **Important note**: + + - Input collections of persistence diagrams for this transformer must + satisfy certain requirements, see e.g. :meth:`fit`. + Parameters ---------- metric : ``'bottleneck'`` | ``'wasserstein'`` | ``'betti'`` | \ - ``'landscape'`` | ``'heat'`` | ``'persistence_image'`` | \ - ``'silhouette'``, optional, default: ``'bottleneck'`` + ``'landscape'`` |``'silhouette'`` | ``'heat'`` | \ + ``'persistence_image'``, optional, default: ``'bottleneck'`` See the corresponding parameter in :class:`Amplitude`. metric_params : dict or None, optional, default: ``None`` @@ -140,17 +160,17 @@ class Scaler(BaseEstimator, TransformerMixin, PlotterMixin): amplitude vectors in :meth:`fit`. Must map 2D arrays to scalars. n_jobs : int or None, optional, default: ``None`` - The number of jobs to use for the computation. 
``None`` means 1 - unless in a :obj:`joblib.parallel_backend` context. ``-1`` means - using all processors. + The number of jobs to use for the computation. ``None`` means 1 unless + in a :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. Attributes ---------- effective_metric_params_ : dict Dictionary containing all information present in `metric_params` as - well as on any relevant quantities computed in :meth:`fit`. + well as relevant quantities computed in :meth:`fit`. - homology_dimensions_ : list + homology_dimensions_ : tuple Homology dimensions seen in :meth:`fit`, sorted in ascending order. scale_ : float @@ -176,7 +196,7 @@ class Scaler(BaseEstimator, TransformerMixin, PlotterMixin): 'metric': {'type': str, 'in': _AVAILABLE_AMPLITUDE_METRICS.keys()}, 'metric_params': {'type': (dict, type(None))}, 'function': {'type': (FunctionType, type(None))} - } + } def __init__(self, metric='bottleneck', metric_params=None, function=np.max, n_jobs=None): @@ -196,6 +216,9 @@ def fit(self, X, y=None): Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of X. y : None There is no need for a target in a transformer, yet the pipeline @@ -217,15 +240,23 @@ def fit(self, X, y=None): validate_params(self.effective_metric_params_, _AVAILABLE_AMPLITUDE_METRICS[self.metric]) - self.homology_dimensions_ = sorted(set(X[0, :, 2])) + # Find the unique homology dimensions in the 3D array X passed to `fit` + # assuming that they can all be found in its zero-th entry + homology_dimensions_fit = np.unique(X[0, :, 2]) + self.homology_dimensions_ = \ + _homology_dimensions_to_sorted_ints(homology_dimensions_fit) self.effective_metric_params_['samplings'], \ self.effective_metric_params_['step_sizes'] = \ - _bin(X, metric=self.metric, **self.effective_metric_params_) + _bin(X, self.metric, **self.effective_metric_params_) if self.metric == 'persistence_image': - self.effective_metric_params_['weights'] = \ - _calculate_weights(X, **self.effective_metric_params_) + weight_function = self.effective_metric_params_.get( + 'weight_function', None + ) + weight_function = \ + np.ones_like if weight_function is None else weight_function + self.effective_metric_params_['weight_function'] = weight_function amplitude_array = _parallel_amplitude(X, self.metric, self.effective_metric_params_, @@ -244,6 +275,9 @@ def transform(self, X, y=None): Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of X. y : None There is no need for a target in a transformer, yet the pipeline @@ -257,18 +291,18 @@ def transform(self, X, y=None): """ check_is_fitted(self) - Xs = check_diagrams(X) + Xs = check_diagrams(X, copy=True) Xs[:, :, :2] /= self.scale_ return Xs def inverse_transform(self, X): - """Scale back the data to the original representation. Multiplies - by the scale found in :meth:`fit`. + """Scale back the data to the original representation. Multiplies by + the scale found in :meth:`fit`. 
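# --- Illustrative sketch of the documented round trip on a hypothetical toy
# --- input: `transform` divides birth and death values by `scale_`, and
# --- `inverse_transform` multiplies them back, so composing the two recovers
# --- the original diagrams.
import numpy as np
from gtda.diagrams import Scaler

X = np.array([[[0., 1., 0.], [0., 4., 1.]],
              [[0., 2., 0.], [1., 3., 1.]]])

scaler = Scaler(metric='bottleneck').fit(X)
X_scaled = scaler.transform(X)
assert np.allclose(X_scaled[:, :, :2], X[:, :, :2] / scaler.scale_)
assert np.allclose(scaler.inverse_transform(X_scaled), X)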
Parameters ---------- X : ndarray of shape (n_samples, n_features, 3) - Data to apply the inverse transform to. + Data to apply the inverse transform to, c.f. :meth:`transform`. Returns ------- @@ -278,11 +312,11 @@ def inverse_transform(self, X): """ check_is_fitted(self) - Xs = check_diagrams(X) + Xs = check_diagrams(X, copy=True) Xs[:, :, :2] *= self.scale_ return Xs - def plot(self, Xt, sample=0, homology_dimensions=None): + def plot(self, Xt, sample=0, homology_dimensions=None, plotly_params=None): """Plot a sample from a collection of persistence diagrams, with homology in multiple dimensions. @@ -299,6 +333,18 @@ def plot(self, Xt, sample=0, homology_dimensions=None): Which homology dimensions to include in the plot. ``None`` is equivalent to passing :attr:`homology_dimensions_`. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"traces"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. + """ if homology_dimensions is None: _homology_dimensions = self.homology_dimensions_ @@ -306,7 +352,9 @@ def plot(self, Xt, sample=0, homology_dimensions=None): _homology_dimensions = homology_dimensions return plot_diagram( - Xt[sample], homology_dimensions=_homology_dimensions) + Xt[sample], homology_dimensions=_homology_dimensions, + plotly_params=plotly_params + ) @adapt_fit_transform_docs @@ -314,29 +362,33 @@ class Filtering(BaseEstimator, TransformerMixin, PlotterMixin): """Filtering of persistence diagrams. Filtering a diagram means discarding all points [b, d, q] representing - topological features whose lifetime d - b is less than or equal to a - cutoff value. Technically, discarded points are replaced by points on the - diagonal (i.e. whose birth and death values coincide), which carry no + non-trivial topological features whose lifetime d - b is less than or equal + to a cutoff value. Points on the diagonal (i.e. for which b and d are + equal) may still appear in the output for padding purposes, but carry no information. + **Important note**: + + - Input collections of persistence diagrams for this transformer must + satisfy certain requirements, see e.g. :meth:`fit`. + Parameters ---------- homology_dimensions : list, tuple, or None, optional, default: ``None`` When set to ``None``, subdiagrams corresponding to all homology - dimensions seen in :meth:`fit` will be filtered. - Otherwise, it contains the homology dimensions (as non-negative - integers) at which filtering should occur. + dimensions seen in :meth:`fit` will be filtered. Otherwise, it contains + the homology dimensions (as non-negative integers) at which filtering + should occur. epsilon : float, optional, default: ``0.01`` The cutoff value controlling the amount of filtering. Attributes ---------- - homology_dimensions_ : list - If `homology_dimensions` is set to ``None``, then this is the list - of homology dimensions seen in :meth:`fit`, sorted in ascending - order. Otherwise, it is a similarly sorted version of - `homology_dimensions`. + homology_dimensions_ : tuple + If `homology_dimensions` is set to ``None``, contains the homology + dimensions seen in :meth:`fit`, sorted in ascending order. Otherwise, + it is a similarly sorted version of `homology_dimensions`. 
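# --- Illustrative sketch of the documented behaviour on a hypothetical toy
# --- input: triples whose lifetime is at most `epsilon` are discarded, and any
# --- remaining diagonal points are only padding so that all samples keep a
# --- common shape.
import numpy as np
from gtda.diagrams import Filtering

X = np.array([[[0., 0.005, 0.], [0., 2., 0.], [1., 1.005, 1.]],
              [[0., 0.002, 0.], [0., 1., 0.], [1., 4., 1.]]])

Xt = Filtering(epsilon=0.01).fit_transform(X)
lifetimes = Xt[:, :, 1] - Xt[:, :, 0]
assert np.all((lifetimes > 0.01) | (lifetimes == 0.))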
See also -------- @@ -347,9 +399,10 @@ class Filtering(BaseEstimator, TransformerMixin, PlotterMixin): _hyperparameters = { 'homology_dimensions': { 'type': (list, tuple, type(None)), - 'of': {'type': int, 'in': Interval(0, np.inf, closed='left')}}, + 'of': {'type': int, 'in': Interval(0, np.inf, closed='left')} + }, 'epsilon': {'type': Real, 'in': Interval(0, np.inf, closed='left')} - } + } def __init__(self, homology_dimensions=None, epsilon=0.01): self.homology_dimensions = homology_dimensions @@ -368,6 +421,9 @@ def fit(self, X, y=None): Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of `X`. y : None There is no need for a target in a transformer, yet the pipeline @@ -383,10 +439,13 @@ def fit(self, X, y=None): self.get_params(), self._hyperparameters) if self.homology_dimensions is None: - self.homology_dimensions_ = [int(dim) for dim in set(X[0, :, 2])] + # Find the unique homology dimensions in the 3D array X passed to + # `fit` assuming that they can all be found in its zero-th entry + homology_dimensions = np.unique(X[0, :, 2]) else: - self.homology_dimensions_ = self.homology_dimensions - self.homology_dimensions_ = sorted(self.homology_dimensions_) + homology_dimensions = self.homology_dimensions + self.homology_dimensions_ = \ + _homology_dimensions_to_sorted_ints(homology_dimensions) return self @@ -399,6 +458,9 @@ def transform(self, X, y=None): Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of X. y : None There is no need for a target in a transformer, yet the pipeline @@ -406,20 +468,19 @@ def transform(self, X, y=None): Returns ------- - Xt : ndarray of shape (n_samples, n_features, 3) + Xt : ndarray of shape (n_samples, n_features_filtered, 3) Filtered persistence diagrams. Only the subdiagrams corresponding to dimensions in :attr:`homology_dimensions_` are filtered. - Discarded points are replaced by points on the diagonal. + ``n_features_filtered`` is less than or equal to ``n_features``. """ check_is_fitted(self) X = check_diagrams(X) - X = _sort(X) Xt = _filter(X, self.homology_dimensions_, self.epsilon) return Xt - def plot(self, Xt, sample=0, homology_dimensions=None): + def plot(self, Xt, sample=0, homology_dimensions=None, plotly_params=None): """Plot a sample from a collection of persistence diagrams, with homology in multiple dimensions. @@ -436,6 +497,18 @@ def plot(self, Xt, sample=0, homology_dimensions=None): Which homology dimensions to include in the plot. ``None`` is equivalent to passing :attr:`homology_dimensions_`. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"traces"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. 
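# --- Illustrative shape of the `plotly_params` argument described above; the
# --- specific trace and layout keys are hypothetical examples of what
# --- `update_traces` and `update_layout` accept.
plotly_params = {
    'traces': {'marker': {'size': 3}},
    'layout': {'title': 'Filtered persistence diagram', 'width': 500}
}
# e.g.: Filtering().fit(X).plot(X_filtered, sample=0, plotly_params=plotly_params)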
+ + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. + """ if homology_dimensions is None: _homology_dimensions = self.homology_dimensions_ @@ -443,4 +516,6 @@ def plot(self, Xt, sample=0, homology_dimensions=None): _homology_dimensions = homology_dimensions return plot_diagram( - Xt[sample], homology_dimensions=_homology_dimensions) + Xt[sample], homology_dimensions=_homology_dimensions, + plotly_params=plotly_params + ) diff --git a/gtda/diagrams/representations.py b/gtda/diagrams/representations.py index 05f8fdf36..6c5a1c132 100644 --- a/gtda/diagrams/representations.py +++ b/gtda/diagrams/representations.py @@ -5,15 +5,17 @@ from numbers import Real import numpy as np -import plotly.graph_objects as gobj from joblib import Parallel, delayed, effective_n_jobs +from plotly.graph_objects import Figure, Scatter +from plotly.subplots import make_subplots from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils import gen_even_slices from sklearn.utils.validation import check_is_fitted from ._metrics import betti_curves, landscapes, heats, \ persistence_images, silhouettes -from ._utils import _subdiagrams, _bin, _calculate_weights +from ._utils import _subdiagrams, _bin, _make_homology_dimensions_mapping, \ + _homology_dimensions_to_sorted_ints from ..base import PlotterMixin from ..plotting import plot_heatmap from ..utils._docs import adapt_fit_transform_docs @@ -21,11 +23,6 @@ from ..utils.validation import validate_params, check_diagrams -def identity(x): - """The identity function.""" - return x - - @adapt_fit_transform_docs class BettiCurve(BaseEstimator, TransformerMixin, PlotterMixin): """:ref:`Betti curves ` of persistence diagrams. @@ -35,6 +32,11 @@ class BettiCurve(BaseEstimator, TransformerMixin, PlotterMixin): considered separately, and their respective Betti curves are obtained by evenly sampling the :ref:`filtration parameter `. + **Important note**: + + - Input collections of persistence diagrams for this transformer must + satisfy certain requirements, see e.g. :meth:`fit`. + Parameters ---------- n_bins : int, optional, default: ``100`` @@ -42,13 +44,13 @@ class BettiCurve(BaseEstimator, TransformerMixin, PlotterMixin): dimension, to sample during :meth:`fit`. n_jobs : int or None, optional, default: ``None`` - The number of jobs to use for the computation. ``None`` means 1 - unless in a :obj:`joblib.parallel_backend` context. ``-1`` means - using all processors. + The number of jobs to use for the computation. ``None`` means 1 unless + in a :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. Attributes ---------- - homology_dimensions_ : list + homology_dimensions_ : tuple Homology dimensions seen in :meth:`fit`, sorted in ascending order. samplings_ : dict @@ -59,7 +61,7 @@ class BettiCurve(BaseEstimator, TransformerMixin, PlotterMixin): See also -------- PersistenceLandscape, PersistenceEntropy, HeatKernel, Amplitude, \ - PairwiseDistance, Silhouette, PersistenceImage,\ + PairwiseDistance, Silhouette, PersistenceImage, \ gtda.homology.VietorisRipsPersistence Notes @@ -72,7 +74,8 @@ class BettiCurve(BaseEstimator, TransformerMixin, PlotterMixin): """ _hyperparameters = { - 'n_bins': {'type': int, 'in': Interval(1, np.inf, closed='left')}} + "n_bins": {"type": int, "in": Interval(1, np.inf, closed="left")} + } def __init__(self, n_bins=100, n_jobs=None): self.n_bins = n_bins @@ -93,6 +96,9 @@ def fit(self, X, y=None): Input data. 
Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of X. y : None There is no need for a target in a transformer, yet the pipeline @@ -105,11 +111,19 @@ def fit(self, X, y=None): """ X = check_diagrams(X) validate_params( - self.get_params(), self._hyperparameters, exclude=['n_jobs']) + self.get_params(), self._hyperparameters, exclude=["n_jobs"]) - self.homology_dimensions_ = sorted(list(set(X[0, :, 2]))) + # Find the unique homology dimensions in the 3D array X passed to `fit` + # assuming that they can all be found in its zero-th entry + homology_dimensions_fit = np.unique(X[0, :, 2]) + self.homology_dimensions_ = \ + _homology_dimensions_to_sorted_ints(homology_dimensions_fit) self._n_dimensions = len(self.homology_dimensions_) - self._samplings, _ = _bin(X, metric='betti', n_bins=self.n_bins) + + self._samplings, _ = _bin( + X, "betti", n_bins=self.n_bins, + homology_dimensions=self.homology_dimensions_ + ) self.samplings_ = {dim: s.flatten() for dim, s in self._samplings.items()} @@ -124,6 +138,9 @@ def transform(self, X, y=None): Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of X. y : None There is no need for a target in a transformer, yet the pipeline @@ -142,20 +159,19 @@ def transform(self, X, y=None): X = check_diagrams(X) Xt = Parallel(n_jobs=self.n_jobs)(delayed(betti_curves)( - _subdiagrams(X, [dim], remove_dim=True)[s], + _subdiagrams(X[s], [dim], remove_dim=True), self._samplings[dim]) for dim in self.homology_dimensions_ - for s in gen_even_slices(X.shape[0], - effective_n_jobs(self.n_jobs))) + for s in gen_even_slices(len(X), effective_n_jobs(self.n_jobs))) Xt = np.concatenate(Xt).\ - reshape(self._n_dimensions, X.shape[0], -1).\ + reshape(self._n_dimensions, len(X), -1).\ transpose((1, 0, 2)) + return Xt - def plot(self, Xt, sample=0, homology_dimensions=None): - """Plot a sample from a collection of Betti curves arranged as in - the output of :meth:`transform`. Include homology in multiple - dimensions. + def plot(self, Xt, sample=0, homology_dimensions=None, plotly_params=None): + """Plot a sample from a collection of Betti curves arranged as in the + output of :meth:`transform`. Include homology in multiple dimensions. Parameters ---------- @@ -169,63 +185,68 @@ def plot(self, Xt, sample=0, homology_dimensions=None): Which homology dimensions to include in the plot. ``None`` means plotting all dimensions present in :attr:`homology_dimensions_`. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"traces"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. 
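Editor's note (not part of the patch): for reviewers of the `betti_curves` call in the hunk above, here is a naive re-implementation of the underlying idea, counting the persistence pairs alive at each sampled filtration value. The helper name is illustrative and this is only a sketch of the definition, not the library's vectorized `betti_curves`.

    import numpy as np

    def betti_curve_naive(births, deaths, samplings):
        """Number of intervals [b, d) containing each sampled filtration value."""
        return np.array([np.sum((births <= t) & (t < deaths)) for t in samplings])

    # Example: two H0 features born at 0, dying at 1 and 3
    print(betti_curve_naive(np.array([0., 0.]), np.array([1., 3.]),
                            np.linspace(0, 3, 7)))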
+ """ check_is_fitted(self) - if homology_dimensions is None: - _homology_dimensions = list(enumerate(self.homology_dimensions_)) - else: - _homology_dimensions = [] - for dim in homology_dimensions: - if dim not in self.homology_dimensions_: - raise ValueError( - f"All homology dimensions must be in " - f"self.homology_dimensions_ which is " - f"{self.homology_dimensions_}. {dim} is not.") - else: - homology_dimensions_arr = np.array( - self.homology_dimensions_) - ix = np.flatnonzero(homology_dimensions_arr == dim)[0] - _homology_dimensions.append((ix, dim)) - - layout = dict( - xaxis1=dict( - title="Filtration parameter", - side="bottom", - type="linear", - ticks="outside", - anchor="x1", - showline=True, - zeroline=True, - showexponent="all", - exponentformat="e" - ), - yaxis1=dict( - title="Betti number", - side="left", - type="linear", - ticks="outside", - anchor="y1", - showline=True, - zeroline=True, - showexponent="all", - exponentformat="e" - ), - plot_bgcolor="white" - ) - fig = gobj.Figure(layout=layout) - fig.update_xaxes(zeroline=True, linewidth=1, linecolor="black", - mirror=False) - fig.update_yaxes(zeroline=True, linewidth=1, linecolor="black", - mirror=False) - - for ix, dim in _homology_dimensions: - fig.add_trace(gobj.Scatter(x=self.samplings_[dim], - y=Xt[sample][ix], - mode='lines', showlegend=True, - name=f"H{int(dim)}")) - - fig.show() + homology_dimensions_mapping = _make_homology_dimensions_mapping( + homology_dimensions, self.homology_dimensions_ + ) + + layout_axes_common = { + "type": "linear", + "ticks": "outside", + "showline": True, + "zeroline": True, + "linewidth": 1, + "linecolor": "black", + "mirror": False, + "showexponent": "all", + "exponentformat": "e" + } + layout = { + "xaxis1": { + "title": "Filtration parameter", + "side": "bottom", + "anchor": "y1", + **layout_axes_common + }, + "yaxis1": { + "title": "Betti number", + "side": "left", + "anchor": "x1", + **layout_axes_common + }, + "plot_bgcolor": "white", + "title": f"Betti curves from diagram {sample}" + } + + fig = Figure(layout=layout) + + for ix, dim in homology_dimensions_mapping: + fig.add_trace(Scatter(x=self.samplings_[dim], + y=Xt[sample][ix], + mode="lines", + showlegend=True, + name=f"H{dim}")) + + # Update traces and layout according to user input + if plotly_params: + fig.update_traces(plotly_params.get("traces", None)) + fig.update_layout(plotly_params.get("layout", None)) + + return fig @adapt_fit_transform_docs @@ -239,6 +260,11 @@ class PersistenceLandscape(BaseEstimator, TransformerMixin, PlotterMixin): landscapes are obtained by evenly sampling the :ref:`filtration parameter `. + **Important note**: + + - Input collections of persistence diagrams for this transformer must + satisfy certain requirements, see e.g. :meth:`fit`. + Parameters ---------- n_layers : int, optional, default: ``1`` @@ -255,7 +281,7 @@ class PersistenceLandscape(BaseEstimator, TransformerMixin, PlotterMixin): Attributes ---------- - homology_dimensions_ : list + homology_dimensions_ : tuple Homology dimensions seen in :meth:`fit`. 
samplings_ : dict @@ -265,23 +291,23 @@ class PersistenceLandscape(BaseEstimator, TransformerMixin, PlotterMixin): See also -------- - BettiCurve, PersistenceEntropy, HeatKernel, Amplitude, \ - PairwiseDistance, Silhouette, PersistenceImage, \ - gtda.homology.VietorisRipsPersistence + BettiCurve, PersistenceEntropy, HeatKernel, Amplitude, PairwiseDistance, \ + Silhouette, PersistenceImage, gtda.homology.VietorisRipsPersistence Notes ----- The samplings in :attr:`samplings_` are in general different between different homology dimensions. This means that the j-th entry of the - k-layer of a persistence landscape in homology dimension q typically - arises from a different parameter value to the j-th entry of a k-layer in + k-layer of a persistence landscape in homology dimension q typically arises + from a different parameter value to the j-th entry of a k-layer in dimension q'. """ _hyperparameters = { - 'n_bins': {'type': int, 'in': Interval(1, np.inf, closed='left')}, - 'n_layers': {'type': int, 'in': Interval(1, np.inf, closed='left')}} + "n_bins": {"type": int, "in": Interval(1, np.inf, closed="left")}, + "n_layers": {"type": int, "in": Interval(1, np.inf, closed="left")} + } def __init__(self, n_layers=1, n_bins=100, n_jobs=None): self.n_layers = n_layers @@ -290,9 +316,9 @@ def __init__(self, n_layers=1, n_bins=100, n_jobs=None): def fit(self, X, y=None): """Store all observed homology dimensions in - :attr:`homology_dimensions_` and, for each dimension separately, - store evenly sample filtration parameter values in :attr:`samplings_`. - Then, return the estimator. + :attr:`homology_dimensions_` and, for each dimension separately, store + evenly sampled filtration parameter values in :attr:`samplings_`. Then, + return the estimator. This method is here to implement the usual scikit-learn API and hence work in pipelines. @@ -303,6 +329,9 @@ def fit(self, X, y=None): Parameters ---------- X : ndarray of shape (n_samples, n_features, 3) Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q).
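Editor's note (not part of the patch): to make the "layer" terminology in the surrounding docstrings concrete, here is a naive sketch of how landscape layers arise from tent functions. The helper name is illustrative; it is not the library's `landscapes` helper and glosses over its sampling and normalization details.

    import numpy as np

    def landscape_layers_naive(births, deaths, samplings, n_layers):
        # One "tent" per (birth, death) pair, evaluated at the sampled values
        tents = np.clip(np.minimum(samplings - births[:, None],
                                   deaths[:, None] - samplings), 0, None)
        # Layer k is, at each sampled value, the (k + 1)-th largest tent height
        return -np.sort(-tents, axis=0)[:n_layers]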
+ It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of X. y : None There is no need for a target in a transformer, yet the pipeline @@ -341,31 +381,34 @@ def transform(self, X, y=None): Returns ------- - Xt : ndarray of shape (n_samples, n_homology_dimensions, \ - n_layers, n_bins) - Persistence lanscapes: one landscape (represented as a - two-dimensional array) per sample and per homology dimension seen - in :meth:`fit`. Each landscape contains a number `n_layers` of - layers. Index i along axis 1 corresponds to the i-th homology - dimension in :attr:`homology_dimensions_`. + Xt : ndarray of shape (n_samples, n_homology_dimensions * n_layers, \ + n_bins) + Persistence landscapes, where ``n_homology_dimensions`` is the + number of distinct homology dimensions seen in :meth:`fit`. + Landscapes coming from different homology dimensions are stacked + for each sample, so layer ``k`` of the landscape in the ``j``-th + homology dimension in :attr:`homology_dimensions_` is + ``X[i, n_homology_dimensions * j + k]``. """ check_is_fitted(self) X = check_diagrams(X) - Xt = Parallel(n_jobs=self.n_jobs)(delayed(landscapes)( - _subdiagrams(X, [dim], remove_dim=True)[s], - self._samplings[dim], - self.n_layers) + Xt = Parallel(n_jobs=self.n_jobs)( + delayed(landscapes)(_subdiagrams(X[s], [dim], remove_dim=True), + self._samplings[dim], + self.n_layers) for dim in self.homology_dimensions_ - for s in gen_even_slices(X.shape[0], - effective_n_jobs(self.n_jobs))) - Xt = np.concatenate(Xt).reshape(self._n_dimensions, X.shape[0], - self.n_layers, self.n_bins).\ - transpose((1, 0, 2, 3)) + for s in gen_even_slices(len(X), effective_n_jobs(self.n_jobs)) + ) + Xt = np.concatenate(Xt).\ + reshape(self._n_dimensions, len(X), self.n_layers, self.n_bins).\ + transpose((1, 0, 2, 3)).\ + reshape(len(X), self._n_dimensions * self.n_layers, self.n_bins) + return Xt - def plot(self, Xt, sample=0, homology_dimensions=None): + def plot(self, Xt, sample=0, homology_dimensions=None, plotly_params=None): """Plot a sample from a collection of persistence landscapes arranged as in the output of :meth:`transform`. Include homology in multiple dimensions. @@ -385,69 +428,83 @@ def plot(self, Xt, sample=0, homology_dimensions=None): ``None`` means plotting all dimensions present in :attr:`homology_dimensions_`. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"traces"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. + """ check_is_fitted(self) - if homology_dimensions is None: - _homology_dimensions = list(enumerate(self.homology_dimensions_)) - else: - _homology_dimensions = [] - for dim in homology_dimensions: - if dim not in self.homology_dimensions_: - raise ValueError( - f"All homology dimensions must be in " - f"self.homology_dimensions_ which is " - f"{self.homology_dimensions_}. 
{dim} is not.") - else: - homology_dimensions_arr = np.array( - self.homology_dimensions_) - ix = np.flatnonzero(homology_dimensions_arr == dim)[0] - _homology_dimensions.append((ix, dim)) - - layout = dict( - xaxis1=dict( - side="bottom", - type="linear", - ticks="outside", - anchor="y1", - showline=True, - zeroline=True, - showexponent="all", - exponentformat="e" - ), - yaxis1=dict( - side="left", - type="linear", - ticks="outside", - anchor="x1", - showline=True, - zeroline=True, - showexponent="all", - exponentformat="e" - ), - plot_bgcolor="white" - ) + homology_dimensions_mapping = _make_homology_dimensions_mapping( + homology_dimensions, self.homology_dimensions_ + ) + + layout_axes_common = { + "type": "linear", + "ticks": "outside", + "showline": True, + "zeroline": True, + "linewidth": 1, + "linecolor": "black", + "mirror": False, + "showexponent": "all", + "exponentformat": "e" + } + layout = { + "xaxis1": { + "side": "bottom", + "anchor": "y1", + **layout_axes_common + }, + "yaxis1": { + "side": "left", + "anchor": "x1", + **layout_axes_common + }, + "plot_bgcolor": "white", + } Xt_sample = Xt[sample] - for ix, dim in _homology_dimensions: - layout_dim = layout.copy() - layout_dim['title'] = "Persistence landscape for homology " + \ - "dimension {}".format(int(dim)) - fig = gobj.Figure(layout=layout_dim) - fig.update_xaxes(zeroline=True, linewidth=1, linecolor="black", - mirror=False) - fig.update_yaxes(zeroline=True, linewidth=1, linecolor="black", - mirror=False) - - n_layers = Xt_sample.shape[1] + n_dims = len(self.homology_dimensions_) + n_layers = Xt_sample.shape[0] // n_dims + subplot_titles = [f"H{dim}" for _, dim in homology_dimensions_mapping] + fig = make_subplots(rows=len(homology_dimensions_mapping), cols=1, + subplot_titles=subplot_titles) + has_many_homology_dim = len(homology_dimensions_mapping) - 1 + for i, (inv_idx, dim) in enumerate(homology_dimensions_mapping): + hom_dim_str = \ + f" ({subplot_titles[i]})" if has_many_homology_dim else "" for layer in range(n_layers): - fig.add_trace(gobj.Scatter(x=self.samplings_[dim], - y=Xt_sample[ix, layer], - mode='lines', showlegend=True, - hoverinfo='none', - name=f"Layer {layer + 1}")) - - fig.show() + fig.add_trace( + Scatter(x=self.samplings_[dim], + y=Xt_sample[inv_idx * n_layers + layer], + mode="lines", + showlegend=True, + hoverinfo="none", + name=f"Layer {layer + 1}{hom_dim_str}"), + row=i + 1, + col=1 + ) + + fig.update_layout( + title_text=f"Landscape representations of diagram {sample}", + **layout.copy() + ) + + # Update traces and layout according to user input + if plotly_params: + fig.update_traces(plotly_params.get("traces", None)) + fig.update_layout(plotly_params.get("layout", None)) + + return fig @adapt_fit_transform_docs @@ -464,9 +521,14 @@ class HeatKernel(BaseEstimator, TransformerMixin, PlotterMixin): diagonal, and the difference between the results of the two convolutions is computed. The result can be thought of as a (multi-channel) raster image. + **Important note**: + + - Input collections of persistence diagrams for this transformer must + satisfy certain requirements, see e.g. :meth:`fit`. + Parameters ---------- - sigma : float, optional default ``1.`` + sigma : float, optional default ``0.1`` Standard deviation for Gaussian kernel. n_bins : int, optional, default: ``100`` @@ -480,7 +542,7 @@ class HeatKernel(BaseEstimator, TransformerMixin, PlotterMixin): Attributes ---------- - homology_dimensions_ : list + homology_dimensions_ : tuple Homology dimensions seen in :meth:`fit`. 
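Editor's note (not part of the patch): since `transform` now flattens the (homology dimension, layer) axes into one, a small indexing helper mirroring the `inv_idx * n_layers + layer` arithmetic used in the rewritten `plot` above may be useful. A sketch under that assumption, with an illustrative helper name:

    def get_landscape_layer(Xt, sample, dim_position, layer, n_layers):
        """Row of the flattened landscape output (shape
        (n_samples, n_homology_dimensions * n_layers, n_bins)) holding layer
        `layer` (0-based) for the homology dimension at position
        `dim_position` in ``homology_dimensions_``."""
        return Xt[sample, dim_position * n_layers + layer]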
samplings_ : dict @@ -507,16 +569,17 @@ class HeatKernel(BaseEstimator, TransformerMixin, PlotterMixin): .. [1] J. Reininghaus, S. Huber, U. Bauer, and R. Kwitt, "A Stable Multi-Scale Kernel for Topological Machine Learning"; *2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)*, - pp. 4741--4748, 2015; doi: `10.1109/CVPR.2015.7299106 + pp. 4741--4748, 2015; `DOI: 10.1109/CVPR.2015.7299106 `_. """ _hyperparameters = { - 'n_bins': {'type': int, 'in': Interval(1, np.inf, closed='left')}, - 'sigma': {'type': Real, 'in': Interval(0, np.inf, closed='neither')}} + "n_bins": {"type": int, "in": Interval(1, np.inf, closed="left")}, + "sigma": {"type": Real, "in": Interval(0, np.inf, closed="neither")} + } - def __init__(self, sigma=1., n_bins=100, n_jobs=None): + def __init__(self, sigma=0.1, n_bins=100, n_jobs=None): self.sigma = sigma self.n_bins = n_bins self.n_jobs = n_jobs @@ -536,6 +599,9 @@ def fit(self, X, y=None): Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of X. y : None There is no need for a target in a transformer, yet the pipeline @@ -548,12 +614,19 @@ def fit(self, X, y=None): """ X = check_diagrams(X) validate_params( - self.get_params(), self._hyperparameters, exclude=['n_jobs']) + self.get_params(), self._hyperparameters, exclude=["n_jobs"]) - self.homology_dimensions_ = sorted(list(set(X[0, :, 2]))) + # Find the unique homology dimensions in the 3D array X passed to `fit` + # assuming that they can all be found in its zero-th entry + homology_dimensions_fit = np.unique(X[0, :, 2]) + self.homology_dimensions_ = \ + _homology_dimensions_to_sorted_ints(homology_dimensions_fit) self._n_dimensions = len(self.homology_dimensions_) + self._samplings, self._step_size = _bin( - X, metric='heat', n_bins=self.n_bins) + X, "heat", n_bins=self.n_bins, + homology_dimensions=self.homology_dimensions_ + ) self.samplings_ = {dim: s.flatten() for dim, s in self._samplings.items()} @@ -569,6 +642,9 @@ def transform(self, X, y=None): Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of X. 
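Editor's note (not part of the patch): for reviewers of the `heats` call in the next hunk, a naive sketch of the symmetrised-Gaussian construction described in the class docstring (Gaussians centred at (b, d) minus their mirror images at (d, b)). The helper name is illustrative; the library's `heats` helper differs in binning and normalization.

    import numpy as np

    def heat_kernel_naive(diagram, sampling, sigma):
        # diagram: (n_points, 2) array of (birth, death) pairs
        xx, yy = np.meshgrid(sampling, sampling)
        img = np.zeros_like(xx)
        for b, d in diagram:
            img += np.exp(-((xx - b) ** 2 + (yy - d) ** 2) / (2 * sigma ** 2))
            img -= np.exp(-((xx - d) ** 2 + (yy - b) ** 2) / (2 * sigma ** 2))
        return img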
y : None There is no need for a target in a transformer, yet the pipeline @@ -587,20 +663,20 @@ def transform(self, X, y=None): check_is_fitted(self) X = check_diagrams(X, copy=True) - Xt = Parallel(n_jobs=self.n_jobs)(delayed( - heats)(_subdiagrams(X, [dim], remove_dim=True)[s], + Xt = Parallel(n_jobs=self.n_jobs, mmap_mode="c")(delayed( + heats)(_subdiagrams(X[s], [dim], remove_dim=True), self._samplings[dim], self._step_size[dim], self.sigma) for dim in self.homology_dimensions_ - for s in gen_even_slices(X.shape[0], - effective_n_jobs(self.n_jobs))) - Xt = np.concatenate(Xt).reshape(self._n_dimensions, X.shape[0], - self.n_bins, self.n_bins).\ + for s in gen_even_slices(len(X), effective_n_jobs(self.n_jobs))) + Xt = np.concatenate(Xt).\ + reshape(self._n_dimensions, len(X), self.n_bins, self.n_bins).\ transpose((1, 0, 2, 3)) return Xt - def plot(self, Xt, sample=0, homology_dimension_ix=0, colorscale='blues'): - """Plot a single channel – corresponding to a given homology - dimension – in a sample from a collection of heat kernel images. + def plot(self, Xt, sample=0, homology_dimension_idx=0, colorscale="blues", + plotly_params=None): + """Plot a single channel –- corresponding to a given homology + dimension -- in a sample from a collection of heat kernel images. Parameters ---------- @@ -612,27 +688,47 @@ def plot(self, Xt, sample=0, homology_dimension_ix=0, colorscale='blues'): sample : int, optional, default: ``0`` Index of the sample in `Xt` to be selected. - homology_dimension_ix : int, optional, default: ``0`` - Index of the channel in the selected sample to be plotted. If - `Xt` is the result of a call to :meth:`transform` and this - index is i, the plot corresponds to the homology dimension given by - the i-th entry in :attr:`homology_dimensions_`. + homology_dimension_idx : int, optional, default: ``0`` + Index of the channel in the selected sample to be plotted. If `Xt` + is the result of a call to :meth:`transform` and this index is i, + the plot corresponds to the homology dimension given by the i-th + entry in :attr:`homology_dimensions_`. - colorscale : str, optional, default: ``'blues'`` + colorscale : str, optional, default: ``"blues"`` Color scale to be used in the heat map. Can be anything allowed by :class:`plotly.graph_objects.Heatmap`. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"trace"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. 
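Editor's note (not part of the patch): the switch to `mmap_mode="c"` above addresses joblib turning large inputs into read-only memmaps when several processes are used. The regression test added later in this diff exercises exactly that scenario, along the lines of:

    import numpy as np
    from gtda.diagrams import HeatKernel

    # Large enough (about 1MB or more) for joblib to memory-map the input
    # when n_jobs > 1
    X = np.linspace(0, 100, 300000)
    diagrams = np.expand_dims(
        np.stack([X, X, np.zeros(len(X))]).transpose(), axis=0
    )

    hk = HeatKernel(sigma=1, n_bins=10, n_jobs=2)
    Xt = hk.fit_transform(diagrams)  # should no longer raise a read-only error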
+ """ check_is_fitted(self) - return plot_heatmap(Xt[sample][homology_dimension_ix], - x=self.samplings_[homology_dimension_ix], - y=self.samplings_[homology_dimension_ix], - colorscale=colorscale) + homology_dimension = self.homology_dimensions_[homology_dimension_idx] + if homology_dimension != np.inf: + homology_dimension = int(homology_dimension) + x = self.samplings_[homology_dimension] + + return plot_heatmap( + Xt[sample][homology_dimension_idx], x=x, y=x[::-1], + colorscale=colorscale, origin="lower", + title=f"Heat kernel representation of diagram {sample} in " + f"homology dimension {homology_dimension}", + plotly_params=plotly_params + ) @adapt_fit_transform_docs class PersistenceImage(BaseEstimator, TransformerMixin, PlotterMixin): - """:ref:`Persistence images ` of persistence + """:ref:`Persistence images ` of persistence diagrams. Based on ideas in [1]_. Given a persistence diagram consisting of @@ -645,9 +741,14 @@ class PersistenceImage(BaseEstimator, TransformerMixin, PlotterMixin): `. The result can be thought of as a (multi-channel) raster image. + **Important note**: + + - Input collections of persistence diagrams for this transformer must + satisfy certain requirements, see e.g. :meth:`fit`. + Parameters ---------- - sigma : float, optional default ``1.`` + sigma : float, optional default ``0.1`` Standard deviation for Gaussian kernel. n_bins : int, optional, default: ``100`` @@ -655,9 +756,10 @@ class PersistenceImage(BaseEstimator, TransformerMixin, PlotterMixin): dimension, to sample during :meth:`fit`. weight_function : callable or None, default: ``None`` - Function mapping the 1D array of persistence values of the points of an - input diagram to a 1D array of weights. ``None`` is equivalent to - passing the identity function. + Function mapping the 1D array of sampled persistence values (see + :attr:`samplings_`) to a 1D array of weights. ``None`` is equivalent to + passing ``numpy.ones_like``. More weight can be given to regions of + high persistence by passing a monotonic function, e.g. the identity. n_jobs : int or None, optional, default: ``None`` The number of jobs to use for the computation. ``None`` means 1 unless @@ -670,13 +772,14 @@ class PersistenceImage(BaseEstimator, TransformerMixin, PlotterMixin): Effective function corresponding to `weight_function`. Set in :meth:`fit`. - homology_dimensions_ : list + homology_dimensions_ : tuple Homology dimensions seen in :meth:`fit`. samplings_ : dict - For each number in `homology_dimensions_`, a discrete sampling of - filtration parameters, calculated during :meth:`fit` according to the - minimum birth and maximum death values observed across all samples. + For each dimension in `homology_dimensions_`, a discrete sampling of + birth parameters and one of persistence values, calculated during + :meth:`fit` according to the minimum birth and maximum death values + observed across all samples. weights_ : dict For each number in `homology_dimensions_`, an array of weights @@ -702,17 +805,18 @@ class PersistenceImage(BaseEstimator, TransformerMixin, PlotterMixin): S. Chepushtanova, E. Hanson, F. Motta, and L. Ziegelmeier, "Persistence Images: A Stable Vector Representation of Persistent Homology"; *Journal of Machine Learning Research 18, 1*, - pp. 218-252, 2017; doi: `10.5555/3122009.3122017 + pp. 218-252, 2017; `DOI: 10.5555/3122009.3122017 `_. 
""" _hyperparameters = { - 'n_bins': {'type': int, 'in': Interval(1, np.inf, closed='left')}, - 'sigma': {'type': Real, 'in': Interval(0, np.inf, closed='neither')}, - 'weight_function': {'type': (types.FunctionType, type(None))}} + "n_bins": {"type": int, "in": Interval(1, np.inf, closed="left")}, + "sigma": {"type": Real, "in": Interval(0, np.inf, closed="neither")}, + "weight_function": {"type": (types.FunctionType, type(None))} + } - def __init__(self, sigma=1., n_bins=100, weight_function=None, + def __init__(self, sigma=0.1, n_bins=100, weight_function=None, n_jobs=None): self.sigma = sigma self.n_bins = n_bins @@ -734,6 +838,9 @@ def fit(self, X, y=None): Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of X. y : None There is no need for a target in a transformer, yet the pipeline @@ -746,21 +853,29 @@ def fit(self, X, y=None): """ X = check_diagrams(X) validate_params( - self.get_params(), self._hyperparameters, exclude=['n_jobs']) + self.get_params(), self._hyperparameters, exclude=["n_jobs"]) if self.weight_function is None: - self.effective_weight_function_ = identity + self.effective_weight_function_ = np.ones_like else: self.effective_weight_function_ = self.weight_function - self.homology_dimensions_ = sorted(list(set(X[0, :, 2]))) + # Find the unique homology dimensions in the 3D array X passed to `fit` + # assuming that they can all be found in its zero-th entry + homology_dimensions_fit = np.unique(X[0, :, 2]) + self.homology_dimensions_ = \ + _homology_dimensions_to_sorted_ints(homology_dimensions_fit) self._n_dimensions = len(self.homology_dimensions_) + self._samplings, self._step_size = _bin( - X, metric='persistence_image', n_bins=self.n_bins) - self.samplings_ = {dim: s.transpose() - for dim, s in self._samplings.items()} - self.weights_ = _calculate_weights(X, self.effective_weight_function_, - self._samplings) + X, "persistence_image", n_bins=self.n_bins, + homology_dimensions=self.homology_dimensions_ + ) + self.weights_ = { + dim: self.effective_weight_function_(samplings_dim[:, 1]) + for dim, samplings_dim in self._samplings.items() + } + self.samplings_ = {dim: s.T for dim, s in self._samplings.items()} return self @@ -774,6 +889,9 @@ def transform(self, X, y=None): Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of X. y : None There is no need for a target in a transformer, yet the pipeline @@ -782,7 +900,7 @@ def transform(self, X, y=None): Returns ------- Xt : ndarray of shape (n_samples, n_homology_dimensions, n_bins, \ - n_bins) + n_bins) Multi-channel raster images: one image per sample and one channel per homology dimension seen in :meth:`fit`. 
Index i along axis 1 corresponds to the i-th homology dimension in @@ -792,25 +910,26 @@ def transform(self, X, y=None): check_is_fitted(self) X = check_diagrams(X, copy=True) - Xt = Parallel(n_jobs=self.n_jobs)( - delayed(persistence_images)(_subdiagrams(X, [dim], - remove_dim=True)[s], - self._samplings[dim], - self._step_size[dim], - self.weights_[dim], - self.sigma) + Xt = Parallel(n_jobs=self.n_jobs, mmap_mode="c")( + delayed(persistence_images)( + _subdiagrams(X[s], [dim], remove_dim=True), + self._samplings[dim], + self._step_size[dim], + self.sigma, + self.weights_[dim] + ) for dim in self.homology_dimensions_ - for s in gen_even_slices(X.shape[0], - effective_n_jobs(self.n_jobs)) - ) - Xt = np.concatenate(Xt).reshape(self._n_dimensions, X.shape[0], - self.n_bins, self.n_bins).\ + for s in gen_even_slices(len(X), effective_n_jobs(self.n_jobs)) + ) + Xt = np.concatenate(Xt).\ + reshape(self._n_dimensions, len(X), self.n_bins, self.n_bins).\ transpose((1, 0, 2, 3)) return Xt - def plot(self, Xt, sample=0, homology_dimension_ix=0, colorscale='blues'): - """Plot a single channel – corresponding to a given homology - dimension – in a sample from a collection of persistence images. + def plot(self, Xt, sample=0, homology_dimension_idx=0, colorscale="blues", + plotly_params=None): + """Plot a single channel -– corresponding to a given homology + dimension -– in a sample from a collection of persistence images. Parameters ---------- @@ -822,23 +941,45 @@ def plot(self, Xt, sample=0, homology_dimension_ix=0, colorscale='blues'): sample : int, optional, default: ``0`` Index of the sample in `Xt` to be selected. - homology_dimension_ix : int, optional, default: ``0`` - Index of the channel in the selected sample to be plotted. If - `Xt` is the result of a call to :meth:`transform` and this - index is i, the plot corresponds to the homology dimension given by - the i-th entry in :attr:`homology_dimensions_`. + homology_dimension_idx : int, optional, default: ``0`` + Index of the channel in the selected sample to be plotted. If `Xt` + is the result of a call to :meth:`transform` and this index is i, + the plot corresponds to the homology dimension given by the i-th + entry in :attr:`homology_dimensions_`. - colorscale : str, optional, default: ``'blues'`` + colorscale : str, optional, default: ``"blues"`` Color scale to be used in the heat map. Can be anything allowed by :class:`plotly.graph_objects.Heatmap`. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"trace"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. 
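Editor's note (not part of the patch): the renamed `homology_dimension_idx` argument and the new `plotly_params` hook are exercised together in the tests below; a minimal call might look like the following (the title string is just an example):

    import numpy as np
    from gtda.diagrams import PersistenceImage

    X = np.array([[[0., 0., 0.], [0., 1., 0.], [2., 3., 0.],
                   [4., 6., 1.], [2., 6., 1.]]])

    pi = PersistenceImage()
    Xt = pi.fit_transform(X)
    fig = pi.plot(Xt, sample=0, homology_dimension_idx=1,
                  plotly_params={"trace": {"colorscale": "viridis"},
                                 "layout": {"title": "H1 persistence image"}})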
+ """ check_is_fitted(self) - samplings_x, samplings_y = self.samplings_[homology_dimension_ix] - return plot_heatmap(Xt[sample][homology_dimension_ix], - x=samplings_x, - y=samplings_y, - colorscale=colorscale) + homology_dimension = self.homology_dimensions_[homology_dimension_idx] + if homology_dimension != np.inf: + homology_dimension = int(homology_dimension) + samplings_x, samplings_y = self.samplings_[homology_dimension] + + return plot_heatmap( + Xt[sample][homology_dimension_idx], + x=samplings_x, + y=samplings_y[::-1], + colorscale=colorscale, + origin="lower", + title=f"Persistence image representation of diagram {sample} in " + f"homology dimension {homology_dimension}", + plotly_params=plotly_params + ) @adapt_fit_transform_docs @@ -849,9 +990,14 @@ class Silhouette(BaseEstimator, TransformerMixin, PlotterMixin): Based on ideas in [1]_. Given a persistence diagram consisting of birth-death-dimension triples [b, d, q], subdiagrams corresponding to distinct homology dimensions are considered separately, and their - respective silhouette by sampling the silhouette function over evenly - spaced locations from appropriate ranges of the :ref:`filtration parameter - `. + respective silhouettes are obtained by sampling the silhouette function + over evenly spaced locations from appropriate ranges of the + :ref:`filtration parameter `. + + **Important note**: + + - Input collections of persistence diagrams for this transformer must + satisfy certain requirements, see e.g. :meth:`fit`. Parameters ---------- @@ -864,13 +1010,13 @@ class Silhouette(BaseEstimator, TransformerMixin, PlotterMixin): dimension, to sample during :meth:`fit`. n_jobs : int or None, optional, default: ``None`` - The number of jobs to use for the computation. ``None`` means 1 - unless in a :obj:`joblib.parallel_backend` context. ``-1`` means - using all processors. + The number of jobs to use for the computation. ``None`` means 1 unless + in a :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. Attributes ---------- - homology_dimensions_ : list + homology_dimensions_ : tuple Homology dimensions seen in :meth:`fit`, sorted in ascending order. samplings_ : dict @@ -886,10 +1032,9 @@ class Silhouette(BaseEstimator, TransformerMixin, PlotterMixin): Notes ----- The samplings in :attr:`samplings_` are in general different between - different homology dimensions. This means that the j-th entry of - a silhouette in homology dimension q typically arises from - a different parameter values to the j-th entry of a curve - in dimension q'. + different homology dimensions. This means that the j-th entry of a + silhouette in homology dimension q typically arises from a different + parameter values to the j-th entry of a curve in dimension q'. References ---------- @@ -897,14 +1042,15 @@ class Silhouette(BaseEstimator, TransformerMixin, PlotterMixin): "Stochastic Convergence of Persistence Landscapes and Silhouettes"; *In Proceedings of the thirtieth annual symposium on Computational Geometry*, Kyoto, Japan, 2014, pp. 474–483; - doi: `10.1145/2582112.2582128 + `DOI: 10.1145/2582112.2582128 `_. 
""" _hyperparameters = { - 'n_bins': {'type': int, 'in': Interval(1, np.inf, closed='left')}, - 'power': {'type': Real, 'in': Interval(0, np.inf, closed='right')}} + "power": {"type": Real, "in": Interval(0, np.inf, closed="right")}, + "n_bins": {"type": int, "in": Interval(1, np.inf, closed="left")} + } def __init__(self, power=1., n_bins=100, n_jobs=None): self.power = power @@ -926,6 +1072,9 @@ def fit(self, X, y=None): Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of X. y : None There is no need for a target in a transformer, yet the pipeline @@ -938,11 +1087,19 @@ def fit(self, X, y=None): """ X = check_diagrams(X) validate_params( - self.get_params(), self._hyperparameters, exclude=['n_jobs']) + self.get_params(), self._hyperparameters, exclude=["n_jobs"]) - self.homology_dimensions_ = sorted(list(set(X[0, :, 2]))) + # Find the unique homology dimensions in the 3D array X passed to `fit` + # assuming that they can all be found in its zero-th entry + homology_dimensions_fit = np.unique(X[0, :, 2]) + self.homology_dimensions_ = \ + _homology_dimensions_to_sorted_ints(homology_dimensions_fit) self._n_dimensions = len(self.homology_dimensions_) - self._samplings, _ = _bin(X, metric='silhouette', n_bins=self.n_bins) + + self._samplings, _ = _bin( + X, "silhouette", n_bins=self.n_bins, + homology_dimensions=self.homology_dimensions_ + ) self.samplings_ = {dim: s.flatten() for dim, s in self._samplings.items()} @@ -957,6 +1114,9 @@ def transform(self, X, y=None): Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). + It is important that, for each possible homology dimension, the + number of triples for which q equals that homology dimension is + constants across the entries of X. y : None There is no need for a target in a transformer, yet the pipeline @@ -975,21 +1135,19 @@ def transform(self, X, y=None): X = check_diagrams(X) Xt = (Parallel(n_jobs=self.n_jobs) - (delayed(silhouettes)(_subdiagrams(X, [dim], remove_dim=True)[s], + (delayed(silhouettes)(_subdiagrams(X[s], [dim], remove_dim=True), self._samplings[dim], power=self.power) for dim in self.homology_dimensions_ - for s in gen_even_slices(X.shape[0], - effective_n_jobs(self.n_jobs)))) + for s in gen_even_slices(len(X), effective_n_jobs(self.n_jobs)))) - Xt = np.concatenate(Xt). \ - reshape(self._n_dimensions, X.shape[0], -1). \ + Xt = np.concatenate(Xt).\ + reshape(self._n_dimensions, len(X), -1).\ transpose((1, 0, 2)) return Xt - def plot(self, Xt, sample=0, homology_dimensions=None): - """Plot a sample from a collection of silhouettes arranged as in - the output of :meth:`transform`. Include homology in multiple - dimensions. + def plot(self, Xt, sample=0, homology_dimensions=None, plotly_params=None): + """Plot a sample from a collection of silhouettes arranged as in the + output of :meth:`transform`. Include homology in multiple dimensions. Parameters ---------- @@ -1003,60 +1161,65 @@ def plot(self, Xt, sample=0, homology_dimensions=None): Which homology dimensions to include in the plot. ``None`` means plotting all dimensions present in :attr:`homology_dimensions_`. 
+ plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"traces"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. + """ check_is_fitted(self) - if homology_dimensions is None: - _homology_dimensions = list(enumerate(self.homology_dimensions_)) - else: - _homology_dimensions = [] - for dim in homology_dimensions: - if dim not in self.homology_dimensions_: - raise ValueError( - f"All homology dimensions must be in " - f"self.homology_dimensions_ which is " - f"{self.homology_dimensions_}. {dim} is not.") - else: - homology_dimensions_arr = np.array( - self.homology_dimensions_) - ix = np.flatnonzero(homology_dimensions_arr == dim)[0] - _homology_dimensions.append((ix, dim)) - - layout = dict( - xaxis1=dict( - title="Filtration parameter", - side="bottom", - type="linear", - ticks="outside", - anchor="x1", - showline=True, - zeroline=True, - showexponent="all", - exponentformat="e" - ), - yaxis1=dict( - side="left", - type="linear", - ticks="outside", - anchor="y1", - showline=True, - zeroline=True, - showexponent="all", - exponentformat="e" - ), - plot_bgcolor="white" - ) - fig = gobj.Figure(layout=layout) - fig.update_xaxes(zeroline=True, linewidth=1, linecolor="black", - mirror=False) - fig.update_yaxes(zeroline=True, linewidth=1, linecolor="black", - mirror=False) - - for ix, dim in _homology_dimensions: - fig.add_trace(gobj.Scatter(x=self.samplings_[dim], - y=Xt[sample][ix], - mode="lines", showlegend=True, - hoverinfo="none", - name=f"H{int(dim)}")) - - fig.show() + homology_dimensions_mapping = _make_homology_dimensions_mapping( + homology_dimensions, self.homology_dimensions_ + ) + + layout_axes_common = { + "type": "linear", + "ticks": "outside", + "showline": True, + "zeroline": True, + "linewidth": 1, + "linecolor": "black", + "mirror": False, + "showexponent": "all", + "exponentformat": "e" + } + layout = { + "xaxis1": { + "title": "Filtration parameter", + "side": "bottom", + "anchor": "y1", + **layout_axes_common + }, + "yaxis1": { + "side": "left", + "anchor": "x1", + **layout_axes_common + }, + "plot_bgcolor": "white", + "title": f"Silhouette representation of diagram {sample}" + } + + fig = Figure(layout=layout) + + for ix, dim in homology_dimensions_mapping: + fig.add_trace(Scatter(x=self.samplings_[dim], + y=Xt[sample][ix], + mode="lines", + showlegend=True, + hoverinfo="none", + name=f"H{dim}")) + + # Update traces and layout according to user input + if plotly_params: + fig.update_traces(plotly_params.get("traces", None)) + fig.update_layout(plotly_params.get("layout", None)) + + return fig diff --git a/gtda/diagrams/tests/test_distance.py b/gtda/diagrams/tests/test_distance.py index 6c41519cb..06b2fd46d 100644 --- a/gtda/diagrams/tests/test_distance.py +++ b/gtda/diagrams/tests/test_distance.py @@ -8,7 +8,7 @@ from gtda.diagrams import PairwiseDistance, Amplitude -X_1 = np.array([ +X1 = np.array([ [[0., 0.36905774, 0], [0., 0.37293977, 0], [0., 0.38995215, 0], @@ -118,7 +118,7 @@ [0., 0., 2], [0., 0., 2]]]) -X_2 = np.array([ +X2 = np.array([ [[0., 0.36905774, 0], [0., 0.37293977, 0], [0., 0.38995215, 0], @@ -210,6 +210,8 @@ [0., 0., 2], [0., 0., 2]]]) +n_homology_dimensions = len(np.unique(X1[:, :, 2])) + X_bottleneck = np.array([ 
[[0, 1, 0.], [0, 0, 0.], @@ -225,84 +227,137 @@ ]) X_bottleneck_res_exp = np.array([ - [1/2, 2], - [1, 0], - [1/4, 2] -]) - + [1/2, 2], + [1, 0], + [1/4, 2] + ]) -def test_not_fitted(): - dd = PairwiseDistance() - da = Amplitude() - - with pytest.raises(NotFittedError): - dd.transform(X_1) +@pytest.mark.parametrize('transformer', [PairwiseDistance(), Amplitude()]) +def test_not_fitted(transformer): with pytest.raises(NotFittedError): - da.transform(X_1) + transformer.transform(X1) parameters_distance = [ ('bottleneck', None), ('wasserstein', {'p': 2, 'delta': 0.1}), ('betti', {'p': 2.1, 'n_bins': 10}), - ('landscape', {'n_bins': 10}), - ('heat', {'n_bins': 10})] + ('landscape', {'p': 2.1, 'n_bins': 10, 'n_layers': 2}), + ('silhouette', {'p': 2.1, 'power': 1.2, 'n_bins': 10}), + ('heat', {'p': 2.1, 'sigma': 0.5, 'n_bins': 10}), + ('persistence_image', + {'p': 2.1, 'sigma': 0.5, 'n_bins': 10}), + ('persistence_image', + {'p': 2.1, 'sigma': 0.5, 'n_bins': 10, 'weight_function': lambda x: x}) + ] @pytest.mark.parametrize(('metric', 'metric_params'), parameters_distance) -@pytest.mark.parametrize('n_jobs', [1, 2, 4]) -@pytest.mark.parametrize('order', [2, None]) +@pytest.mark.parametrize('order', [2., None]) +@pytest.mark.parametrize('n_jobs', [1, 2, -1]) def test_dd_transform(metric, metric_params, order, n_jobs): # X_fit == X_transform dd = PairwiseDistance(metric=metric, metric_params=metric_params, order=order, n_jobs=n_jobs) - X_res = dd.fit_transform(X_1) - assert (X_res.shape[0], X_res.shape[1]) == (X_1.shape[0], X_1.shape[0]) + X_res = dd.fit_transform(X1) + assert (X_res.shape[0], X_res.shape[1]) == (X1.shape[0], X1.shape[0]) + if order is None: + assert X_res.shape[2] == n_homology_dimensions # X_fit != X_transform dd = PairwiseDistance(metric=metric, metric_params=metric_params, order=order, n_jobs=n_jobs) - X_res = dd.fit(X_1).transform(X_2) - assert (X_res.shape[0], X_res.shape[1]) == (X_1.shape[0], X_2.shape[0]) - + X_res = dd.fit(X1).transform(X2) + assert (X_res.shape[0], X_res.shape[1]) == (X2.shape[0], X1.shape[0]) if order is None: - assert X_res.shape[2] == len(np.unique(X_2[:, :, 2])) + assert X_res.shape[2] == n_homology_dimensions # X_fit != X_transform, default metric_params dd = PairwiseDistance(metric=metric, order=order, n_jobs=n_jobs) - X_res = dd.fit(X_1).transform(X_2) - assert (X_res.shape[0], X_res.shape[1]) == (X_1.shape[0], X_2.shape[0]) + X_res = dd.fit(X1).transform(X2) + assert (X_res.shape[0], X_res.shape[1]) == (X2.shape[0], X1.shape[0]) + if order is None: + assert X_res.shape[2] == n_homology_dimensions parameters_amplitude = [ ('bottleneck', None), ('wasserstein', {'p': 2}), ('betti', {'p': 2.1, 'n_bins': 10}), - ('landscape', {'n_bins': 10}), - ('heat', {'n_bins': 10})] + ('landscape', {'p': 2.1, 'n_bins': 10, 'n_layers': 2}), + ('silhouette', {'p': 2.1, 'power': 1.2, 'n_bins': 10}), + ('heat', {'p': 2.1, 'sigma': 0.5, 'n_bins': 10}), + ('persistence_image', + {'p': 2.1, 'sigma': 0.5, 'n_bins': 10}), + ('persistence_image', + {'p': 2.1, 'sigma': 0.5, 'n_bins': 10, 'weight_function': lambda x: x}) + ] @pytest.mark.parametrize(('metric', 'metric_params'), parameters_amplitude) -@pytest.mark.parametrize('n_jobs', [1, 2, 4]) -def test_da_transform(metric, metric_params, n_jobs): - da = Amplitude(metric=metric, metric_params=metric_params, +@pytest.mark.parametrize('order', [None, 2.]) +@pytest.mark.parametrize('n_jobs', [1, 2, -1]) +def test_da_transform(metric, metric_params, order, n_jobs): + n_expected_columns = n_homology_dimensions if order is None 
else 1 + + da = Amplitude(metric=metric, metric_params=metric_params, order=order, n_jobs=n_jobs) - X_res = da.fit_transform(X_1) - assert X_res.shape == (X_1.shape[0], 1) + X_res = da.fit_transform(X1) + assert X_res.shape == (X1.shape[0], n_expected_columns) # X_fit != X_transform - da = Amplitude(metric=metric, metric_params=metric_params, + da = Amplitude(metric=metric, metric_params=metric_params, order=order, n_jobs=n_jobs) - X_res = da.fit(X_1).transform(X_2) - assert X_res.shape == (X_2.shape[0], 1) + X_res = da.fit(X1).transform(X2) + assert X_res.shape == (X2.shape[0], n_expected_columns) @pytest.mark.parametrize(('metric', 'metric_params', 'order'), [('bottleneck', None, None)]) -@pytest.mark.parametrize('n_jobs', [1, 2, 4]) +@pytest.mark.parametrize('n_jobs', [1, 2, -1]) def test_da_transform_bottleneck(metric, metric_params, order, n_jobs): da = Amplitude(metric=metric, metric_params=metric_params, order=order, n_jobs=n_jobs) X_bottleneck_res = da.fit_transform(X_bottleneck) assert_almost_equal(X_bottleneck_res, X_bottleneck_res_exp) + + +@pytest.mark.parametrize('order', [None, 2.]) +@pytest.mark.parametrize('transformer_cls', [PairwiseDistance, Amplitude]) +@pytest.mark.parametrize('Xnew', [X1, X2]) +def test_pi_zero_weight_function(transformer_cls, order, Xnew): + """Test that, if a zero weight function is passed to `metric_params` in + Amplitude or PairwiseDistance when `metric` is 'persistence_image', the + result is zero.""" + metric = 'persistence_image' + metric_params = { + 'sigma': 0.1, 'weight_function': lambda x: x * 0., 'n_bins': 10 + } + transformer = transformer_cls( + metric=metric, metric_params=metric_params, order=order + ) + X_res = transformer.fit(X1).transform(Xnew) + + assert np.array_equal(X_res, np.zeros_like(X_res)) + + +@pytest.mark.parametrize('metric', ['heat', 'persistence_image']) +@pytest.mark.parametrize('transformer_cls', [Amplitude, PairwiseDistance]) +def test_large_hk_pi_parallel(metric, transformer_cls): + """Test that Amplitude and PairwiseDistance do not break with a read-only + error when the input array is at least 1MB, the metric is either 'heat' + or 'persistence_image', and more than 1 process is used (triggering + joblib's use of memmaps).""" + X = np.linspace(0, 100, 300000) + n_bins = 10 + diagrams = np.expand_dims( + np.stack([X, X, np.zeros(len(X))]).transpose(), axis=0 + ) + + transformer = transformer_cls( + metric=metric, metric_params={'sigma': 1, 'n_bins': n_bins}, n_jobs=2 + ) + Xt = transformer.fit_transform(diagrams) + + assert_almost_equal(Xt, np.zeros_like(Xt)) diff --git a/gtda/diagrams/tests/test_features_representations.py b/gtda/diagrams/tests/test_features_representations.py index 0c036d684..961c55d65 100644 --- a/gtda/diagrams/tests/test_features_representations.py +++ b/gtda/diagrams/tests/test_features_representations.py @@ -4,114 +4,206 @@ import numpy as np import plotly.io as pio import pytest -from hypothesis import given +from hypothesis import given, settings from hypothesis.extra.numpy import arrays, array_shapes from hypothesis.strategies import floats, integers from numpy.testing import assert_almost_equal from sklearn.exceptions import NotFittedError -from gtda.diagrams import PersistenceEntropy, BettiCurve, \ - PersistenceLandscape, HeatKernel, PersistenceImage, Silhouette +from gtda.diagrams import PersistenceEntropy, NumberOfPoints, \ + ComplexPolynomial, BettiCurve, PersistenceLandscape, HeatKernel, \ + PersistenceImage, Silhouette pio.renderers.default = 'plotly_mimetype' -X = np.array([[[0., 
1., 0.], [2., 3., 0.], [4., 6., 1.], [2., 6., 1.]]]) +X = np.array([[[0., 0., 0.], [0., 1., 0.], [2., 3., 0.], + [4., 6., 1.], [2., 6., 1.]]]) +line_plots_traces_params = {"mode": "lines+markers"} +heatmap_trace_params = {"colorscale": "viridis"} +layout_params = {"title": "New title"} -def test_not_fitted(): - with pytest.raises(NotFittedError): - PersistenceEntropy().transform(X) +@pytest.mark.parametrize('transformer', + [PersistenceEntropy(), NumberOfPoints(), + ComplexPolynomial(), BettiCurve(), + PersistenceLandscape(), HeatKernel(), + PersistenceImage(), Silhouette()]) +def test_not_fitted(transformer): with pytest.raises(NotFittedError): - BettiCurve().transform(X) + transformer.transform(X) - with pytest.raises(NotFittedError): - PersistenceLandscape().transform(X) - with pytest.raises(NotFittedError): - HeatKernel().transform(X) +@pytest.mark.parametrize('transformer', + [HeatKernel(), PersistenceImage()]) +@pytest.mark.parametrize('hom_dim_idx', [0, 1]) +def test_fit_transform_plot_one_hom_dim(transformer, hom_dim_idx): + plotly_params = \ + {"trace": heatmap_trace_params, "layout": layout_params} + transformer.fit_transform_plot( + X, sample=0, homology_dimension_idx=hom_dim_idx, + plotly_params=plotly_params + ) - with pytest.raises(NotFittedError): - PersistenceImage().transform(X) - with pytest.raises(NotFittedError): - Silhouette().transform(X) +@pytest.mark.parametrize('transformer', + [BettiCurve(), PersistenceLandscape(), Silhouette()]) +@pytest.mark.parametrize('hom_dims', [None, (0,), (1,), (0, 1)]) +def test_fit_transform_plot_many_hom_dims(transformer, hom_dims): + plotly_params = \ + {"traces": line_plots_traces_params, "layout": layout_params} + transformer.fit_transform_plot( + X, sample=0, homology_dimensions=hom_dims, plotly_params=plotly_params + ) -@pytest.mark.parametrize('hom_dim_ix', [0, 1]) -def test_fit_transform_plot_one_hom_dim(hom_dim_ix): - HeatKernel().fit_transform_plot( - X, sample=0, homology_dimension_ix=hom_dim_ix) - PersistenceImage().fit_transform_plot( - X, sample=0, homology_dimension_ix=hom_dim_ix) +@pytest.mark.parametrize('transformer', + [HeatKernel(), PersistenceImage(), BettiCurve(), + PersistenceLandscape(), Silhouette()]) +def test_fit_transform_plot_infinite_hom_dims(transformer): + X_infinite_hom_dim = X.copy() + X_infinite_hom_dim[:, :, 2] = np.inf + transformer.fit_transform_plot(X_infinite_hom_dim, sample=0) -@pytest.mark.parametrize('hom_dims', [None, (0,), (1,), (0, 1)]) -def test_fit_transform_plot_many_hom_dims(hom_dims): - BettiCurve().fit_transform_plot( - X, sample=0, homology_dimensions=hom_dims) - PersistenceLandscape().fit_transform_plot( - X, sample=0, homology_dimensions=hom_dims) - Silhouette().fit_transform_plot( - X, sample=0, homology_dimensions=hom_dims) +@pytest.mark.parametrize('transformer', + [BettiCurve(), PersistenceLandscape(), Silhouette()]) +def test_fit_transform_plot_wrong_hom_dims(transformer): + with pytest.raises(ValueError): + transformer.fit_transform_plot(X, sample=0, homology_dimensions=(2,)) -def test_pe_transform(): - pe = PersistenceEntropy() - diagram_res = np.array([[0.69314718, 0.63651417]]) +@pytest.mark.parametrize('n_jobs', [1, 2, -1]) +def test_pe_transform(n_jobs): + pe = PersistenceEntropy(n_jobs=n_jobs) + diagram_res = np.array([[1., 0.91829583405]]) assert_almost_equal(pe.fit_transform(X), diagram_res) - -@pytest.mark.parametrize('n_bins', range(10, 51, 10)) -def test_bc_transform_shape(n_bins): - bc = BettiCurve(n_bins=n_bins) + pe_normalize = PersistenceEntropy(normalize=True) + 
diagram_res = np.array([[1., 0.355245321276]]) + assert_almost_equal(pe_normalize.fit_transform(X), diagram_res) + + +@pytest.mark.parametrize('n_jobs', [1, 2, -1]) +def test_nop_transform(n_jobs): + nop = NumberOfPoints(n_jobs=n_jobs) + diagram_res = np.array([[2, 2]]) + + assert_almost_equal(nop.fit_transform(X), diagram_res) + + +@pytest.mark.parametrize('n_coefficients', [2, [2, 2]]) +def test_cp_transform(n_coefficients): + cp = ComplexPolynomial(n_coefficients=n_coefficients, polynomial_type='R') + diagram_res = np.array([[-2., -3., -4., 2., -6., -28., -12., 36.]]) + assert_almost_equal(cp.fit_transform(X), diagram_res) + + cp.set_params(polynomial_type='S') + diagram_res = np.array( + [[-np.sqrt(2/13), -3 / (2 * np.sqrt(13)), + (-3 - np.sqrt(13)) / np.sqrt(26), 1 / np.sqrt(13), + -2 * (np.sqrt(2/13) + 1 / np.sqrt(5)), -np.sqrt(8 / (13 * 5)) * 7, + -3 * (np.sqrt(2/13) + 2 / np.sqrt(5)), np.sqrt(8 / (13 * 5)) * 9]] + ) + assert_almost_equal(cp.fit_transform(X), diagram_res) + + cp.set_params(polynomial_type='T') + u_01, v_01 = (np.cos(1) - np.sin(1), + np.cos(1) + np.sin(1)) + u_02, v_02 = (np.cos(np.sqrt(13)) - np.sin(np.sqrt(13)), + np.cos(np.sqrt(13)) + np.sin(np.sqrt(13))) + u_11, v_11 = (np.cos(np.sqrt(52)) - np.sin(np.sqrt(52)), + np.cos(np.sqrt(52)) + np.sin(np.sqrt(52))) + u_12, v_12 = (np.cos(np.sqrt(40)) - np.sin(np.sqrt(40)), + np.cos(np.sqrt(40)) + np.sin(np.sqrt(40))) + diagram_res = np.array( + [[-1/2 * (u_01 + u_02), 1/4 * (u_01 * u_02 - v_01 * v_02), + -1/2 * (v_01 + v_02), 1/4 * (u_01 * v_02 + u_02 * v_01), + -(u_11 + 2 * u_12), 2 * (u_11 * u_12 - v_11 * v_12), + -(v_11 + 2 * v_12), 2 * (u_11 * v_12 + u_12 * v_11)]] + ) + assert_almost_equal(cp.fit_transform(X), diagram_res) + + +@pytest.mark.parametrize('n_bins', list(range(10, 51, 10))) +@pytest.mark.parametrize('n_jobs', [1, 2, -1]) +def test_bc_transform_shape(n_bins, n_jobs): + bc = BettiCurve(n_bins=n_bins, n_jobs=n_jobs) X_res = bc.fit_transform(X) assert X_res.shape == (1, bc._n_dimensions, n_bins) -@pytest.mark.parametrize('n_bins', range(10, 51, 10)) -@pytest.mark.parametrize('n_layers', range(1, 10)) -def test_pl_transform_shape(n_bins, n_layers): - pl = PersistenceLandscape(n_bins=n_bins, n_layers=n_layers) +@pytest.mark.parametrize('n_bins', list(range(10, 51, 10))) +@pytest.mark.parametrize('n_layers', list(range(1, 10))) +@pytest.mark.parametrize('n_jobs', [1, 2, -1]) +def test_pl_transform_shape(n_bins, n_layers, n_jobs): + pl = PersistenceLandscape(n_bins=n_bins, n_layers=n_layers, n_jobs=n_jobs) X_res = pl.fit_transform(X) - assert X_res.shape == (1, pl._n_dimensions, n_layers, n_bins) + assert X_res.shape == (1, pl._n_dimensions * n_layers, n_bins) + + +@pytest.mark.parametrize('n_jobs', [1, 2, -1]) +def test_pi_zero_weight_function(n_jobs): + pi = PersistenceImage(weight_function=lambda x: x * 0., n_jobs=n_jobs) + X_res = pi.fit_transform(X) + assert np.array_equal(X_res, np.zeros_like(X_res)) @given(X=arrays(dtype=np.float, unique=True, - elements=integers(min_value=-1e10, max_value=1e6), + elements=floats(min_value=-10, max_value=10), shape=array_shapes(min_dims=1, max_dims=1, min_side=11))) def test_pi_null(X): """Test that, if one trivial diagram (all pts on the diagonal) is provided, - (along with a non-trivial one), then its pi is null""" - pi = PersistenceImage(sigma=1, n_bins=10) + along with a non-trivial one, then its persistence image is null""" + n_bins = 10 X = np.append(X, 1 + X[-1]) - diagrams = np.expand_dims(np.stack([X, X, - np.zeros((X.shape[0],), - dtype=int)]).transpose(), - 
axis=0) + diagrams = np.expand_dims( + np.stack([X, X, np.zeros(len(X))]).transpose(), axis=0 + ) diagrams = np.repeat(diagrams, 2, axis=0) diagrams[1, :, 1] += 1 + sigma = (np.max(diagrams[:, :, 1] - np.min(diagrams[:, :, 0]))) / 2 + pi = PersistenceImage(sigma=sigma, n_bins=n_bins) + assert_almost_equal(pi.fit_transform(diagrams)[0], 0) @given(pts=arrays(dtype=np.float, unique=True, elements=floats(allow_nan=False, allow_infinity=False, - min_value=-1e10, - max_value=1e6), + min_value=-10, + max_value=10), shape=(20, 2))) def test_pi_positive(pts): - pi = PersistenceImage(sigma=1) - diagrams = np.expand_dims(np.concatenate([ - np.sort(pts, axis=1), np.zeros((pts.shape[0], 1))], - axis=1), axis=0) + diagrams = np.expand_dims( + np.concatenate([np.sort(pts, axis=1), np.zeros((pts.shape[0], 1))], + axis=1), + axis=0 + ) + sigma = (np.max(diagrams[:, :, 1] - np.min(diagrams[:, :, 0]))) / 2 + pi = PersistenceImage(sigma=sigma) assert np.all(pi.fit_transform(diagrams) >= 0.) -def test_silhouette_transform(): - sht = Silhouette(n_bins=31, power=1.) +def test_large_pi_null_parallel(): + """Test that pi is computed correctly when the input array is at least 1MB + and more than 1 process is used, triggering joblib's use of memmaps""" + X = np.linspace(0, 100, 300000) + pi = PersistenceImage(sigma=1, n_bins=10, n_jobs=2) + diagrams = np.expand_dims( + np.stack([X, X, np.zeros(len(X))]).transpose(), axis=0 + ) + diagrams = np.repeat(diagrams, 2, axis=0) + diagrams[1, :, 1] += 1 + + assert_almost_equal(pi.fit_transform(diagrams)[0], 0) + + +@pytest.mark.parametrize('n_jobs', [1, 2, -1]) +def test_silhouette_transform(n_jobs): + sht = Silhouette(n_bins=31, power=1., n_jobs=n_jobs) X_sht_res = np.array([0., 0.05, 0.1, 0.15, 0.2, 0.25, 0.2, 0.15, 0.1, 0.05, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0., 0.05, 0.1, 0.15, 0.2, 0.25, 0.2, 0.15, 0.1, 0.05, 0.]) @@ -119,9 +211,10 @@ def test_silhouette_transform(): assert_almost_equal(sht.fit_transform(X)[0][0], X_sht_res) -def test_silhouette_big_order(): +@pytest.mark.parametrize('n_jobs', [1, 2, -1]) +def test_silhouette_big_order(n_jobs): diagrams = np.array([[[0, 2, 0], [1, 4, 0]]]) - sht_10 = Silhouette(n_bins=41, power=10.) 
+ sht_10 = Silhouette(n_bins=41, power=10., n_jobs=n_jobs) X_sht_res = np.array([0., 0.00170459, 0.00340919, 0.00511378, 0.00681837, 0.00852296, 0.01022756, 0.01193215, 0.01363674, 0.01534133, 0.01704593, 0.11363674, 0.21022756, @@ -136,24 +229,32 @@ def test_silhouette_big_order(): assert_almost_equal(sht_10.fit_transform(diagrams)[0][0], X_sht_res) +@pytest.mark.parametrize('transformer_cls', [HeatKernel, PersistenceImage]) +@pytest.mark.parametrize('n_jobs', [1, 2, -1]) +def test_all_pts_the_same(transformer_cls, n_jobs): + X = np.zeros((1, 4, 3)) + X_res = transformer_cls(n_jobs=n_jobs).fit_transform(X) + assert np.array_equal(X_res, np.zeros_like(X_res)) + + pts_gen = arrays( dtype=np.float, elements=floats(allow_nan=False, allow_infinity=False, min_value=1., - max_value=10), + max_value=10.), shape=(1, 20, 2), unique=True -) + ) dims_gen = arrays( dtype=np.int, elements=integers(min_value=0, max_value=3), shape=(1, 20, 1) -) + ) def _validate_distinct(X): - """Check if, in X, there is any persistence X for which all births + """Check if, in X, there is any persistence diagram for which all births and deaths are equal.""" unique_values = [np.unique(x[:, 0:2]) for x in X] if np.any([len(u) < 2 for u in unique_values]): @@ -172,49 +273,52 @@ def get_input(pts, dims): return X -def test_all_pts_the_same(): - X = np.zeros((1, 4, 3)) - hk = HeatKernel(sigma=1) - with pytest.raises(IndexError): - _ = hk.fit(X).transform(X) - - +@pytest.mark.parametrize('n_jobs', [1, 2]) +@settings(deadline=None) @given(pts_gen, dims_gen) -def test_hk_shape(pts, dims): +def test_hk_shape(n_jobs, pts, dims): n_bins = 10 - x = get_input(pts, dims) + X = get_input(pts, dims) + sigma = (np.max(X[:, :, :2]) - np.min(X[:, :, :2])) / 2 - hk = HeatKernel(sigma=1, n_bins=n_bins) + hk = HeatKernel(sigma=sigma, n_bins=n_bins, n_jobs=n_jobs) num_dimensions = len(np.unique(dims)) - x_t = hk.fit(x).transform(x) + X_t = hk.fit_transform(X) - assert x_t.shape == (x.shape[0], num_dimensions, n_bins, n_bins) + assert X_t.shape == (X.shape[0], num_dimensions, n_bins, n_bins) @given(pts_gen, dims_gen) def test_hk_positive(pts, dims): - """ We expect the points above the PD-diagonal to be non-negative, - (up to a numerical error)""" + """We expect the points above the PD-diagonal to be non-negative (up to a + numerical error)""" n_bins = 10 - hk = HeatKernel(sigma=1, n_bins=n_bins) + X = get_input(pts, dims) + sigma = (np.max(X[:, :, :2]) - np.min(X[:, :, :2])) / 2 - x = get_input(pts, dims) - x_t = hk.fit(x).transform(x) + hk = HeatKernel(sigma=sigma, n_bins=n_bins) + X_t = hk.fit_transform(X) - assert np.all((np.tril(x_t[:, :, ::-1, :]) + 1e-13) >= 0.) + assert np.all((np.tril(X_t[:, :, ::-1, :]) + 1e-13) >= 0.) +@pytest.mark.parametrize('transformer_cls', [HeatKernel, PersistenceImage]) @given(pts_gen, dims_gen) -def test_hk_big_sigma(pts, dims): - """We expect that with a huge sigma, the diagrams are so diluted that - they are almost 0. Effectively, verifies that the smoothing is applied.""" +def test_hk_pi_big_sigma(transformer_cls, pts, dims): + """We expect that with a huge sigma, the diagrams are so diluted that they + are almost 0. Effectively, verifies that the smoothing is applied.""" n_bins = 10 - x = get_input(pts, dims) + X = get_input(pts, dims) + # To make the test less flaky, it helps to set all homology dimensions equal + X[:, :, 2] = 0.
+ max_difference = np.max(X[:, :, :2]) - np.min(X[:, :, :2]) + sigma = 100 * (max_difference) - hk = HeatKernel(sigma=100*np.max(np.abs(x)), n_bins=n_bins) - x_t = hk.fit_transform(x) + hk = transformer_cls(sigma=sigma, n_bins=n_bins) + X_t = hk.fit_transform(X) - assert np.all(np.abs(x_t) <= 1e-4) + max_hk_abs_value = np.max(np.abs(X_t)) + assert max_hk_abs_value <= 1e-3 @given(pts_gen) @@ -224,13 +328,30 @@ def test_hk_with_diag_points(pts): n_bins = 10 hk = HeatKernel(sigma=1, n_bins=n_bins) - x = get_input(pts, np.zeros((pts.shape[0], pts.shape[1], 1))) + X = get_input(pts, np.zeros((pts.shape[0], pts.shape[1], 1))) diag_points = np.array([[[2, 2, 0], [3, 3, 0], [7, 7, 0]]]) - x_with_diag_points = np.concatenate([x, diag_points], axis=1) + X_with_diag_points = np.concatenate([X, diag_points], axis=1) + + hk = hk.fit(X_with_diag_points) + + X_t, X_with_diag_points_t = [hk.transform(X_) + for X_ in [X, X_with_diag_points]] - hk = hk.fit(x_with_diag_points) + assert_almost_equal(X_with_diag_points_t, X_t, decimal=13) + + +def test_large_hk_shape_parallel(): + """Test that HeatKernel returns something of the right shape when the input + array is at least 1MB and more than 1 process is used, triggering joblib's + use of memmaps""" + X = np.linspace(0, 100, 300000) + n_bins = 10 + diagrams = np.expand_dims( + np.stack([X, X, np.zeros(len(X))]).transpose(), axis=0 + ) - x_t, x_with_diag_points_t = [hk.transform(x_) - for x_ in [x, x_with_diag_points]] + hk = HeatKernel(sigma=1, n_bins=n_bins, n_jobs=2) + num_dimensions = 1 + x_t = hk.fit_transform(diagrams) - assert_almost_equal(x_with_diag_points_t, x_t, decimal=13) + assert x_t.shape == (diagrams.shape[0], num_dimensions, n_bins, n_bins) diff --git a/gtda/diagrams/tests/test_preprocessing.py b/gtda/diagrams/tests/test_preprocessing.py index f145a9b03..325620283 100644 --- a/gtda/diagrams/tests/test_preprocessing.py +++ b/gtda/diagrams/tests/test_preprocessing.py @@ -10,6 +10,8 @@ from gtda.diagrams import ForgetDimension, Scaler, Filtering pio.renderers.default = 'plotly_mimetype' +plotly_params = {"trace": {"marker_size": 20}, + "layout": {"title": "New title"}} X_1 = np.array([[[0., 0.36905774, 0], [0., 0.37293977, 0], @@ -227,16 +229,22 @@ def test_not_fitted(): def test_forg_fit_transform_plot(): - ForgetDimension().fit_transform_plot(X_1, sample=0) + ForgetDimension().fit_transform_plot( + X_1, sample=0, plotly_params=plotly_params + ) @pytest.mark.parametrize('hom_dims', [None, (0,), (1,)]) def test_fit_transform_plot(hom_dims): Scaler().fit_transform_plot( - X_1, sample=0, homology_dimensions=hom_dims) + X_1, sample=0, homology_dimensions=hom_dims, + plotly_params=plotly_params + ) Filtering().fit_transform_plot( - X_1, sample=0, homology_dimensions=hom_dims) + X_1, sample=0, homology_dimensions=hom_dims, + plotly_params=plotly_params + ) @pytest.mark.parametrize('X', [X_1, X_2]) @@ -246,9 +254,18 @@ def test_forg_transform_shape(X): assert X_res.shape == X.shape -parameters_sc = [('wasserstein', {'p': 2}), - ('betti', {'n_bins': 10}), - ('bottleneck', None)] +parameters_sc = [ + ('bottleneck', None), + ('wasserstein', {'p': 2}), + ('betti', {'p': 2.1, 'n_bins': 10}), + ('landscape', {'p': 2.1, 'n_bins': 10, 'n_layers': 2}), + ('silhouette', {'p': 2.1, 'power': 1.2, 'n_bins': 10}), + ('heat', {'p': 2.1, 'sigma': 0.5, 'n_bins': 10}), + ('persistence_image', + {'p': 2.1, 'sigma': 0.5, 'n_bins': 10}), + ('persistence_image', + {'p': 2.1, 'sigma': 0.5, 'n_bins': 10, 'weight_function': lambda x: x}) + ] @pytest.mark.parametrize(('metric', 
'metric_params'), parameters_sc) @@ -269,6 +286,28 @@ def test_filt_transform_zero(X): assert_almost_equal(X_res, X[:, [0], :]) +def total_lifetimes_in_dims(X, dims): + return sum([ + np.sum(np.diff(X[X[:, :, 2] == dim], axis=1)[:, 0]) for dim in dims + ]) + + +@pytest.mark.parametrize('homology_dimensions', [None, (0, 1, 2), (0,), (1,), + (2,), (0, 1), (0, 2), (1, 2)]) +def test_filt_transform_unfiltered_hom_dims(homology_dimensions): + filt = Filtering(epsilon=2., homology_dimensions=homology_dimensions) + X_1_res = filt.fit_transform(X_1) + if homology_dimensions is None: + unfiltered_hom_dims = [] + else: + unfiltered_hom_dims = [ + dim for dim in filt.homology_dimensions_ + if dim not in homology_dimensions + ] + assert total_lifetimes_in_dims(X_1, unfiltered_hom_dims) == \ + total_lifetimes_in_dims(X_1_res, unfiltered_hom_dims) + + lifetimes_1 = X_1[:, :, 1] - X_1[:, :, 0] epsilons_1 = np.linspace(np.min(lifetimes_1), np.max(lifetimes_1), num=3) @@ -277,7 +316,7 @@ def test_filt_transform_zero(X): def test_filt_transform(epsilon): filt = Filtering(epsilon=epsilon) X_res_1 = filt.fit_transform(X_1) - assert X_res_1.shape == X_1.shape + assert X_res_1.shape[1] <= X_1.shape[1] lifetimes_res_1 = X_res_1[:, :, 1] - X_res_1[:, :, 0] assert not ((lifetimes_res_1 > 0.) & (lifetimes_res_1 <= epsilon)).any() diff --git a/gtda/externals/__init__.py b/gtda/externals/__init__.py index 00bffafff..b39577ddf 100644 --- a/gtda/externals/__init__.py +++ b/gtda/externals/__init__.py @@ -3,6 +3,8 @@ from .modules.gtda_bottleneck import bottleneck_distance from .modules.gtda_wasserstein import wasserstein_distance +from .modules.gtda_collapser import flag_complex_collapse_edges_dense, \ + flag_complex_collapse_edges_sparse, flag_complex_collapse_edges_coo from .python import ripser, SparseRipsComplex, CechComplex, CubicalComplex, \ PeriodicCubicalComplex, SimplexTree, WitnessComplex, StrongWitnessComplex @@ -11,10 +13,13 @@ 'wasserstein_distance', 'ripser', 'SparseRipsComplex', - 'CechComplex', + 'CechComplex', 'CubicalComplex', 'PeriodicCubicalComplex', 'SimplexTree', 'WitnessComplex', - 'StrongWitnessComplex' + 'StrongWitnessComplex', + 'flag_complex_collapse_edges_dense', + 'flag_complex_collapse_edges_sparse', + 'flag_complex_collapse_edges_coo' ] diff --git a/gtda/externals/bindings/collapser_bindings.cpp b/gtda/externals/bindings/collapser_bindings.cpp new file mode 100644 index 000000000..19a7f7a6a --- /dev/null +++ b/gtda/externals/bindings/collapser_bindings.cpp @@ -0,0 +1,136 @@ +/****************************************************************************** + * Description: gudhi's collapser interfacing with pybind11 + * License: AGPL3 + *****************************************************************************/ + +#include + +#include +#include +#include +#include + +namespace py = pybind11; + +/* GUDHI Collapser required types */ +using Filtration_value = float; +using Vertex_handle = int32_t; +using Filtered_edge = + std::tuple; +using Filtered_edge_list = std::vector; + +/* Sparse matrix input types */ +using Sparse_matrix = Eigen::SparseMatrix; +using triplet_vec = Eigen::Triplet; + +/* COO data input types */ +using Row_idx = std::vector; +using Col_idx = std::vector; +using Filtration_values = std::vector; +using COO_data = std::tuple; + +/* Dense matrix input types */ +using Distance_matrix_np = py::array_t; + +/* constants */ +const Filtration_value filtration_max = + std::numeric_limits::infinity(); + +/* Generates COO sparse matrix data from a filtered edge list + * This function 
is called after computing edge collapse + */ +static COO_data gen_coo_matrix(Filtered_edge_list& collapsed_edges) { + Row_idx row; + Col_idx col; + Filtration_values data; + + /* allocate memory beforehand */ + row.reserve(collapsed_edges.size()); + col.reserve(collapsed_edges.size()); + data.reserve(collapsed_edges.size()); + + for (auto& t : collapsed_edges) { + row.push_back(std::get<0>(t)); + col.push_back(std::get<1>(t)); + data.push_back(std::get<2>(t)); + } + + return COO_data(row, col, data); +} + +PYBIND11_MODULE(gtda_collapser, m) { + using namespace pybind11::literals; + + m.doc() = "Collapser bindings for GUDHI implementation"; + m.def("flag_complex_collapse_edges_sparse", + [](Sparse_matrix& sm, Filtration_value thresh = filtration_max) { + Filtered_edge_list graph; + + /* Convert from sparse format to Filtered_edge_list */ + /* Applying threshold to the input data */ + int size = sm.outerSize(); + for (size_t k = 0; k < size; ++k) + for (Eigen::SparseMatrix::InnerIterator it(sm, k); + it; ++it) { + if (it.value() <= thresh) + graph.push_back(Filtered_edge(it.row(), it.col(), it.value())); + } + + /* Start collapser */ + auto vec_triples = + Gudhi::collapse::flag_complex_collapse_edges(graph); + + return gen_coo_matrix(vec_triples); + }, + "sm"_a, "thresh"_a = filtration_max, + "Implicitly constructs a flag complex from edges, " + "collapses edges while preserving the persistent homology"); + + m.def("flag_complex_collapse_edges_coo", + [](py::array_t& row_, py::array_t& col_, + py::array_t& data_, + Filtration_value thresh = filtration_max) { + Filtered_edge_list graph; + + Vertex_handle* row = (Vertex_handle*)row_.request().ptr; + Vertex_handle* col = (Vertex_handle*)col_.request().ptr; + Filtration_value* data = (Filtration_value*)data_.request().ptr; + + /* Convert from COO input format to Filtered_edge_list */ + /* Applying threshold to the input data */ + int size = data_.size(); + for (size_t k = 0; k < size; ++k) + if (data[k] <= thresh) + graph.push_back(Filtered_edge(row[k], col[k], data[k])); + + /* Start collapser */ + auto vec_triples = + Gudhi::collapse::flag_complex_collapse_edges(graph); + + return gen_coo_matrix(vec_triples); + }, + "row"_a, "column"_a, "data"_a, "thresh"_a = filtration_max, + "Implicitly constructs a flag complex from edges, " + "collapses edges while preserving the persistent homology"); + + m.def("flag_complex_collapse_edges_dense", + [](Distance_matrix_np& dm, Filtration_value thresh = filtration_max) { + Filtered_edge_list graph; + + /* Convert from dense format to Filtered edge list */ + /* Applying threshold to the input data */ + for (size_t i = 0; i < dm.shape(0); i++) + for (size_t j = 0; j < dm.shape(1); j++) + if (j > i && (*(dm.data(i, j)) <= thresh)) + graph.push_back(Filtered_edge(i, j, *(dm.data(i, j)))); + + /* Start collapser */ + auto vec_triples = + Gudhi::collapse::flag_complex_collapse_edges(graph); + + return gen_coo_matrix(vec_triples); + }, + "dm"_a, "thresh"_a = filtration_max, + "Implicitly constructs a flag complex from edges, " + "collapses edges while preserving the persistent homology"); +} diff --git a/gtda/externals/bindings/periodic_cubical_complex_bindings.cpp b/gtda/externals/bindings/periodic_cubical_complex_bindings.cpp index 92e62c862..a105e62d8 100644 --- a/gtda/externals/bindings/periodic_cubical_complex_bindings.cpp +++ b/gtda/externals/bindings/periodic_cubical_complex_bindings.cpp @@ -48,6 +48,8 @@ PYBIND11_MODULE(gtda_periodic_cubical_complex, m) { 
Bitmap_cubical_complex_periodic_boundary_conditions_base< double>>*, bool>()) + .def("compute_persistence", + &Persistent_cohomology_interface_inst::compute_persistence) .def("get_persistence", &Persistent_cohomology_interface_inst::get_persistence) .def("betti_numbers", @@ -58,4 +60,3 @@ PYBIND11_MODULE(gtda_periodic_cubical_complex, m) { &Persistent_cohomology_interface_inst::intervals_in_dimension); m.doc() = "GUDHI periocal cubical complex function interfacing"; } - diff --git a/gtda/externals/bindings/persistent_cohomology_bindings.cpp b/gtda/externals/bindings/persistent_cohomology_bindings.cpp index 1e3b804d5..a2db284bf 100644 --- a/gtda/externals/bindings/persistent_cohomology_bindings.cpp +++ b/gtda/externals/bindings/persistent_cohomology_bindings.cpp @@ -20,6 +20,8 @@ PYBIND11_MODULE(gtda_persistent_cohomology, m) { .def(py::init*>()) .def(py::init*, bool>()) + .def("compute_persistence", + &Persistent_cohomology_interface_inst::compute_persistence) .def("get_persistence", &Persistent_cohomology_interface_inst::get_persistence) .def("betti_numbers", @@ -28,5 +30,5 @@ PYBIND11_MODULE(gtda_persistent_cohomology, m) { &Persistent_cohomology_interface_inst::persistent_betti_numbers) .def("intervals_in_dimension", &Persistent_cohomology_interface_inst::intervals_in_dimension); - m.doc() = "GUDHI persistant homology interfacing"; + m.doc() = "GUDHI persistent homology interfacing"; } diff --git a/gtda/externals/bindings/ripser_bindings.cpp b/gtda/externals/bindings/ripser_bindings.cpp index 502bd1c5e..e28b5fe05 100644 --- a/gtda/externals/bindings/ripser_bindings.cpp +++ b/gtda/externals/bindings/ripser_bindings.cpp @@ -7,8 +7,10 @@ #include // PYBIND11 +#include #include #include +#include namespace py = pybind11; @@ -32,24 +34,24 @@ PYBIND11_MODULE(gtda_ripser, m) { .def_readwrite("num_edges", &ripserResults::num_edges); m.def("rips_dm", - [](std::vector D, int N, int modulus, int dim_max, + [](py::array_t& D, int N, int modulus, int dim_max, float threshold, int do_cocycles) { - ripserResults ret = - rips_dm(&D[0], N, modulus, dim_max, threshold, do_cocycles); + ripserResults ret = rips_dm((float*)D.request().ptr, N, modulus, + dim_max, threshold, do_cocycles); return ret; }, "D"_a, "N"_a, "modulus"_a, "dim_max"_a, "threshold"_a, "do_cocycles"_a, "ripser distance matrix"); m.def("rips_dm_sparse", - [](std::vector I, std::vector J, std::vector V, + [](py::array_t& I, py::array_t& J, py::array_t& V, int NEdges, int N, int modulus, int dim_max, float threshold, int do_cocycles) { ripserResults ret = - rips_dm_sparse(&I[0], &J[0], &V[0], NEdges, N, modulus, dim_max, - threshold, do_cocycles); + rips_dm_sparse((int*)I.request().ptr, (int*)J.request().ptr, + (float*)V.request().ptr, NEdges, N, modulus, + dim_max, threshold, do_cocycles); return ret; }, "I"_a, "J"_a, "V"_a, "NEdges"_a, "N"_a, "modulus"_a, "dim_max"_a, "threshold"_a, "do_cocycles"_a, "ripser sparse distance matrix"); } - diff --git a/gtda/externals/bindings/simplex_tree_bindings.cpp b/gtda/externals/bindings/simplex_tree_bindings.cpp index 230c6ef3d..502a27a62 100644 --- a/gtda/externals/bindings/simplex_tree_bindings.cpp +++ b/gtda/externals/bindings/simplex_tree_bindings.cpp @@ -3,8 +3,8 @@ * License: Apache 2.0 *****************************************************************************/ - #include + #include #include @@ -41,8 +41,26 @@ PYBIND11_MODULE(gtda_simplex_tree, m) { const std::vector&, double>( &simplex_tree_interface_inst::insert_simplex_and_subfaces)) - .def("get_filtration", 
&simplex_tree_interface_inst::get_filtration) - .def("get_skeleton", &simplex_tree_interface_inst::get_skeleton) + .def("get_filtration", + [](simplex_tree_interface_inst& self) + -> std::vector { + std::vector tmp; + for (auto elem = self.get_filtration_iterator_begin(); + elem != self.get_filtration_iterator_end(); elem++) + tmp.push_back(self.get_simplex_and_filtration(*elem)); + return tmp; + }) + .def("get_skeleton", + [](simplex_tree_interface_inst& self, size_t dim) + -> std::vector< + simplex_tree_interface_inst::Simplex_and_filtration> { + std::vector + tmp; + for (auto elem = self.get_skeleton_iterator_begin(dim); + elem != self.get_skeleton_iterator_end(dim); elem++) + tmp.push_back(self.get_simplex_and_filtration(*elem)); + return tmp; + }) .def("get_star", &simplex_tree_interface_inst::get_star) .def("get_cofaces", &simplex_tree_interface_inst::get_cofaces) .def("expansion", &simplex_tree_interface_inst::expansion) @@ -59,6 +77,8 @@ PYBIND11_MODULE(gtda_simplex_tree, m) { py::class_( m, "Simplex_tree_persistence_interface") .def(py::init()) + .def("compute_persistence", + &Persistent_cohomology_interface_inst::compute_persistence) .def("get_persistence", &Persistent_cohomology_interface_inst::get_persistence) .def("betti_numbers", diff --git a/gtda/externals/eigen b/gtda/externals/eigen new file mode 160000 index 000000000..25424d91f --- /dev/null +++ b/gtda/externals/eigen @@ -0,0 +1 @@ +Subproject commit 25424d91f60a9f858e7dc1c7936021cc1dd72019 diff --git a/gtda/externals/gudhi-devel b/gtda/externals/gudhi-devel index a5476516e..a265b030e 160000 --- a/gtda/externals/gudhi-devel +++ b/gtda/externals/gudhi-devel @@ -1 +1 @@ -Subproject commit a5476516e0d1d56842a15c5a79af0df3c1e50c5b +Subproject commit a265b030effa9b34a99a09b0e1b5073e8bb50cb6 diff --git a/gtda/externals/pybind11 b/gtda/externals/pybind11 new file mode 160000 index 000000000..8fa70e748 --- /dev/null +++ b/gtda/externals/pybind11 @@ -0,0 +1 @@ +Subproject commit 8fa70e74838e93f0db38417f3590ba792489b958 diff --git a/gtda/externals/python/cubical_complex_interface.py b/gtda/externals/python/cubical_complex_interface.py index c42d3e50b..3ceab973c 100644 --- a/gtda/externals/python/cubical_complex_interface.py +++ b/gtda/externals/python/cubical_complex_interface.py @@ -100,9 +100,9 @@ def persistence(self, homology_coeff_field=11, min_persistence=0): True) persistence_result = [] if self.pcohptr is not None: - persistence_result = self.pcohptr.get_persistence( - homology_coeff_field, min_persistence) - + self.pcohptr.compute_persistence(homology_coeff_field, + min_persistence) + persistence_result = self.pcohptr.get_persistence() return persistence_result def betti_numbers(self): diff --git a/gtda/externals/python/periodic_cubical_complex_interface.py b/gtda/externals/python/periodic_cubical_complex_interface.py index 01cddaaf5..6df2a4d95 100644 --- a/gtda/externals/python/periodic_cubical_complex_interface.py +++ b/gtda/externals/python/periodic_cubical_complex_interface.py @@ -100,9 +100,9 @@ def persistence(self, homology_coeff_field=11, min_persistence=0): True) persistence_result = [] if self.pcohptr is not None: - persistence_result = \ - self.pcohptr.get_persistence(homology_coeff_field, + self.pcohptr.compute_persistence(homology_coeff_field, min_persistence) + persistence_result = self.pcohptr.get_persistence() return persistence_result def betti_numbers(self): diff --git a/gtda/externals/python/ripser_interface.py b/gtda/externals/python/ripser_interface.py index 8b28b688d..638844a80 100644 --- 
a/gtda/externals/python/ripser_interface.py +++ b/gtda/externals/python/ripser_interface.py @@ -1,48 +1,57 @@ -from scipy import sparse +import gc +from warnings import warn + import numpy as np +from scipy import sparse +from scipy.spatial.distance import squareform from sklearn.metrics.pairwise import pairwise_distances -from ..modules import gtda_ripser, gtda_ripser_coeff + +from ..modules import gtda_ripser, gtda_ripser_coeff, gtda_collapser + + +def _lexsort_coo_data(row, col, data): + lex_sort_idx = np.lexsort((col, row)) + row, col, data = \ + row[lex_sort_idx], col[lex_sort_idx], data[lex_sort_idx] + return row, col, data -def DRFDM(DParam, maxHomDim, thresh=-1, coeff=2, do_cocycles=1): +def DRFDM(DParam, maxHomDim, thresh=-1, coeff=2, do_cocycles=0): if coeff == 2: ret = gtda_ripser.rips_dm(DParam, DParam.shape[0], coeff, maxHomDim, thresh, do_cocycles) else: ret = gtda_ripser_coeff.rips_dm(DParam, DParam.shape[0], coeff, maxHomDim, thresh, do_cocycles) - ret_rips = {} - ret_rips.update({"births_and_deaths_by_dim": ret.births_and_deaths_by_dim}) - ret_rips.update({"num_edges": ret.num_edges}) - return ret_rips + return ret -def DRFDMSparse(I, J, V, N, maxHomDim, thresh=-1, coeff=2, do_cocycles=1): +def DRFDMSparse(I, J, V, N, maxHomDim, thresh=-1, coeff=2, do_cocycles=0): if coeff == 2: ret = gtda_ripser.rips_dm_sparse(I, J, V, I.size, N, coeff, maxHomDim, thresh, do_cocycles) else: ret = gtda_ripser_coeff.rips_dm_sparse(I, J, V, I.size, N, coeff, maxHomDim, thresh, do_cocycles) - ret_rips = {} - ret_rips.update({"births_and_deaths_by_dim": ret.births_and_deaths_by_dim}) - ret_rips.update({"num_edges": ret.num_edges}) - return ret_rips + return ret def dpoint2pointcloud(X, i, metric): - """ - Return the distance from the ith point in a Euclidean point cloud - to the rest of the points + """Return the distance from the ith point in a Euclidean point cloud + to the rest of the points. + Parameters ---------- X: ndarray (n_samples, n_features) A numpy array of data + i: int The index of the point from which to return all distances + metric: string or callable The metric to use when calculating distance between instances in a feature array + """ ds = pairwise_distances(X, X[i, :][None, :], metric=metric).flatten() ds[i] = 0 @@ -50,26 +59,32 @@ def dpoint2pointcloud(X, i, metric): def get_greedy_perm(X, n_perm=None, metric="euclidean"): - """ - Compute a furthest point sampling permutation of a set of points + """Compute a furthest point sampling permutation of a set of points + Parameters ---------- X: ndarray (n_samples, n_features) A numpy array of either data or distance matrix + n_perm: int Number of points to take in the permutation + metric: string or callable The metric to use when calculating distance between instances in a feature array + Returns ------- idx_perm: ndarray(n_perm) Indices of points in the greedy permutation + lambdas: ndarray(n_perm) Covering radii at different points + dperm2all: ndarray(n_perm, n_samples) Distances from points in the greedy permutation to points in the original point set + """ if not n_perm: n_perm = X.shape[0] @@ -91,66 +106,102 @@ def get_greedy_perm(X, n_perm=None, metric="euclidean"): ds = np.minimum(ds, dperm2all[-1]) lambdas[-1] = np.max(ds) dperm2all = np.array(dperm2all) - return (idx_perm, lambdas, dperm2all) + return idx_perm, lambdas, dperm2all def ripser(X, maxdim=1, thresh=np.inf, coeff=2, metric="euclidean", - n_perm=None): - """Compute persistence diagrams for X data array. 
If X is not a distance - matrix, it will be converted to a distance matrix using the chosen metric. + n_perm=None, collapse_edges=False): + """Compute persistence diagrams for X data array using Ripser [1]_. + + If X is not a distance matrix, it will be converted to a distance matrix + using the chosen metric. Parameters ---------- - X: ndarray (n_samples, n_features) - A numpy array of either data or distance matrix. - Can also be a sparse distance matrix of type scipy.sparse - maxdim: int, optional, default 1 - Maximum homology dimension computed. Will compute all dimensions - lower than and equal to this value. - For 1, H_0 and H_1 will be computed. - thresh: float, default infinity - Maximum distances considered when constructing filtration. - If infinity, compute the entire filtration. - coeff: int prime, default 2 + X : ndarray of shape (n_samples, n_features) + A numpy array of either data or distance matrix. Can also be a sparse + distance matrix of type scipy.sparse + + maxdim : int, optional, default: ``1`` + Maximum homology dimension computed. Will compute all dimensions lower + than and equal to this value. For 1, H_0 and H_1 will be computed. + + thresh : float, optional, default: ``numpy.inf`` + Maximum distances considered when constructing filtration. If + ``numpy.inf``, compute the entire filtration. + + coeff : int prime, optional, default: ``2`` Compute homology with coefficients in the prime field Z/pZ for p=coeff. - metric: string or callable + + metric : string or callable, optional, default: ``'euclidean'`` The metric to use when calculating distance between instances in a - feature array. If metric is a string, it must be one of the options - specified in pairwise_distances, including "euclidean", "manhattan", - or "cosine". Alternatively, if metric is a callable function, it is - called on each pair of instances (rows) and the resulting value - recorded. The callable should take two arrays from X as input and - return a value indicating the distance between them. + feature array. If set to ``'precomputed'``, input data is interpreted + as a distance matrix or as the adjacency matrix of a weighted undirected + graph. If a string, it must be one of the options allowed by + :func:`scipy.spatial.distance.pdist` for its metric parameter, or a + metric listed in + :obj:`sklearn.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`, including + ``'euclidean'``, ``'manhattan'`` or ``'cosine'``. If a callable, it + should take pairs of vectors (1D arrays) as input and, for each two + vectors in a pair, it should return a scalar indicating the + distance/dissimilarity between them. + + n_perm : int or None, optional, default: ``None`` + The number of points to subsample in a "greedy permutation", or a + furthest point sampling of the points. These points will be used in + lieu of the full point cloud for a faster computation, at the expense + of some accuracy, which can be bounded as a maximum bottleneck distance + to all diagrams on the original point set. + + collapse_edges : bool, optional, default: ``False`` + Whether to use the edge collapse algorithm as described in [2]_ prior + to calling ``ripser``. - n_perm: int - The number of points to subsample in a "greedy permutation," - or a furthest point sampling of the points.
These points - will be used in lieu of the full point cloud for a faster - computation, at the expense of some accuracy, which can - be bounded as a maximum bottleneck distance to all diagrams - on the original point set Returns ------- A dictionary holding all of the results of the computation - {'dgms': list (size maxdim) of ndarray (n_pairs, 2) - A list of persistence diagrams, one for each dimension less - than maxdim. Each diagram is an ndarray of size (n_pairs, 2) - with the first column representing the birth time and the - second column representing the death time of each pair. - 'num_edges': int - The number of edges added during the computation - 'dperm2all': ndarray(n_samples, n_samples) or ndarray (n_perm, n_samples) if n_perm - The distance matrix used in the computation if n_perm is none. - Otherwise, the distance from all points in the permutation to - all points in the dataset - 'idx_perm': ndarray(n_perm) if n_perm > 0 - Index into the original point cloud of the points used - as a subsample in the greedy permutation - 'r_cover': float - Covering radius of the subsampled points. - If n_perm <= 0, then the full point cloud was used and this is 0 + { + 'dgms': list (size maxdim) of ndarray (n_pairs, 2) + A list of persistence diagrams, one for each dimension less + than maxdim. Each diagram is an ndarray of size (n_pairs, 2) + with the first column representing the birth time and the + second column representing the death time of each pair. + 'num_edges': int + The number of edges added during the computation + 'dperm2all': None or ndarray (n_perm, n_samples) + ``None`` if n_perm is ``None``. Otherwise, the distance from all + points in the permutation to all points in the dataset. + 'idx_perm': ndarray(n_perm) if n_perm > 0 + Index into the original point cloud of the points used + as a subsample in the greedy permutation + 'r_cover': float + Covering radius of the subsampled points. + If n_perm <= 0, then the full point cloud was used and this is 0 } + Notes + ----- + `Ripser `_ is used as a C++ backend + for computing Vietoris–Rips persistent homology. Python bindings were + modified for performance from the `ripser.py + `_ package. + + `GUDHI `_ is used as a C++ backend + for the edge collapse algorithm described in [2]_. + + References + ---------- + .. [1] U. Bauer, "Ripser: efficient computation of Vietoris–Rips + persistence barcodes", 2019; `arXiv:1908.02518 + `_. + + .. [2] J.-D. Boissonnat and S. Pritam, "Edge Collapse and Persistence of + Flag Complexes"; in *36th International Symposium on Computational + Geometry (SoCG 2020)*, pp. 19:1–19:15, Schloss + Dagstuhl-Leibniz–Zentrum für Informatik, 2020; + `DOI: 10.4230/LIPIcs.SoCG.2020.19 + `_. 
+ """ if n_perm and sparse.issparse(X): raise Exception( @@ -159,7 +210,7 @@ def ripser(X, maxdim=1, thresh=np.inf, coeff=2, metric="euclidean", if n_perm and n_perm > X.shape[0]: raise Exception( "Number of points in greedy permutation is greater" - + " than number of points in the point cloud" + " than number of points in the point cloud" ) if n_perm and n_perm < 0: raise Exception( @@ -180,41 +231,79 @@ def ripser(X, maxdim=1, thresh=np.inf, coeff=2, metric="euclidean", dm = X else: dm = pairwise_distances(X, metric=metric) - dperm2all = dm - - n_points = dm.shape[0] - if not sparse.issparse(dm) and np.sum(np.abs(dm.diagonal()) > 0) > 0: - # If any of the diagonal elements are nonzero, - # convert to sparse format, because currently - # that's the only format that handles nonzero - # births - dm = sparse.coo_matrix(dm) - - if sparse.issparse(dm): - coo = dm.tocoo() + dperm2all = None + + n_points = max(dm.shape) + sort_coo = True + if (dm.diagonal() != 0).any(): + if collapse_edges: + warn("Edge collapses are not supported when any of the diagonal " + "entries are non-zero. Computing persistent homology without " + "using edge collapse.") + collapse_edges = False + if not sparse.issparse(dm): + # If any of the diagonal elements are nonzero, convert to sparse + # format, because currently that's the only format that handles + # nonzero births + dm = sparse.coo_matrix(dm) + sort_coo = False + + if sparse.issparse(dm) or collapse_edges: + if collapse_edges: + sort_coo = True + if not sparse.issparse(dm): + row, col, data = \ + gtda_collapser.flag_complex_collapse_edges_dense(dm, + thresh) + else: + coo = dm.tocoo() + row, col, data = \ + gtda_collapser.flag_complex_collapse_edges_coo(coo.row, + coo.col, + coo.data, + thresh) + else: + if sparse.isspmatrix_coo(dm): + # If the matrix is already COO, we need to order the row and + # column indices lexicographically to avoid errors. 
See + # https://github.com/scikit-tda/ripser.py/issues/103 + row, col, data = dm.row, dm.col, dm.data + else: + coo = dm.tocoo() + row, col, data = coo.row, coo.col, coo.data + sort_coo = False + + if sort_coo: + row, col, data = _lexsort_coo_data(np.asarray(row), + np.asarray(col), + np.asarray(data)) + res = DRFDMSparse( - coo.row.astype(dtype=np.int32, order="C"), - coo.col.astype(dtype=np.int32, order="C"), - np.array(coo.data, dtype=np.float32, order="C"), + row.astype(dtype=np.int32, order="C"), + col.astype(dtype=np.int32, order="C"), + np.array(data, dtype=np.float32, order="C"), n_points, maxdim, thresh, - coeff, - ) + coeff + ) else: - I, J = np.meshgrid(np.arange(n_points), np.arange(n_points)) - DParam = np.array(dm[I > J], dtype=np.float32) + # Only consider strict upper diagonal + DParam = squareform(dm, checks=False).astype(np.float32) + # Run garbage collector to free up memory taken by `dm` + del dm + gc.collect() res = DRFDM(DParam, maxdim, thresh, coeff) # Unwrap persistence diagrams - dgms = res["births_and_deaths_by_dim"] + dgms = res.births_and_deaths_by_dim for dim in range(len(dgms)): N = int(len(dgms[dim]) / 2) dgms[dim] = np.reshape(np.array(dgms[dim]), [N, 2]) ret = { "dgms": dgms, - "num_edges": res["num_edges"], + "num_edges": res.num_edges, "dperm2all": dperm2all, "idx_perm": idx_perm, "r_cover": r_cover, diff --git a/gtda/externals/python/simplex_tree_interface.py b/gtda/externals/python/simplex_tree_interface.py index 92bb83770..6edcb5700 100644 --- a/gtda/externals/python/simplex_tree_interface.py +++ b/gtda/externals/python/simplex_tree_interface.py @@ -16,8 +16,7 @@ class SimplexTree: # Fake constructor that does nothing but documenting the constructor def __init__(self): - """SimplexTree constructor. - """ + "SimplexTree constructor." self.thisptr = Simplex_tree_interface_full_featured() self.pcohptr = None @@ -28,22 +27,20 @@ def __del__(self): del self.pcohptr def __is_defined(self): - """Returns true if SimplexTree pointer is not NULL. - """ + "Return True if SimplexTree pointer is not NULL." if self.thisptr is not None: return True return False def __is_persistence_defined(self): - """Returns true if Persistence pointer is not NULL. - """ + """Return True if Persistence pointer is not NULL.""" if self.pcohptr is not None: return True return False def filtration(self, simplex): - """This function returns the filtration value for a given N-simplex in - this simplicial complex, or +infinity if it is not in the complex. + """Return the filtration value for a given N-simplex in this simplicial + complex, or +infinity if it is not in the complex. :param simplex: The N-simplex, represented by a list of vertex. :type simplex: list of int. :returns: The simplicial complex filtration value. @@ -52,8 +49,8 @@ def filtration(self, simplex): return self.thisptr.simplex_filtration(simplex) def assign_filtration(self, simplex, filtration): - """This function assigns the simplicial complex filtration value for a - given N-simplex. + """Assign the simplicial complex filtration value for a given + N-simplex. :param simplex: The N-simplex, represented by a list of vertex. :type simplex: list of int. :param filtration: The simplicial complex filtration value. @@ -62,8 +59,7 @@ def assign_filtration(self, simplex, filtration): self.thisptr.assign_simplex_filtration(simplex, filtration) def initialize_filtration(self): - """This function initializes and sorts the simplicial complex - filtration vector. + """Initialize and sort the simplicial complex filtration vector. .. 
note:: This function must be launched before :func:`persistence()`, @@ -77,23 +73,21 @@ def initialize_filtration(self): self.thisptr.initialize_filtration() def num_vertices(self): - """This function returns the number of vertices of the simplicial - complex. + """Return the number of vertices of the simplicial complex. :returns: The simplicial complex number of vertices. :rtype: int """ return self.thisptr.num_vertices() def num_simplices(self): - """This function returns the number of simplices of the simplicial - complex. + """Return the number of simplices of the simplicial complex. :returns: the simplicial complex number of simplices. :rtype: int """ return self.thisptr.num_simplices() def dimension(self): - """This function returns the dimension of the simplicial complex. + """Return the dimension of the simplicial complex. :returns: the simplicial complex dimension. :rtype: int .. note:: @@ -107,15 +101,14 @@ def dimension(self): return self.thisptr.dimension() def upper_bound_dimension(self): - """This function returns a valid dimension upper bound of the - simplicial complex. + """Return a valid dimension upper bound of the simplicial complex. :returns: an upper bound on the dimension of the simplicial complex. :rtype: int """ return self.thisptr.upper_bound_dimension() def set_dimension(self, dimension): - """This function sets the dimension of the simplicial complex. + """Set the dimension of the simplicial complex. :param dimension: The new dimension value. :type dimension: int. .. note:: @@ -130,8 +123,7 @@ def set_dimension(self, dimension): self.thisptr.set_dimension(dimension) def find(self, simplex): - """This function returns if the N-simplex was found in the simplicial - complex or not. + """Return if the N-simplex was found in the simplicial complex or not. :param simplex: The N-simplex to find, represented by a list of vertex. :type simplex: list of int. :returns: true if the simplex was found, false otherwise. @@ -141,10 +133,10 @@ def find(self, simplex): return self.thisptr.find_simplex(csimplex) def insert(self, simplex, filtration=0.0): - """This function inserts the given N-simplex and its subfaces with the - given filtration value (default value is '0.0'). If some of those - simplices are already present with a higher filtration value, their - filtration value is lowered. + """Insert the given N-simplex and its subfaces with the given + filtration value (default value is '0.0'). If some of those simplices + are already present with a higher filtration value, their filtration + value is lowered. :param simplex: The N-simplex to insert, represented by a list of vertex. :type simplex: list of int. @@ -159,8 +151,7 @@ def insert(self, simplex, filtration=0.0): filtration) def get_filtration(self): - """This function returns a list of all simplices with their given - filtration values. + """Return a list of all simplices with their given filtration values. :returns: The simplices sorted by increasing filtration values. :rtype: list of tuples(simplex, filtration) """ @@ -172,8 +163,7 @@ def get_filtration(self): return ct def get_skeleton(self, dimension): - """This function returns the (simplices of the) skeleton of a maximum - given dimension. + """Return the (simplices of the) skeleton of a maximum given dimension. :param dimension: The skeleton dimension value. :type dimension: int. :returns: The (simplices of the) skeleton of a maximum dimension. 
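The SimplexTree wrapper documented in this file is easiest to follow end-to-end. The snippet below is a minimal sketch only: the simplices and filtration values are made up for illustration, and it assumes the wrapper is importable as gtda.externals.SimplexTree, as listed in the package __init__ earlier in this diff. Its persistence() method drives the compute_persistence/get_persistence pair added to the C++ bindings.

    from gtda.externals import SimplexTree

    st = SimplexTree()
    # Insert a filled triangle and one extra edge; all subfaces are added too.
    st.insert([0, 1, 2], filtration=1.)
    st.insert([0, 3], filtration=2.)

    print(st.num_simplices())   # 9: four vertices, four edges, one triangle
    print(st.get_skeleton(1))   # (simplex, filtration) pairs up to dimension 1

    # Per the note on initialize_filtration, sort the filtration before
    # computing persistence.
    st.initialize_filtration()
    diagram = st.persistence(homology_coeff_field=11, min_persistence=0)
    print(st.betti_numbers())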
@@ -187,7 +177,7 @@ def get_skeleton(self, dimension): return ct def get_star(self, simplex): - """This function returns the star of a given N-simplex. + """Return the star of a given N-simplex. :param simplex: The N-simplex, represented by a list of vertex. :type simplex: list of int. :returns: The (simplices of the) star of a simplex. @@ -202,8 +192,7 @@ def get_star(self, simplex): return ct def get_cofaces(self, simplex, codimension): - """This function returns the cofaces of a given N-simplex with a - given codimension. + """Return the cofaces of a given N-simplex with a given codimension. :param simplex: The N-simplex, represented by a list of vertex. :type simplex: list of int. :param codimension: The codimension. If codimension = 0, all cofaces @@ -221,13 +210,13 @@ def get_cofaces(self, simplex, codimension): return ct def remove_maximal_simplex(self, simplex): - """This function removes a given maximal N-simplex from the simplicial - complex. + """Remove a given maximal N-simplex from the simplicial complex. :param simplex: The N-simplex, represented by a list of vertex. :type simplex: list of int. .. note:: Be aware that removing is shifting data in a flat_map - (:func:`initialize_filtration()` to be done). + (:func:`initialize_filtration()` + to be done). .. note:: The dimension of the simplicial complex may be lower after calling remove_maximal_simplex than it was before. However, @@ -267,8 +256,8 @@ def prune_above_filtration(self, filtration): return self.thisptr.prune_above_filtration(filtration) def expansion(self, max_dim): - """Expands the Simplex_tree containing only its one skeleton - until dimension max_dim. + """Expand the Simplex_tree containing only its one skeleton until + dimension max_dim. The expanded simplicial complex until dimension :math:`d` attached to a graph :math:`G` is the maximal simplicial complex of dimension at most :math:`d` admitting the graph :math:`G` as @@ -283,8 +272,8 @@ def expansion(self, max_dim): self.thisptr.expansion(max_dim) def make_filtration_non_decreasing(self): - """This function ensures that each simplex has a higher filtration - value than its faces by increasing the filtration values. + """Ensure that each simplex has a higher filtration value than its + faces by increasing the filtration values. :returns: True if any filtration value was modified, False if the filtration was already non-decreasing. :rtype: bool @@ -300,8 +289,8 @@ def make_filtration_non_decreasing(self): return self.thisptr.make_filtration_non_decreasing() def persistence(self, homology_coeff_field=11, min_persistence=0, - persistence_dim_max = False): - """This function returns the persistence of the simplicial complex. + persistence_dim_max=False): + """Return the persistence of the simplicial complex. :param homology_coeff_field: The homology coefficient field. Must be a prime number. Default value is 11. :type homology_coeff_field: int. @@ -323,13 +312,13 @@ def persistence(self, homology_coeff_field=11, min_persistence=0, persistence_dim_max) persistence_result = [] if self.pcohptr is not None: - persistence_result = \ - self.pcohptr.get_persistence(homology_coeff_field, + self.pcohptr.compute_persistence(homology_coeff_field, min_persistence) + persistence_result = self.pcohptr.get_persistence() return persistence_result def betti_numbers(self): - """This function returns the Betti numbers of the simplicial complex. + """Return the Betti numbers of the simplicial complex. :returns: The Betti numbers ([B0, B1, ..., Bn]). 
:rtype: list of int :note: betti_numbers function requires @@ -340,13 +329,12 @@ def betti_numbers(self): if self.pcohptr is not None: bn_result = self.pcohptr.betti_numbers() else: - print("betti_numbers function requires persistence function" - " to be launched first.") + print("`betti_numbers` requires persistence function to be " + "launched first.") return bn_result def persistent_betti_numbers(self, from_value, to_value): - """This function returns the persistent Betti numbers of the - simplicial complex. + """Return the persistent Betti numbers of the simplicial complex. :param from_value: The persistence birth limit to be added in the numbers (persistent birth <= from_value). :type from_value: float. @@ -364,13 +352,13 @@ def persistent_betti_numbers(self, from_value, to_value): pbn_result = self.pcohptr.persistent_betti_numbers(from_value, to_value) else: - print("persistent_betti_numbers function requires persistence function" - " to be launched first.") + print("`persistent_betti_numbers` requires persistence function " + "to be launched first.") return pbn_result def persistence_intervals_in_dimension(self, dimension): - """This function returns the persistence intervals of the simplicial - complex in a specific dimension. + """Return the persistence intervals of the simplicial complex in a + specific dimension. :param dimension: The specific dimension. :type dimension: int. :returns: The persistence intervals. @@ -383,12 +371,12 @@ def persistence_intervals_in_dimension(self, dimension): if self.pcohptr is not None: intervals_result = self.pcohptr.intervals_in_dimension(dimension) else: - print("intervals_in_dim function requires persistence function" - " to be launched first.") + print("`intervals_in_dim` requires persistence function to be " + "launched first.") return np.array(intervals_result) def persistence_pairs(self): - """This function returns a list of persistence birth and death simplices pairs. + """Return a list of persistence birth and death simplex pairs. :returns: A list of persistence simplices intervals. :rtype: list of pair of list of int :note: persistence_pairs function requires @@ -399,13 +387,13 @@ def persistence_pairs(self): if self.pcohptr is not None: persistence_pairs_result = self.pcohptr.persistence_pairs() else: - print("persistence_pairs function requires persistence function" - " to be launched first.") + print("`persistence_pairs` requires persistence function to be " + "launched first.") return persistence_pairs_result def write_persistence_diagram(self, persistence_file=''): - """This function writes the persistence intervals of the simplicial - complex in a user given file name. + """Write the persistence intervals of the simplicial complex in a + user-given file name. :param persistence_file: The specific dimension. :type persistence_file: string. 
:note: intervals_in_dim function requires @@ -416,7 +404,7 @@ def write_persistence_diagram(self, persistence_file=''): if persistence_file != '': self.pcohptr.write_output_diagram(str.encode(persistence_file)) else: - print("persistence_file must be specified") + print("`persistence_file` must be specified") else: - print("intervals_in_dim function requires persistence function" - " to be launched first.") + print("`intervals_in_dim` requires persistence function to be " + "launched first.") diff --git a/gtda/externals/python/tests/test_cech_complex.py b/gtda/externals/python/tests/test_cech_complex.py index ae9534d8e..a81ce1aa4 100644 --- a/gtda/externals/python/tests/test_cech_complex.py +++ b/gtda/externals/python/tests/test_cech_complex.py @@ -1,9 +1,5 @@ from .. import CechComplex -""" Test comes from - -""" - def test_minimal_cech(): points = [[1, 2]] diff --git a/gtda/externals/python/tests/test_collapser.py b/gtda/externals/python/tests/test_collapser.py new file mode 100644 index 000000000..19b64340c --- /dev/null +++ b/gtda/externals/python/tests/test_collapser.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python + +""" Test comes from +https://github.com/GUDHI/gudhi-devel/blob/master/src/Collapse/example/edge_collapse_basic_example.cpp +""" + +import numpy as np +from gtda.externals.modules.gtda_collapser import \ + flag_complex_collapse_edges_dense, \ + flag_complex_collapse_edges_sparse, \ + flag_complex_collapse_edges_coo +from scipy.sparse import coo_matrix, csr_matrix + +X = np.array([[0, 1, 1.], + [1, 2, 1.], + [2, 3, 1.], + [3, 0, 1.], + [0, 2, 2.], + [1, 3, 2.]], dtype=np.int32) +tX = np.transpose(X) + + +def check_collapse(collapsed, removed): + coo = collapsed.tocoo() + cooT = np.array([coo.row, coo.col, coo.data]).transpose() + for elem in removed: + if (cooT == elem).all(axis=1).any(): + return False + return True + + +def test_simple_csr_example(): + X = csr_matrix((tX[2], (tX[0], tX[1]))) + coo_ = flag_complex_collapse_edges_sparse(X) + coo = coo_matrix((coo_[2], (coo_[0], coo_[1]))) + assert check_collapse(coo, [[1, 3, 2]]) + + +def test_simple_coo_example(): + coo_ = flag_complex_collapse_edges_coo( + tX[0], tX[1], tX[2]) + coo = coo_matrix((coo_[2], (coo_[0], coo_[1]))) + assert check_collapse(coo, [[1, 3, 2]]) + + +def test_simple_dense_example(): + data = csr_matrix((tX[2], (tX[0], tX[1]))).toarray() + coo_ = flag_complex_collapse_edges_dense(data) + coo = coo_matrix((coo_[2], (coo_[0], coo_[1]))) + assert check_collapse(coo, [[1, 3, 2]]) diff --git a/gtda/externals/python/tests/test_ripser.py b/gtda/externals/python/tests/test_ripser.py new file mode 100644 index 000000000..5de7833d3 --- /dev/null +++ b/gtda/externals/python/tests/test_ripser.py @@ -0,0 +1,105 @@ +import numpy as np +import pytest +from hypothesis import given +from hypothesis.extra.numpy import arrays +from hypothesis.strategies import floats, integers, composite +from numpy.testing import assert_almost_equal +from scipy.sparse import coo_matrix + +from gtda.externals import ripser + + +@composite +def get_dense_distance_matrices(draw): + """Generate 2d dense square arrays of floats, with zero along the + diagonal.""" + shapes = draw(integers(min_value=2, max_value=30)) + distance_matrix = draw(arrays(dtype=np.float, + elements=floats(allow_nan=False, + allow_infinity=True, + min_value=0), + shape=(shapes, shapes), unique=False)) + np.fill_diagonal(distance_matrix, 0) + return distance_matrix + + +@composite +def get_sparse_distance_matrices(draw): + """Generate 2d sparse matrices of floats, with 
zero along the diagonal.""" + shapes = draw(integers(min_value=2, max_value=40)) + distance_matrix = draw(arrays(dtype=np.float, + elements=floats(allow_nan=False, + allow_infinity=True, + min_value=0), + shape=(shapes, shapes), unique=False)) + distance_matrix = np.triu(distance_matrix, k=1) + distance_matrix = coo_matrix(distance_matrix) + row, col, data = \ + distance_matrix.row, distance_matrix.col, distance_matrix.data + not_inf_idx = data != np.inf + row = row[not_inf_idx] + col = col[not_inf_idx] + data = data[not_inf_idx] + shape = (np.max(row) + 1, np.max(col) + 1) if not_inf_idx.any() else (0, 0) + distance_matrix = coo_matrix((data, (row, col)), shape=shape) + return distance_matrix + + +@pytest.mark.parametrize('thresh', [False, True]) +@pytest.mark.parametrize('coeff', [2, 7]) +@given(distance_matrix=get_dense_distance_matrices()) +def test_collapse_consistent_with_no_collapse_dense(thresh, + coeff, distance_matrix): + thresh = np.max(distance_matrix) / 2 if thresh else np.inf + maxdim = 3 + pd_collapse = ripser(distance_matrix, thresh=thresh, maxdim=maxdim, + coeff=coeff, metric='precomputed', + collapse_edges=True)['dgms'] + pd_no_collapse = ripser(distance_matrix, thresh=thresh, maxdim=maxdim, + coeff=coeff, metric='precomputed', + collapse_edges=False)['dgms'] + for i in range(maxdim + 1): + pd_collapse[i] = np.sort(pd_collapse[i], axis=0) + pd_no_collapse[i] = np.sort(pd_no_collapse[i], axis=0) + assert_almost_equal(pd_collapse[i], pd_no_collapse[i]) + + +@pytest.mark.parametrize('thresh', [False, True]) +@pytest.mark.parametrize('coeff', [2, 7]) +@given(distance_matrix=get_sparse_distance_matrices()) +def test_collapse_consistent_with_no_collapse_coo(thresh, + coeff, distance_matrix): + if thresh and distance_matrix.nnz: + thresh = np.max(distance_matrix) / 2 + else: + thresh = np.inf + maxdim = 3 + pd_collapse = ripser(distance_matrix, thresh=thresh, maxdim=maxdim, + coeff=coeff, metric='precomputed', + collapse_edges=True)['dgms'] + pd_no_collapse = ripser(distance_matrix, thresh=thresh, maxdim=maxdim, + coeff=coeff, metric='precomputed', + collapse_edges=False)['dgms'] + for i in range(maxdim + 1): + pd_collapse[i] = np.sort(pd_collapse[i], axis=0) + pd_no_collapse[i] = np.sort(pd_no_collapse[i], axis=0) + assert_almost_equal(pd_collapse[i], pd_no_collapse[i]) + + +def test_coo_results_independent_of_order(): + """Regression test for PR #465""" + data = np.array([6., 8., 2., 4., 5., 9., 10., 3., 1., 1.]) + row = np.array([0, 0, 0, 0, 1, 1, 1, 2, 2, 3]) + col = np.array([4, 1, 3, 2, 4, 3, 2, 3, 4, 4]) + dm = coo_matrix((data, (row, col))) + diagrams = ripser(dm, metric="precomputed")['dgms'] + diagrams_csr = ripser(dm.tocsr(), metric="precomputed")['dgms'] + expected = [np.array([[0., 1.], + [0., 1.], + [0., 2.], + [0., 5.], + [0., np.inf]]), + np.array([], dtype=np.float64).reshape(0, 2)] + for i in range(2): + assert np.array_equal(diagrams[i], expected[i]) + assert np.array_equal(diagrams_csr[i], expected[i]) diff --git a/gtda/graphs/__init__.py b/gtda/graphs/__init__.py index a8431ef91..b88d58792 100644 --- a/gtda/graphs/__init__.py +++ b/gtda/graphs/__init__.py @@ -10,4 +10,4 @@ 'TransitionGraph', 'KNeighborsGraph', 'GraphGeodesicDistance' -] + ] diff --git a/gtda/graphs/geodesic_distance.py b/gtda/graphs/geodesic_distance.py index 00bc1cb92..368f0fd05 100644 --- a/gtda/graphs/geodesic_distance.py +++ b/gtda/graphs/geodesic_distance.py @@ -1,10 +1,17 @@ """Graph geodesic distance calculations.""" # License: GNU AGPLv3 +from functools import reduce +from operator 
import and_ +from warnings import warn + import numpy as np from joblib import Parallel, delayed +from numpy.ma import masked_invalid +from numpy.ma.core import MaskedArray +from scipy.sparse import issparse +from scipy.sparse.csgraph import shortest_path from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils.graph_shortest_path import graph_shortest_path from sklearn.utils.validation import check_is_fitted from ..base import PlotterMixin @@ -22,8 +29,15 @@ class GraphGeodesicDistance(BaseEstimator, TransformerMixin, PlotterMixin): path between any two of its vertices, setting it to ``numpy.inf`` when two vertices cannot be connected by a path. - The graphs are encoded as sparse adjacency matrices, while the outputs - are dense distance matrices of variable size. + The graphs are represented by their adjacency matrices which can be dense + arrays, sparse matrices or masked arrays. The following rules apply: + + - In dense arrays of Boolean type, entries which are ``False`` represent + absent edges. + - In dense arrays of integer or float type, zero entries represent edges + of length 0. Absent edges must be indicated by ``numpy.inf``. + - In sparse matrices, non-stored values represent absent edges. Explicitly + stored zero or ``False`` edges represent edges of length 0. Parameters ---------- @@ -32,19 +46,34 @@ class GraphGeodesicDistance(BaseEstimator, TransformerMixin, PlotterMixin): in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. + directed : bool, optional, default: ``True`` + If ``True`` (default), then find the shortest path on a directed graph. + If ``False``, then find the shortest path on an undirected graph. + + unweighted : bool, optional, default: ``False`` + If ``True``, then find unweighted distances. That is, rather than + finding the path between each point such that the sum of weights is + minimized, find the path such that the number of edges is minimized. + + method : ``'auto'`` | ``'FW'`` | ``'D'`` | ``'BF'`` | ``'J'``, optional, \ + default: ``'auto'`` + Algorithm to use for shortest paths. See the `scipy documentation \ + `_. + Examples -------- >>> import numpy as np >>> from gtda.graphs import TransitionGraph, GraphGeodesicDistance >>> X = np.arange(4).reshape(1, -1, 1) - >>> tg = TransitionGraph(func=None).fit_transform(X) - >>> print(tg[0].toarray()) - [[False True False False] - [ True False True False] - [False True False True] - [False False True False]] - >>> ggd = GraphGeodesicDistance().fit_transform(tg) - >>> print(ggd[0]) + >>> X_tg = TransitionGraph(func=None).fit_transform(X) + >>> print(X_tg[0].toarray()) + [[0 1 0 0] + [0 0 1 0] + [0 0 0 1] + [0 0 0 0]] + >>> X_ggd = GraphGeodesicDistance(directed=False).fit_transform(X_tg) + >>> print(X_ggd[0]) [[0. 1. 2. 3.] [1. 0. 1. 2.] [2. 1. 0. 1.] @@ -52,18 +81,40 @@ class GraphGeodesicDistance(BaseEstimator, TransformerMixin, PlotterMixin): See also -------- - TransitionGraph, KNeighborsGraph, gtda.homology.VietorisRipsPersistence + TransitionGraph, KNeighborsGraph """ - def __init__(self, n_jobs=None): + def __init__(self, n_jobs=None, directed=False, unweighted=False, + method='auto'): self.n_jobs = n_jobs - - def _geodesic_distance(self, X): - X_distance = graph_shortest_path(X) - X_distance[X_distance == 0] = np.inf # graph_shortest_path returns a - # float64 array, so inserting np.inf does not change the type. 
- np.fill_diagonal(X_distance, 0) + self.directed = directed + self.unweighted = unweighted + self.method = method + + def _geodesic_distance(self, X, i=None): + method_ = self.method + if not issparse(X): + diag = np.eye(X.shape[0], dtype=bool) + if np.any(~np.logical_or(X, diag)): + if self.method in ['auto', 'FW']: + if np.any(X < 0): + method_ = 'J' + else: + method_ = 'D' + warn( + f"Methods 'auto' and 'FW' are not supported when " + f"some edge weights are zero. Using '{method_}' " + f"instead for graph {i}." + ) + if not isinstance(X, MaskedArray): + # Convert to a masked array with mask given by positions in + # which infs or NaNs occur. + if X.dtype != bool: + X = masked_invalid(X) + X_distance = shortest_path(X, directed=self.directed, + unweighted=self.unweighted, + method=method_) return X_distance def fit(self, X, y=None): @@ -74,10 +125,10 @@ def fit(self, X, y=None): Parameters ---------- - X : ndarray of shape (n_samples,) or \ - (n_samples, n_vertices, n_vertices) - Input data, i.e. a collection of adjacency matrices of graphs. - Each adjacency matrix may be a dense or a sparse array. + X : list of length n_samples, or ndarray of shape (n_samples, \ + n_vertices, n_vertices) + Input data: a collection of adjacency matrices of graphs. Each + adjacency matrix may be a dense or a sparse array. y : None There is no need for a target in a transformer, yet the pipeline @@ -94,44 +145,48 @@ def fit(self, X, y=None): return self def transform(self, X, y=None): - """Use :meth:`sklearn.utils.graph_shortest_path.graph_shortest_path` - to compute the lengths of graph shortest paths between any two + """Compute the lengths of graph shortest paths between any two vertices. Parameters ---------- - X : ndarray of shape (n_samples,) or \ - (n_samples, n_vertices, n_vertices) - Input data, i.e. a collection of adjacency matrices of graphs. - Each adjacency matrix may be a dense or sparse array. + X : list of length n_samples, or ndarray of shape (n_samples, \ + n_vertices, n_vertices) + Input data: a collection of ``n_samples`` adjacency matrices of + graphs. Each adjacency matrix may be a dense array, a sparse + matrix, or a masked array. y : None Ignored. Returns ------- - Xt : ndarray of shape (n_samples,) or \ - (n_samples, n_vertices, n_vertices) - Array of distance matrices. If the distance matrices have variable - size across samples, `Xt` is a one-dimensional array of dense - arrays. + Xt : list of length n_samples, or ndarray of shape (n_samples, \ + n_vertices, n_vertices) + Output collection of dense distance matrices. If the distance + matrices all have the same shape, a single 3D ndarray is returned. """ check_is_fitted(self, '_is_fitted') X = check_graph(X) Xt = Parallel(n_jobs=self.n_jobs)( - delayed(self._geodesic_distance)(x) for x in X) - Xt = np.array(Xt) + delayed(self._geodesic_distance)(x, i=i) for i, x in enumerate(X)) + + x0_shape = Xt[0].shape + if reduce(and_, (x.shape == x0_shape for x in Xt), True): + Xt = np.asarray(Xt) + return Xt @staticmethod - def plot(Xt, sample=0, colorscale='blues'): + def plot(Xt, sample=0, colorscale='blues', plotly_params=None): """Plot a sample from a collection of distance matrices. Parameters ---------- - Xt : ndarray of shape (n_samples, n_points, n_points) + Xt : list of length n_samples, or ndarray of shape (n_samples, \ + n_vertices, n_vertices) Collection of distance matrices, such as returned by :meth:`transform`. @@ -142,5 +197,21 @@ def plot(Xt, sample=0, colorscale='blues'): Color scale to be used in the heat map. 
Can be anything allowed by :class:`plotly.graph_objects.Heatmap`. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"trace"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. + """ - return plot_heatmap(Xt[sample], colorscale=colorscale) + return plot_heatmap( + Xt[sample], colorscale=colorscale, + title=f"{sample}-th geodesic distance matrix", + plotly_params=plotly_params + ) diff --git a/gtda/graphs/kneighbors.py b/gtda/graphs/kneighbors.py index 4487ec743..6bcf5df62 100644 --- a/gtda/graphs/kneighbors.py +++ b/gtda/graphs/kneighbors.py @@ -1,63 +1,44 @@ """kNN graphs from point cloud data.""" # License: GNU AGPLv3 -import warnings from functools import partial -import numpy as np from joblib import Parallel, delayed -from scipy.sparse import SparseEfficiencyWarning from sklearn.base import BaseEstimator, TransformerMixin from sklearn.neighbors import kneighbors_graph -from sklearn.utils.validation import check_array, check_is_fitted +from sklearn.utils.validation import check_is_fitted from ..utils._docs import adapt_fit_transform_docs +from ..utils.validation import check_point_clouds @adapt_fit_transform_docs class KNeighborsGraph(BaseEstimator, TransformerMixin): - """Adjacency matrices of k-nearest neighbor graphs. + """Adjacency matrices of :math:`k`-nearest neighbor graphs. Given a two-dimensional array of row vectors seen as points in - high-dimensional space, the corresponding kNN graph is a simple, - undirected and unweighted graph with a vertex for every vector in the - array, and an edge between two vertices whenever either the first - corresponding vector is among the k nearest neighbors of the - second, or vice-versa. - - :func:`sklearn.neighbors.kneighbors_graph` is used to compute the - adjacency matrices of kNN graphs. + high-dimensional space, the corresponding :math:`k`NN graph is a directed + graph with a vertex for every vector in the array, and a directed edge from + vertex :math:`i` to vertex :math:`j \\neq i` whenever vector :math:`j` is + among the :math:`k` nearest neighbors of vector :math:`i`. Parameters ---------- n_neighbors : int, optional, default: ``4`` - Number of neighbors to use. + Number of neighbors to use. A point is not considered as its own + neighbour. - metric : string or callable, optional, default: ``'euclidean'`` - Metric to use for distance computation. Any metric from scikit-learn - or :mod:`scipy.spatial.distance` can be used. - If metric is a callable function, it is called on each - pair of instances (rows) and the resulting value recorded. The callable - should take two arrays as input and return one value indicating the - distance between them. This works for Scipy's metrics, but is less - efficient than passing the metric name as a string. - Distance matrices are not supported. 
- Valid values for `metric` are: - - - from scikit-learn: [``'cityblock'``, ``'cosine'``, ``'euclidean'``, - ``'l1'``, ``'l2'``, ``'manhattan'``] - - from :mod:`scipy.spatial.distance`: [``'braycurtis'``, - ``'canberra'``, ``'chebyshev'``, ``'correlation'``, ``'dice'``, - ``'hamming'``, ``'jaccard'``, ``'kulsinski'``, ``'mahalanobis'``, - ``'minkowski'``, ``'rogerstanimoto'``, ``'russellrao'``, - ``'seuclidean'``, ``'sokalmichener'``, ``'sokalsneath'``, - ``'sqeuclidean'``, ``'yule'``] - - See the documentation for :mod:`scipy.spatial.distance` for details on - these metrics. + mode : ``'connectivity'`` | ``'distance'``, optional, \ + default: ``'connectivity'`` + Type of returned matrices: ``'connectivity'`` will return the 0-1 + connectivity matrices, and ``'distance'`` will return the distances + between neighbors according to the given metric. - metric_params : dict or None, optional, default: ``None`` - Additional keyword arguments for the metric function. + metric : string or callable, optional, default: ``'euclidean'`` + The distance metric to use. See the documentation of + :class:`sklearn.neighbors.DistanceMetric` for a list of available + metrics. If set to ``'precomputed'``, input data is interpreted as a + collection of distance matrices. p : int, optional, default: ``2`` Parameter for the Minkowski (i.e. :math:`\\ell^p`) metric from @@ -65,13 +46,13 @@ class KNeighborsGraph(BaseEstimator, TransformerMixin): when `metric` is ``'minkowski'``. `p` = 1 is the Manhattan distance, and `p` = 2 reduces to the Euclidean distance. - metric_params : dict, optional, default: ``{}`` + metric_params : dict or None, optional, default: ``None`` Additional keyword arguments for the metric function. n_jobs : int or None, optional, default: ``None`` - The number of jobs to use for the computation. ``None`` means 1 - unless in a :obj:`joblib.parallel_backend` context. ``-1`` means - using all processors. + The number of jobs to use for the computation. ``None`` means 1 unless + in a :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. Examples -------- @@ -84,29 +65,31 @@ class KNeighborsGraph(BaseEstimator, TransformerMixin): >>> kng = KNeighborsGraph(n_neighbors=2) >>> Xg = kng.fit_transform(X) >>> print(Xg[0].toarray()) - [[0. 1. 1. 1.] + [[0. 1. 0. 1.] [1. 0. 0. 1.] [1. 0. 0. 1.] - [1. 1. 1. 0.]] + [1. 1. 0. 0.]] + + See also + -------- + TransitionGraph, GraphGeodesicDistance + + Notes + ----- + :func:`sklearn.neighbors.kneighbors_graph` is used to compute the + adjacency matrices of kNN graphs. """ - def __init__(self, n_neighbors=4, metric='euclidean', + def __init__(self, n_neighbors=4, mode='connectivity', metric='euclidean', p=2, metric_params=None, n_jobs=None): self.n_neighbors = n_neighbors + self.mode = mode self.metric = metric self.p = p self.metric_params = metric_params self.n_jobs = n_jobs - def _make_adjacency_matrix(self, X): - A = self._nearest_neighbors(X) - rows, cols = A.nonzero() - with warnings.catch_warnings(): - warnings.simplefilter('ignore', SparseEfficiencyWarning) - A[cols, rows] = 1 - return A - def fit(self, X, y=None): """Do nothing and return the estimator unchanged. @@ -115,9 +98,12 @@ def fit(self, X, y=None): Parameters ---------- - X : ndarray of shape (n_samples, n_points, n_dimensions) - Input data. Each entry in `X` along axis 0 is an array of - ``n_points`` row vectors in ``n_dimensions``-dimensional space. 
+ X : list of length n_samples, or ndarray of shape (n_samples, \ + n_points, n_dimensions) or (n_samples, n_points, n_points) + Input data representing a collection of point clouds. Each entry + in `X` is a 2D array of shape ``(n_points, n_dimensions)`` if + `metric` is not ``'precomputed'``, or a 2D array of shape + ``(n_points, n_points)`` if `metric` is ``'precomputed'``. y : None There is no need for a target in a transformer, yet the pipeline @@ -128,24 +114,24 @@ def fit(self, X, y=None): self : object """ - check_array(X, allow_nd=True) - - self._nearest_neighbors = partial( - kneighbors_graph, n_neighbors=self.n_neighbors, metric=self.metric, - p=self.p, metric_params=self.metric_params, mode='connectivity', - include_self=False) + self._is_precomputed = self.metric == 'precomputed' + check_point_clouds(X, distance_matrices=self._is_precomputed) + self._is_fitted = True return self def transform(self, X, y=None): - """Compute kNN graphs and return their adjacency matrices as - sparse matrices. + """Compute kNN graphs and return their adjacency matrices in sparse + format. Parameters ---------- - X : ndarray of shape (n_samples, n_points, n_dimensions) - Input data. Each entry in `X` along axis 0 is an array of - ``n_points`` row vectors in ``n_dimensions``-dimensional space. + X : list of length n_samples, or ndarray of shape (n_samples, \ + n_points, n_dimensions) + Input data representing a collection of point clouds. Each entry + in `X` is a 2D array of shape ``(n_points, n_dimensions)`` if + `metric` is not ``'precomputed'``, or a 2D array of shape + ``(n_points, n_points)`` if `metric` is ``'precomputed'``. y : None There is no need for a target in a transformer, yet the pipeline @@ -153,14 +139,22 @@ def transform(self, X, y=None): Returns ------- - Xt : ndarray of sparse matrices in CSR format, shape (n_samples,) - Adjacency matrices of kNN graphs. + Xt : list of length n_samples + Adjacency matrices of kNN graphs, in sparse CSR format. The + matrices contain ones and zeros if `mode` is ``'connectivity'``, + and floats representing distances according to `metric` if `mode` + is ``'distance'``. 
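A minimal usage sketch of the refactored KNeighborsGraph (not part of the patch itself), using only parameters that appear in this diff (`n_neighbors`, `mode`, `metric`); the four-point toy cloud below is illustrative only:

import numpy as np
from scipy.spatial.distance import pdist, squareform
from gtda.graphs import KNeighborsGraph

# Toy collection containing a single point cloud of four points in the plane.
X = np.array([[[0, 0], [1, 2], [4, 3], [6, 2]]])

# Directed 2-NN graph with 0-1 connectivity entries, returned in sparse CSR
# format.
kng = KNeighborsGraph(n_neighbors=2, mode="connectivity")
print(kng.fit_transform(X)[0].toarray())

# The same graph built from a precomputed distance matrix, with stored
# entries equal to distances instead of ones.
dmat = squareform(pdist(X[0]))
kng_dist = KNeighborsGraph(n_neighbors=2, mode="distance",
                           metric="precomputed")
print(kng_dist.fit_transform([dmat])[0].toarray())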
""" - check_is_fitted(self, '_nearest_neighbors') - Xt = check_array(X, allow_nd=True) + check_is_fitted(self, '_is_fitted') + Xt = check_point_clouds(X, distance_matrices=self._is_precomputed) + + _adjacency_matrix_func = partial( + kneighbors_graph, n_neighbors=self.n_neighbors, metric=self.metric, + p=self.p, metric_params=self.metric_params, mode=self.mode, + include_self=False + ) + Xt = Parallel(n_jobs=self.n_jobs)(delayed(_adjacency_matrix_func)(x) + for x in Xt) - Xt = Parallel(n_jobs=self.n_jobs)( - delayed(self._make_adjacency_matrix)(x) for x in Xt) - Xt = np.array(Xt) return Xt diff --git a/gtda/graphs/tests/test_geodesic_distance.py b/gtda/graphs/tests/test_geodesic_distance.py index 29b41bae8..4888c4338 100644 --- a/gtda/graphs/tests/test_geodesic_distance.py +++ b/gtda/graphs/tests/test_geodesic_distance.py @@ -1,28 +1,81 @@ """Testing for GraphGeodesicDistance.""" +import warnings + import numpy as np import plotly.io as pio import pytest +from numpy.ma import masked_array from numpy.testing import assert_almost_equal +from scipy.sparse import csr_matrix from sklearn.exceptions import NotFittedError from gtda.graphs import GraphGeodesicDistance -pio.renderers.default = 'plotly_mimetype' +pio.renderers.default = "plotly_mimetype" + + +X_ggd = [] + +X_ggd_float = np.array([ + np.array([[0., 1., 3., 0., 0.], + [1., 0., 5., 0., 0.], + [3., 5., 0., 4., 0.], + [0., 0., 4., 0., 0.], + [0., 0., 0., 0., 0.]]), + np.array([[0., 1., 3., 0., np.inf], + [1., 0., 1., 0., np.inf], + [3., 1., 0., 4., np.inf], + [0., 0., 4., 0., np.inf], + [np.inf, np.inf, np.inf, np.inf, 0.]]) + ]) +X_ggd_float_res = np.array([ + np.zeros(X_ggd_float[0].shape, dtype=np.float), + np.array([[0., 0., 1., 0., np.inf], + [0., 0., 1., 0., np.inf], + [1., 1., 0., 1., np.inf], + [0., 0., 1., 0., np.inf], + [np.inf, np.inf, np.inf, np.inf, 0.]]) + ]) +X_ggd.append((X_ggd_float, X_ggd_float_res)) + +X_ggd_float_list = list(X_ggd_float) +X_ggd.append((X_ggd_float_list, X_ggd_float_res)) + +X_ggd_bool = [np.array([[False, True, False], + [True, False, False], + [False, False, False]])] +X_ggd_bool_res = np.array([[[0., 1., np.inf], + [1., 0., np.inf], + [np.inf, np.inf, 0.]]]) +X_ggd.append((X_ggd_bool, X_ggd_bool_res)) + +X_ggd_int = [X_ggd_bool[0].astype(int)] +X_ggd_int_res = np.zeros((1, *X_ggd_int[0].shape), dtype=np.float) +X_ggd.append((X_ggd_int, X_ggd_int_res)) -X_ggd = np.array([ - np.array( - [[0, 1, 3, 0, 0], - [1, 0, 5, 0, 0], - [3, 5, 0, 4, 0], - [0, 0, 4, 0, 0], - [0, 0, 0, 0, 0]]), - np.array( - [[0, 1, 3, 0, 0], - [1, 0, 1, 0, 0], - [3, 1, 0, 4, 0], - [0, 0, 4, 0, 0], - [0, 0, 0, 0, 0]])]) +x_ggd_float = X_ggd_bool[0].astype(np.float) +X_ggd.append(([x_ggd_float], X_ggd_int_res)) + +X_ggd.append( + ([masked_array(x_ggd_float, mask=x_ggd_float == np.inf)], X_ggd_int_res) + ) + +X_ggd_csr_int = [csr_matrix(X_ggd_int[0])] +X_ggd.append((X_ggd_csr_int, X_ggd_bool_res)) + +X_ggd_csr_int_with_zeros = [ + csr_matrix(([1, 1, 0, 0], ([0, 1, 0, 2], [1, 0, 2, 0]))) + ] +X_ggd_csr_int_with_zeros_res = np.array([[[0., 1., 0.], + [1., 0., 1.], + [0., 1., 0.]]]) +X_ggd.append((X_ggd_csr_int_with_zeros, X_ggd_csr_int_with_zeros_res)) + +X_ggd_csr_bool_with_False = [ + csr_matrix(([True, True, False, False], ([0, 1, 0, 2], [1, 0, 2, 0]))) + ] +X_ggd.append((X_ggd_csr_bool_with_False, X_ggd_csr_int_with_zeros_res)) def test_ggd_not_fitted(): @@ -33,31 +86,29 @@ def test_ggd_not_fitted(): def test_ggd_fit_transform_plot(): - GraphGeodesicDistance().fit_transform_plot(X_ggd, sample=0) - - -def test_ggd_transform(): - 
X_ggd_res = np.array([ - [[0., 1., 3., 7., np.inf], - [1., 0., 4., 8., np.inf], - [3., 4., 0., 4., np.inf], - [7., 8., 4., 0., np.inf], - [np.inf, np.inf, np.inf, np.inf, 0.]], - - [[0., 1., 2., 6., np.inf], - [1., 0., 1., 5., np.inf], - [2., 1., 0., 4., np.inf], - [6., 5., 4., 0., np.inf], - [np.inf, np.inf, np.inf, np.inf, 0.]] - ]) - ggd = GraphGeodesicDistance() + X = X_ggd[0][0] + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="Methods .*") + GraphGeodesicDistance().fit_transform_plot(X, sample=0) + + +@pytest.mark.parametrize("X, X_res", X_ggd) +@pytest.mark.parametrize("method", ["auto", "FW", "D", "J", "BF"]) +def test_ggd_transform(X, X_res, method): + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="Methods .*") + ggd = GraphGeodesicDistance(directed=False, method=method) + X_ft = ggd.fit_transform(X) - assert_almost_equal(ggd.fit_transform(X_ggd), X_ggd_res) + assert_almost_equal(X_ft, X_res) def test_parallel_ggd_transform(): + X = X_ggd[0][0] ggd = GraphGeodesicDistance(n_jobs=1) ggd_parallel = GraphGeodesicDistance(n_jobs=2) - assert_almost_equal(ggd.fit_transform(X_ggd), ggd_parallel.fit_transform( - X_ggd)) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="Methods .*") + assert_almost_equal(ggd.fit_transform(X), + ggd_parallel.fit_transform(X)) diff --git a/gtda/graphs/tests/test_kneighbors.py b/gtda/graphs/tests/test_kneighbors.py index bc7bd323e..66f6667b4 100644 --- a/gtda/graphs/tests/test_kneighbors.py +++ b/gtda/graphs/tests/test_kneighbors.py @@ -3,21 +3,27 @@ import numpy as np import pytest from scipy.sparse import csr_matrix +from scipy.spatial.distance import pdist, squareform from sklearn.exceptions import NotFittedError from gtda.graphs import KNeighborsGraph -X_kng = np.array([[[0, 0], [1, 2], [4, 3], [6, 2]]]) +X_kng = np.array([[[0, 0], + [1, 2], + [4, 3], + [6, 2]]]) +X_kng_list = list(X_kng) +dmat_0 = squareform(pdist(X_kng[0])) +X_kng_precomputed = dmat_0[None, :, :] +X_kng_precomputed_list = [dmat_0] -X_kng_res = np.array([ - csr_matrix((np.array([1] * 4), - (np.array([0, 1, 2, 3]), - np.array([1, 0, 3, 2]))), shape=(4, 4))]) +X_kng_res = [csr_matrix((np.array([1] * 4), + (np.array([0, 1, 2, 3]), np.array([1, 0, 3, 2]))))] -X_kng_res_k2 = np.array([csr_matrix(np.array([[0., 1., 1., 0.], - [1., 0., 1., 1.], - [1., 1., 0., 1.], - [0., 1., 1., 0.]]))]) +X_kng_res_k2 = [csr_matrix(np.array([[0, 1, 1, 0], + [1, 0, 1, 0], + [0, 1, 0, 1], + [0, 1, 1, 0]]))] def test_kng_not_fitted(): @@ -27,12 +33,15 @@ def test_kng_not_fitted(): kn_graph.transform(X_kng) +@pytest.mark.parametrize(('X', 'metric'), + [(X_kng, 'euclidean'), (X_kng_list, 'euclidean'), + (X_kng_precomputed, 'precomputed'), + (X_kng_precomputed_list, 'precomputed')]) @pytest.mark.parametrize(('n_neighbors', 'expected'), [(1, X_kng_res), (2, X_kng_res_k2)]) -def test_kng_transform(n_neighbors, expected): - kn_graph = KNeighborsGraph(n_neighbors=n_neighbors) - - assert (kn_graph.fit_transform(X_kng)[0] != expected[0]).nnz == 0 +def test_kng_transform(X, metric, n_neighbors, expected): + kn_graph = KNeighborsGraph(n_neighbors=n_neighbors, metric=metric) + assert (kn_graph.fit_transform(X)[0] != expected[0]).nnz == 0 def test_parallel_kng_transform(): @@ -41,11 +50,3 @@ def test_parallel_kng_transform(): assert (kn_graph.fit_transform(X_kng)[0] != kn_graph_parallel.fit_transform(X_kng)[0]).nnz == 0 - - -def test_symmetric(): - kn_graph = KNeighborsGraph(n_neighbors=2) - X_kng_transformed = kn_graph.fit_transform(X_kng) 
- - assert (X_kng_transformed[0] != - X_kng_transformed[0].transpose()).nnz == 0 diff --git a/gtda/graphs/tests/test_transition.py b/gtda/graphs/tests/test_transition.py index 5f6cb3546..78007e71b 100644 --- a/gtda/graphs/tests/test_transition.py +++ b/gtda/graphs/tests/test_transition.py @@ -7,17 +7,21 @@ from gtda.graphs import TransitionGraph -X_tg = np.array([[[1, 0], [2, 3], [5, 4]], - [[0, 1], [3, 2], [4, 5]]]) - -X_tg_res = np.array([ - csr_matrix((np.array([True] * 2), - (np.array([0, 1]), - np.array([1, 0]))), shape=(2, 2)), - csr_matrix((np.array([True] * 2), - (np.array([0, 1]), - np.array([1, 0]))), shape=(2, 2)), -]) +X_tg = np.array([[[1, 0], + [2, 3], + [5, 4]], + [[0, 1], + [3, 2], + [4, 5]], + [[5, 4], + [5, 4], + [5, 4]]]) + +X_tg_res = [ + csr_matrix((np.array([1] * 2), (np.array([0, 1]), np.array([1, 0])))), + csr_matrix((np.array([1] * 2), (np.array([0, 1]), np.array([1, 0])))), + csr_matrix(np.zeros((1, 1))) + ] def test_transition_graph_not_fitted(): @@ -31,8 +35,8 @@ def test_transition_graph_transform(): tg = TransitionGraph() Xt = tg.fit_transform(X_tg) - assert np.array_equal(Xt[0].toarray(), X_tg_res[0].toarray()) - assert np.array_equal(Xt[1].toarray(), X_tg_res[1].toarray()) + for xt, x_tg_res in zip(Xt, X_tg_res): + assert np.array_equal(xt.toarray(), x_tg_res.toarray()) def test_parallel_transition_graph_transform(): @@ -42,5 +46,5 @@ def test_parallel_transition_graph_transform(): Xt = tg.fit_transform(X_tg) Xt_parallel = tg_parallel.fit_transform(X_tg) - assert np.array_equal(Xt[0].toarray(), Xt_parallel[0].toarray()) - assert np.array_equal(Xt[1].toarray(), Xt_parallel[1].toarray()) + for xt, xt_parallel in zip(Xt, Xt_parallel): + assert np.array_equal(xt.toarray(), xt_parallel.toarray()) diff --git a/gtda/graphs/transition.py b/gtda/graphs/transition.py index 4e7be4ab1..5727bb870 100644 --- a/gtda/graphs/transition.py +++ b/gtda/graphs/transition.py @@ -2,17 +2,15 @@ # License: GNU AGPLv3 from types import FunctionType -import warnings import numpy as np from joblib import Parallel, delayed -from scipy import sparse as sp -from scipy.sparse import SparseEfficiencyWarning +from scipy.sparse import csr_matrix from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils.validation import check_array, check_is_fitted +from sklearn.utils.validation import check_is_fitted from ..utils._docs import adapt_fit_transform_docs -from ..utils.validation import validate_params +from ..utils.validation import validate_params, check_point_clouds def identity(x): @@ -27,29 +25,29 @@ class TransitionGraph(BaseEstimator, TransformerMixin): Let A be a two-dimensional array viewed as a time series (along the row axis) of one-dimensional arrays encoding the "state" of a system. The corresponding *undirected transition graph* (or *network*) has as vertex - set the set of all unique states (rows) in A, and there is an edge - between two vertices if and only if one of the corresponding states - immediately follows the other in A. + set the set of all unique states (rows) in A, and there is an edge between + vertex i and vertex j≠i if and only if the state corresponding to vertex + j immediately follows the one corresponding to vertex i, somewhere in A. - Given a collection of two-dimensional arrays, this transformer performs - two tasks: + Given a collection of two-dimensional arrays, this transformer performs two + tasks: - 1. Optionally, it preprocesses the arrays by applying a function row - by row to them. This can be used e.g. 
as a "compression" step - to reduce the size of the state space. - 2. It computes the undirected transition graph of each array as a - sparse matrix. + 1. Optionally, it preprocesses the arrays by applying a function row by + row to them. This can be used e.g. as a "compression" step to reduce + the size of the state space. + 2. It computes the transition graph of each array as a sparse matrix of + zeros and ones. Parameters ---------- func : None or callable, optional, default: ``numpy.argsort`` If a callable, it is the function to be applied to each row of each - array as a preprocessing steps. Allowed callables are functions - mapping 1-D arrays to 1-D arrays of constant length, and must be - compatible with :mod:`numpy.apply_along_axis`. If ``None``, this - function is the identity (no preprocessing). The default is - ``numpy.argsort``, which makes the final transition graphs - *ordinal partition networks* [1]_ [2]_ [3]_. + array as a preprocessing step. Allowed callables are functions mapping + 1D arrays to 1D arrays of constant length, and must be compatible with + :func:`numpy.apply_along_axis`. If ``None``, this function is the + identity (no preprocessing). The default is ``numpy.argsort``, which + makes the final transition graphs *ordinal partition networks* + [1]_ [2]_ [3]_. func_params : None or dict, optional, default: ``None`` Additional keyword arguments for `func`. @@ -70,19 +68,17 @@ class TransitionGraph(BaseEstimator, TransformerMixin): >>> import numpy as np >>> from gtda.graphs import TransitionGraph >>> X = np.array([[[1, 0], [2, 3], [5, 4]], - ... [[5, 4], [5, 4], [5, 4]]) - >>> tg = TransitionGraph() - >>> tg = tg.fit(X) - >>> print(tg.transform(X)[0].toarray()) + ... [[5, 4], [5, 4], [5, 4]]]) + >>> X_tg = TransitionGraph().fit_transform(X) + >>> print(X_tg[0].toarray()) [[0 1] [1 0]] - >>> print(tg.transform(X)[1].toarray()) - [[1 0] - [0 0]] + >>> print(X_tg[1].toarray()) + [[0]] See also -------- - GraphGeodesicDistance, gtda.time_series.TakensEmbedding + KNeighborsGraph, GraphGeodesicDistance Notes ----- @@ -94,26 +90,24 @@ class TransitionGraph(BaseEstimator, TransformerMixin): ---------- .. [1] M. Small, "Complex networks from time series: Capturing dynamics", *2013 IEEE International Symposium on Circuits and Systems - (IS-CAS2013)*, 2013; doi: `10.1109/iscas.2013.6572389 + (IS-CAS2013)*, 2013; `DOI: 10.1109/iscas.2013.6572389 `_. .. [2] M. McCullough, M. Small, T. Stemler, and H. Ho-Ching Iu, "Time lagged ordinal partition networks for capturing dynamics of - continuous dynamical systems"; *Chaos: An Interdisciplinary - Journal of Nonlinear Science* **25** (5), p. 053101, 2015; - doi: `10.1063/1.4919075 `_. + continuous dynamical systems"; *Chaos: An Interdisciplinary Journal + of Nonlinear Science* **25** (5), p. 053101, 2015; `DOI: + 10.1063/1.4919075 `_. .. [3] A. Myers, E. Munch, and F. A. Khasawneh, "Persistent homology of complex networks for dynamic state detection"; *Phys. Rev. E* - **100**, 022314, 2019; doi: `10.1103/PhysRevE.100.022314 + **100**, 022314, 2019; `DOI: 10.1103/PhysRevE.100.022314 `_. 
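A short sketch, assuming the API shown in this diff, of chaining TransitionGraph with GraphGeodesicDistance to obtain geodesic distances on an ordinal partition network; the input series is a toy example and not part of the patch:

import numpy as np
from gtda.graphs import TransitionGraph, GraphGeodesicDistance

# One toy "time series of states"; each row is a state of the system.
X = np.array([[[1, 0], [2, 3], [5, 4]]])

# With the default func=np.argsort, states are replaced by their ordinal
# patterns, so the transition graph is an ordinal partition network.
X_tg = TransitionGraph().fit_transform(X)

# Shortest-path distances on the resulting graph, treated as undirected.
X_ggd = GraphGeodesicDistance(directed=False).fit_transform(X_tg)
print(X_ggd[0])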
""" - _hyperparameters = { - 'func': {'type': (FunctionType, type(None))}, - 'func_params': {'type': (dict, type(None))} - } + _hyperparameters = {'func': {'type': (FunctionType, type(None))}, + 'func_params': {'type': (dict, type(None))}} def __init__(self, func=np.argsort, func_params=None, n_jobs=None): self.func = func @@ -122,17 +116,16 @@ def __init__(self, func=np.argsort, func_params=None, n_jobs=None): def _make_adjacency_matrix(self, X): Xm = np.apply_along_axis(self._func, 1, X) - Xm = np.unique(Xm, axis=0, return_inverse=True)[1] - n_indices = 2 * (len(Xm) - 1) + unique_states, Xm = np.unique(Xm, axis=0, return_inverse=True) + n_unique_states = len(unique_states) first = Xm[:-1] second = Xm[1:] - Xm = sp.csr_matrix((np.full(n_indices, True), - (np.concatenate([first, second]), - np.concatenate([second, first])))) - # Silence sparse warnings TODO: Benchmark - with warnings.catch_warnings(): - warnings.simplefilter('ignore', SparseEfficiencyWarning) - sp.csr_matrix.setdiag(Xm, 0) + non_diag_idx = first != second + data = np.full(np.sum(non_diag_idx), 1, dtype=int) + first = first[non_diag_idx] + second = second[non_diag_idx] + Xm = csr_matrix((data, (first, second)), + shape=(n_unique_states, n_unique_states)) return Xm def fit(self, X, y=None): @@ -143,8 +136,10 @@ def fit(self, X, y=None): Parameters ---------- - X : ndarray of shape (n_samples, n_time_steps, n_features) - Input data. + X : list of length n_samples, or ndarray of shape (n_samples, \ + n_timestamps, n_features) + Input data: a collection of 2D arrays of shape + ``(n_timestamps, n_features)``. y : None There is no need for a target in a transformer, yet the pipeline @@ -155,7 +150,7 @@ def fit(self, X, y=None): self : object """ - check_array(X, allow_nd=True) + check_point_clouds(X) validate_params( self.get_params(), self._hyperparameters, exclude=['n_jobs']) @@ -173,14 +168,14 @@ def fit(self, X, y=None): def transform(self, X, y=None): """Create transition graphs from the input data and return their - adjacency matrices. The graphs are simple, undirected and - unweighted, and the adjacency matrices are sparse matrices of type - bool. + adjacency matrices. The graphs are simple and unweighted. Parameters ---------- - X : ndarray of shape (n_samples, n_time_steps, n_features) - Input data. + X : list of length n_samples, or ndarray of shape (n_samples, \ + n_timestamps, n_features) + Input data: a collection of 2D arrays of shape + ``(n_timestamps, n_features)``. y : None There is no need for a target in a transformer, yet the pipeline @@ -188,15 +183,15 @@ def transform(self, X, y=None): Returns ------- - Xt : array of sparse boolean matrices, shape (n_samples,) - The collection of ``n_samples`` transition graphs. Each transition - graph is encoded by a sparse matrix of boolean type. + Xt : list of length n_samples + Collection of ``n_samples`` transition graphs. Each transition + graph is encoded by a sparse CSR matrix of ones and zeros. 
""" check_is_fitted(self) - Xt = check_array(X, allow_nd=True) + Xt = check_point_clouds(X) Xt = Parallel(n_jobs=self.n_jobs)( - delayed(self._make_adjacency_matrix)(x) for x in Xt) - Xt = np.asarray(Xt) + delayed(self._make_adjacency_matrix)(x) for x in Xt + ) return Xt diff --git a/gtda/homology/__init__.py b/gtda/homology/__init__.py index 0abf3ce1a..1918833e1 100644 --- a/gtda/homology/__init__.py +++ b/gtda/homology/__init__.py @@ -4,12 +4,14 @@ # License: GNU AGPLv3 from .simplicial import VietorisRipsPersistence, SparseRipsPersistence, \ - EuclideanCechPersistence + WeakAlphaPersistence, EuclideanCechPersistence, FlagserPersistence from .cubical import CubicalPersistence __all__ = [ 'VietorisRipsPersistence', 'SparseRipsPersistence', + 'WeakAlphaPersistence', 'EuclideanCechPersistence', + 'FlagserPersistence', 'CubicalPersistence', -] + ] diff --git a/gtda/homology/_utils.py b/gtda/homology/_utils.py index 0bf10836e..007d3ef58 100644 --- a/gtda/homology/_utils.py +++ b/gtda/homology/_utils.py @@ -1,37 +1,65 @@ -""" Utilities function for persistent homology.""" +"""Utility functions for persistent homology.""" # License: GNU AGPLv3 import numpy as np -from joblib import Parallel, delayed -def _pad_diagram(Xd, homology_dimensions, max_n_points, min_values): - for dim in homology_dimensions: - n_points = len(Xd[dim]) - n_points_to_pad = max_n_points[dim] - n_points - if n_points == 0 and n_points_to_pad == 0: - n_points_to_pad = 1 +def _postprocess_diagrams( + Xt, format, homology_dimensions, infinity_values, reduced + ): + # NOTE: `homology_dimensions` must be sorted in ascending order + def replace_infinity_values(subdiagram): + np.nan_to_num(subdiagram, posinf=infinity_values, copy=False) + return subdiagram[subdiagram[:, 0] < subdiagram[:, 1]] - if n_points_to_pad > 0: - padding = ((0, n_points_to_pad), (0, 0)) - Xd[dim] = np.pad(Xd[dim], padding, 'constant') - Xd[dim][-n_points_to_pad:, :] = \ - [min_values[dim], min_values[dim], dim] - Xd = np.vstack([Xd[dim] for dim in homology_dimensions]) - return Xd - - -def _postprocess_diagrams(Xt, homology_dimensions, infinity_values, n_jobs): - max_n_points = {dim: max(1, np.max([Xt[i][dim].shape[0] - for i in range(len(Xt))])) - for dim in homology_dimensions} - min_values = {dim: min([np.min(Xt[i][dim][:, 0]) if Xt[i][dim].size else - np.inf for i in range(len(Xt))]) + # Replace np.inf with infinity_values and turn into list of dictionaries + # whose keys are the dimensions + if format in ["ripser", "flagser"]: # Input is list of list of subdiagrams + # In H0, remove one infinite bar placed at the end by ripser or flagser + # only if `reduce` is True + slices = {dim: slice(None) if (dim or not reduced) else slice(None, -1) for dim in homology_dimensions} - min_values = {dim: min_values[dim] if min_values[dim] != np.inf else 0 + Xt = [{dim: replace_infinity_values(diagram[dim][slices[dim]]) + for dim in homology_dimensions} for diagram in Xt] + elif format == "gudhi": # Input is list of list of [dim, (birth, death)] + # In H0, remove one infinite bar placed at the beginning by GUDHI only + # if `reduce` is True + slices = {dim: slice(None) if (dim or not reduced) else slice(1, None) for dim in homology_dimensions} - Xt = Parallel(n_jobs=n_jobs)(delayed(_pad_diagram)( - Xt[i], homology_dimensions, max_n_points, min_values) - for i in range(len(Xt))) - Xt = np.stack(Xt) - return np.nan_to_num(Xt, posinf=infinity_values) + Xt = [{dim: replace_infinity_values( + np.array([pers_info[1] for pers_info in diagram + if pers_info[0] == 
dim]).reshape(-1, 2)[slices[dim]] + ) + for dim in homology_dimensions} for diagram in Xt] + else: + raise ValueError( + f"Unknown input format {format} for collection of diagrams." + ) + + # Conversion to array of triples with padding triples + start_idx_per_dim = np.cumsum( + [0] + [np.max([len(diagram[dim]) for diagram in Xt] + [1]) + for dim in homology_dimensions] + ) + min_values = [min([np.min(diagram[dim][:, 0]) if diagram[dim].size + else np.inf for diagram in Xt]) + for dim in homology_dimensions] + min_values = [min_value if min_value != np.inf else 0 + for min_value in min_values] + n_features = start_idx_per_dim[-1] + Xt_padded = np.empty((len(Xt), n_features, 3), dtype=float) + + for i, dim in enumerate(homology_dimensions): + start_idx, end_idx = start_idx_per_dim[i:i + 2] + padding_value = min_values[i] + # Add dimension as the third elements of each (b, d) tuple globally + Xt_padded[:, start_idx:end_idx, 2] = dim + for j, diagram in enumerate(Xt): + subdiagram = diagram[dim] + end_idx_nontrivial = start_idx + len(subdiagram) + # Populate nontrivial part of the subdiagram + Xt_padded[j, start_idx:end_idx_nontrivial, :2] = subdiagram + # Insert padding triples + Xt_padded[j, end_idx_nontrivial:end_idx, :2] = [padding_value] * 2 + + return Xt_padded diff --git a/gtda/homology/cubical.py b/gtda/homology/cubical.py index 373fb3b1a..894af3c2c 100644 --- a/gtda/homology/cubical.py +++ b/gtda/homology/cubical.py @@ -6,14 +6,14 @@ import numpy as np from joblib import Parallel, delayed from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils.validation import check_array, check_is_fitted +from sklearn.utils.validation import check_is_fitted -from ._utils import _pad_diagram +from ._utils import _postprocess_diagrams from ..base import PlotterMixin from ..externals.python import CubicalComplex, PeriodicCubicalComplex from ..plotting import plot_diagram from ..utils.intervals import Interval -from ..utils.validation import validate_params +from ..utils.validation import validate_params, check_collection class CubicalPersistence(BaseEstimator, TransformerMixin, PlotterMixin): @@ -26,6 +26,11 @@ class CubicalPersistence(BaseEstimator, TransformerMixin, PlotterMixin): dimensions and at different scales is summarised in the corresponding persistence diagram. + **Important note**: + - Persistence diagrams produced by this class must be interpreted with + care due to the presence of padding triples which carry no + information. See :meth:`transform` for additional information. + Parameters ---------- homology_dimensions : list or tuple, optional, default: ``(0, 1)`` @@ -34,23 +39,28 @@ class CubicalPersistence(BaseEstimator, TransformerMixin, PlotterMixin): coeff : int prime, optional, default: ``2`` Compute homology with coefficients in the prime field - :math:`\\mathbb{F}_p = \\{ 0, \\ldots, p - 1 \\}` where - :math:`p` equals `coeff`. + :math:`\\mathbb{F}_p = \\{ 0, \\ldots, p - 1 \\}` where :math:`p` + equals `coeff`. periodic_dimensions : boolean ndarray of shape (n_dimensions,) or None, \ optional, default: ``None`` - Periodicity of the boundaries along each of the axis, where + Periodicity of the boundaries along each of the axes, where ``n_dimensions`` is the dimension of the images of the collection. The boolean in the `d`th position expresses whether the boundaries along the `d`th axis are periodic. The default ``None`` is equivalent to passing ``numpy.zeros((n_dimensions,), dtype=np.bool)``, i.e. none of the boundaries are periodic. 
- infinity_values : float or None, default : ``None`` + infinity_values : float or None, default: ``None`` Which death value to assign to features which are still alive at - filtration value `np.inf`. ``None`` assigns the maximum pixel + filtration value ``numpy.inf``. ``None`` assigns the maximum pixel values within all images passed to :meth:`fit`. + reduced_homology : bool, optional, default: ``True`` + If ``True``, the earliest-born triple in homology dimension 0 which has + infinite death is discarded from each diagram computed in + :meth:`transform`. + n_jobs : int or None, optional, default: ``None`` The number of jobs to use for the computation. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all @@ -59,8 +69,8 @@ class CubicalPersistence(BaseEstimator, TransformerMixin, PlotterMixin): Attributes ---------- periodic_dimensions_ : boolean ndarray of shape (n_dimensions,) - Effective periodicity of the boundaries along each of the axis. - Set in :meth:`fit`. + Effective periodicity of the boundaries along each of the axes. Set in + :meth:`fit`. infinity_values_ : float Effective death value to assign to features which have infinite @@ -68,62 +78,56 @@ class CubicalPersistence(BaseEstimator, TransformerMixin, PlotterMixin): See also -------- - VietorisRipsPersistence, SparseRipsPersistence, EuclideanCechPersistence + images.HeightFiltration, images.RadialFiltration, \ + images.DilationFiltration, images.ErosionFiltration, \ + images.SignedDistanceFiltration Notes ----- `GUDHI `_ is used as a C++ backend - for computing cubical persistent homology. Python bindings were modified - for performance. - - Persistence diagrams produced by this class must be interpreted with - care due to the presence of padding triples which carry no information. - See :meth:`transform` for additional information. + for computing cubical persistent homology [1]_. Python bindings were + modified for performance. References ---------- - [1] P. Dlotko, "Cubical complex", 2015; `GUDHI User and Reference Manual \ - `_. + .. [1] P. Dlotko, "Cubical complex", 2015; `GUDHI User and Reference + Manual `_. 
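A minimal sketch of the intended CubicalPersistence usage, assuming only parameters appearing in this diff (`homology_dimensions`, `reduced_homology`); the random images are placeholders, not data from the library:

import numpy as np
from gtda.homology import CubicalPersistence

# Two toy 2D "images" of filtration values.
images = np.random.default_rng(42).random((2, 10, 10))

# reduced_homology=True (the default) drops the single always-infinite bar
# in homology dimension 0 from each diagram.
cp = CubicalPersistence(homology_dimensions=(0, 1), reduced_homology=True)
diagrams = cp.fit_transform(images)

# One array of (birth, death, homology dimension) triples per image.
print(diagrams.shape)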
""" _hyperparameters = { 'homology_dimensions': { - 'type': (list, tuple), 'of': { - 'type': int, 'in': Interval(0, np.inf, closed='left')}}, + 'type': (list, tuple), + 'of': {'type': int, 'in': Interval(0, np.inf, closed='left')} + }, 'coeff': {'type': int, 'in': Interval(2, np.inf, closed='left')}, - 'periodic_dimensions': { - 'type': (np.ndarray, type(None)), - 'of': {'type': np.bool_}}, - 'infinity_values': {'type': (Real, type(None))}} + 'periodic_dimensions': {'type': (np.ndarray, type(None)), + 'of': {'type': np.bool_}}, + 'infinity_values': {'type': (Real, type(None))}, + 'reduced_homology': {'type': bool} + } def __init__(self, homology_dimensions=(0, 1), coeff=2, - periodic_dimensions=None, infinity_values=None, n_jobs=None): + periodic_dimensions=None, infinity_values=None, + reduced_homology=True, n_jobs=None): self.homology_dimensions = homology_dimensions self.coeff = coeff self.periodic_dimensions = periodic_dimensions self.infinity_values = infinity_values + self.reduced_homology = reduced_homology self.n_jobs = n_jobs def _gudhi_diagram(self, X): cubical_complex = self._filtration( dimensions=X.shape, top_dimensional_cells=X.flatten(order="F"), - **self._filtration_kwargs) - Xdgms = cubical_complex.persistence(homology_coeff_field=self.coeff, - min_persistence=0) - - # Separate diagrams by homology dimensions - Xdgms = {dim: np.array([Xdgms[i][1] for i in range(len(Xdgms)) - if Xdgms[i][0] == dim]).reshape((-1, 2)) - for dim in self.homology_dimensions} - - # Add dimension as the third elements of each (b, d) tuple - Xdgms = {dim: np.hstack([Xdgms[dim], - dim * np.ones((Xdgms[dim].shape[0], 1), - dtype=Xdgms[dim].dtype)]) - for dim in self._homology_dimensions} - return Xdgms + **self._filtration_kwargs + ) + Xdgm = cubical_complex.persistence(homology_coeff_field=self.coeff, + min_persistence=0) + + return Xdgm def fit(self, X, y=None): """Do nothing and return the estimator unchanged. @@ -145,7 +149,7 @@ def fit(self, X, y=None): self : object """ - X = check_array(X, allow_nd=True) + X = check_collection(X, force_all_finite=False) validate_params( self.get_params(), self._hyperparameters, exclude=['n_jobs']) @@ -162,17 +166,21 @@ def fit(self, X, y=None): self.periodic_dimensions_ if self.infinity_values is None: - self.infinity_values_ = np.max(X) + if hasattr(X, 'shape'): + self.infinity_values_ = np.max(X) + else: + self.infinity_values_ = max(map(np.max, X)) else: self.infinity_values_ = self.infinity_values self._homology_dimensions = sorted(self.homology_dimensions) self._max_homology_dimension = self._homology_dimensions[-1] + return self def transform(self, X, y=None): - """For each image in `X`, compute the relevant persistence diagram - as an array of triples [b, d, q]. Each triple represents a persistent + """For each image in `X`, compute the relevant persistence diagram as + an array of triples [b, d, q]. Each triple represents a persistent topological feature in dimension q (belonging to `homology_dimensions`) which is born at b and dies at d. Only triples in which b < d are meaningful. Triples in which b and d are equal ("diagonal elements") @@ -199,37 +207,29 @@ def transform(self, X, y=None): :math:`\\sum_q n_q`, where :math:`n_q` is the maximum number of topological features in dimension :math:`q` across all samples in `X`. 
+ """ check_is_fitted(self) - Xt = check_array(X, allow_nd=True) - - Xt = Parallel(n_jobs=self.n_jobs)( - delayed(self._gudhi_diagram)(x) for x in Xt) - - max_n_points = { - dim: max(1, np.max([x[dim].shape[0] for x in Xt])) for dim in - self.homology_dimensions} - min_values = { - dim: min([np.min(x[dim][:, 0]) if x[dim].size else np.inf for x - in Xt]) for dim in self.homology_dimensions} - min_values = { - dim: min_values[dim] if min_values[dim] != np.inf else 0 for dim - in self.homology_dimensions} - Xt = Parallel(n_jobs=self.n_jobs)(delayed(_pad_diagram)( - x, self._homology_dimensions, max_n_points, min_values) - for x in Xt) - Xt = np.stack(Xt) - Xt = np.nan_to_num(Xt, posinf=self.infinity_values_) + Xt = check_collection(X, force_all_finite=False) + + Xt = Parallel(n_jobs=self.n_jobs)(delayed(self._gudhi_diagram)(x) + for x in Xt) + + Xt = _postprocess_diagrams( + Xt, "gudhi", self._homology_dimensions, self.infinity_values_, + self.reduced_homology + ) + return Xt @staticmethod - def plot(Xt, sample=0, homology_dimensions=None): + def plot(Xt, sample=0, homology_dimensions=None, plotly_params=None): """Plot a sample from a collection of persistence diagrams, with homology in multiple dimensions. Parameters ---------- - Xt : ndarray of shape (n_samples, n_points, 3) + Xt : ndarray of shape (n_samples, n_features, 3) Collection of persistence diagrams, such as returned by :meth:`transform`. @@ -240,6 +240,20 @@ def plot(Xt, sample=0, homology_dimensions=None): Which homology dimensions to include in the plot. ``None`` means plotting all dimensions present in ``Xt[sample]``. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"traces"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. + """ return plot_diagram( - Xt[sample], homology_dimensions=homology_dimensions) + Xt[sample], homology_dimensions=homology_dimensions, + plotly_params=plotly_params + ) diff --git a/gtda/homology/simplicial.py b/gtda/homology/simplicial.py index 624becc18..27d6f8511 100644 --- a/gtda/homology/simplicial.py +++ b/gtda/homology/simplicial.py @@ -6,8 +6,10 @@ import numpy as np from joblib import Parallel, delayed +from pyflagser import flagser_weighted +from scipy.sparse import coo_matrix +from scipy.spatial import Delaunay from sklearn.base import BaseEstimator, TransformerMixin - from sklearn.metrics.pairwise import pairwise_distances from sklearn.utils.validation import check_is_fitted @@ -16,7 +18,6 @@ from ..externals.python import ripser, SparseRipsComplex, CechComplex from ..plotting import plot_diagram from ..utils._docs import adapt_fit_transform_docs - from ..utils.intervals import Interval from ..utils.validation import validate_params, check_point_clouds @@ -27,14 +28,19 @@ class VietorisRipsPersistence(BaseEstimator, TransformerMixin, PlotterMixin): :ref:`Vietoris–Rips filtrations `. - Given a :ref:`point cloud ` in - Euclidean space, or an abstract - :ref:`metric space ` encoded by a - distance matrix, information about the appearance and disappearance of - topological features (technically, - :ref:`homology classes `) of various dimension - and at different scales is summarised in the corresponding persistence - diagram. 
+ Given a :ref:`point cloud ` in + Euclidean space, or an abstract :ref:`metric space + ` encoded by a distance matrix, + information about the appearance and disappearance of topological features + (technically, :ref:`homology classes `) of various + dimensions and at different scales is summarised in the corresponding + persistence diagram. + + **Important note**: + + - Persistence diagrams produced by this class must be interpreted with + care due to the presence of padding triples which carry no + information. See :meth:`transform` for additional information. Parameters ---------- @@ -44,9 +50,9 @@ class VietorisRipsPersistence(BaseEstimator, TransformerMixin, PlotterMixin): undirected graphs. Otherwise, input data is to be interpreted as a collection of point clouds (i.e. feature arrays), and `metric` determines a rule with which to calculate distances between pairs of - points (i.e. row vectors). If `metric` is a string, it must be one - of the options allowed by :func:`scipy.spatial.distance.pdist` for - its metric parameter, or a metric listed in + points (i.e. row vectors). If `metric` is a string, it must be one of + the options allowed by :func:`scipy.spatial.distance.pdist` for its + metric parameter, or a metric listed in :obj:`sklearn.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`, including ``'euclidean'``, ``'manhattan'`` or ``'cosine'``. If `metric` is a callable, it should take pairs of vectors (1D arrays) as input and, for @@ -59,19 +65,30 @@ class VietorisRipsPersistence(BaseEstimator, TransformerMixin, PlotterMixin): coeff : int prime, optional, default: ``2`` Compute homology with coefficients in the prime field - :math:`\\mathbb{F}_p = \\{ 0, \\ldots, p - 1 \\}` where - :math:`p` equals `coeff`. + :math:`\\mathbb{F}_p = \\{ 0, \\ldots, p - 1 \\}` where :math:`p` + equals `coeff`. + + collapse_edges : bool, optional, default: ``False`` + Whether to run the edge collapse algorithm in [2]_ prior to the + persistent homology computation (see the Notes). Can reduce the runtime + dramatically when the data or the maximum homology dimensions are + large. max_edge_length : float, optional, default: ``numpy.inf`` - Upper bound on the maximum value of the Vietoris–Rips filtration - parameter. Points whose distance is greater than this value will - never be connected by an edge, and topological features at scales - larger than this value will not be detected. + Maximum value of the Vietoris–Rips filtration parameter. Points whose + distance is greater than this value will never be connected by an edge, + and topological features at scales larger than this value will not be + detected. infinity_values : float or None, default: ``None`` Which death value to assign to features which are still alive at - filtration value `max_edge_length`. ``None`` means that this - death value is declared to be equal to `max_edge_length`. + filtration value `max_edge_length`. ``None`` means that this death + value is declared to be equal to `max_edge_length`. + + reduced_homology : bool, optional, default: ``True`` + If ``True``, the earliest-born triple in homology dimension 0 which has + infinite death is discarded from each diagram computed in + :meth:`transform`. n_jobs : int or None, optional, default: ``None`` The number of jobs to use for the computation. 
``None`` means 1 unless @@ -86,61 +103,66 @@ class VietorisRipsPersistence(BaseEstimator, TransformerMixin, PlotterMixin): See also -------- - SparseRipsPersistence, EuclideanCechPersistence, CubicalPersistence, - ConsistentRescaling + FlagserPersistence, SparseRipsPersistence, WeakAlphaPersistence, \ + EuclideanCechPersistence, ConsistentRescaling, ConsecutiveRescaling Notes ----- - `Ripser `_ is used as a C++ backend + `Ripser `_ [1]_ is used as a C++ backend for computing Vietoris–Rips persistent homology. Python bindings were modified for performance from the `ripser.py `_ package. - Persistence diagrams produced by this class must be interpreted with - care due to the presence of padding triples which carry no information. - See :meth:`transform` for additional information. + `GUDHI `_ is used as a C++ backend + for the edge collapse algorithm described in [2]_. References ---------- - [1] U. Bauer, "Ripser: efficient computation of Vietoris–Rips persistence \ - barcodes", 2019; `arXiv:1908.02518 \ - `_. + .. [1] U. Bauer, "Ripser: efficient computation of Vietoris–Rips + persistence barcodes", 2019; `arXiv:1908.02518 + `_. + + .. [2] J.-D. Boissonnat and S. Pritam, "Edge Collapse and Persistence of + Flag Complexes"; in *36th International Symposium on Computational + Geometry (SoCG 2020)*, pp. 19:1–19:15, + Schloss Dagstuhl-Leibniz–Zentrum für Informatik, 2020; + `DOI: 10.4230/LIPIcs.SoCG.2020.19 + `_. """ _hyperparameters = { 'metric': {'type': (str, FunctionType)}, 'homology_dimensions': { - 'type': (list, tuple), 'of': { - 'type': int, 'in': Interval(0, np.inf, closed='left')}}, + 'type': (list, tuple), + 'of': {'type': int, 'in': Interval(0, np.inf, closed='left')} + }, + 'collapse_edges': {'type': bool}, 'coeff': {'type': int, 'in': Interval(2, np.inf, closed='left')}, 'max_edge_length': {'type': Real}, - 'infinity_values': {'type': (Real, type(None))} - } + 'infinity_values': {'type': (Real, type(None))}, + 'reduced_homology': {'type': bool} + } def __init__(self, metric='euclidean', homology_dimensions=(0, 1), - coeff=2, max_edge_length=np.inf, infinity_values=None, - n_jobs=None): + collapse_edges=False, coeff=2, max_edge_length=np.inf, + infinity_values=None, reduced_homology=True, n_jobs=None): self.metric = metric self.homology_dimensions = homology_dimensions + self.collapse_edges = collapse_edges self.coeff = coeff self.max_edge_length = max_edge_length self.infinity_values = infinity_values + self.reduced_homology = reduced_homology self.n_jobs = n_jobs def _ripser_diagram(self, X): - Xdgms = ripser(X, maxdim=self._max_homology_dimension, - thresh=self.max_edge_length, coeff=self.coeff, - metric=self.metric)['dgms'] - - if 0 in self._homology_dimensions: - Xdgms[0] = Xdgms[0][:-1, :] # Remove one infinite bar + Xdgms = ripser( + X, maxdim=self._max_homology_dimension, + thresh=self.max_edge_length, coeff=self.coeff, + metric=self.metric, collapse_edges=self.collapse_edges + )['dgms'] - # Add dimension as the third elements of each (b, d) tuple - Xdgms = {dim: np.hstack([Xdgms[dim], - dim * np.ones((Xdgms[dim].shape[0], 1), - dtype=Xdgms[dim].dtype)]) - for dim in self._homology_dimensions} return Xdgms def fit(self, X, y=None): @@ -151,16 +173,25 @@ def fit(self, X, y=None): Parameters ---------- - X : ndarray or list + X : ndarray or list of length n_samples Input data representing a collection of point clouds if `metric` was not set to ``'precomputed'``, and of distance matrices or adjacency matrices of weighted undirected graphs otherwise. 
Can be either a 3D ndarray whose zeroth dimension has size ``n_samples``, - or a list containing ``n_samples`` 2D ndarrays. If `metric` was - set to ``'precomputed'``, each entry of `X` must be a square - array and should be compatible with a filtration, i.e. the value - at index (i, j) should be no smaller than the values at diagonal - indices (i, i) and (j, j). + or a list containing ``n_samples`` 2D ndarrays/sparse matrices. + Point cloud arrays have shape ``(n_points, n_dimensions)``, and if + `X` is a list these shapes can vary between point clouds. If + `metric` was set to ``'precomputed'``, then: + + - if entries of `X` are dense, only their upper diagonal + portions (including the diagonal) are considered; + - if entries of `X` are sparse, they do not need to be upper + diagonal or symmetric, but correct results can only be + guaranteed when only one between entry (i, j) and entry + (j, i) is stored, or both are stored but they are equal. + - entries of `X` should be compatible with a filtration, i.e. + the value at index (i, j) should be no smaller than the + values at diagonal indices (i, i) and (j, j). y : None There is no need for a target in a transformer, yet the pipeline @@ -174,7 +205,8 @@ def fit(self, X, y=None): validate_params( self.get_params(), self._hyperparameters, exclude=['n_jobs']) self._is_precomputed = self.metric == 'precomputed' - check_point_clouds(X, distance_matrices=self._is_precomputed) + check_point_clouds(X, accept_sparse=True, + distance_matrices=self._is_precomputed) if self.infinity_values is None: self.infinity_values_ = self.max_edge_length @@ -183,6 +215,7 @@ def fit(self, X, y=None): self._homology_dimensions = sorted(self.homology_dimensions) self._max_homology_dimension = self._homology_dimensions[-1] + return self def transform(self, X, y=None): @@ -199,16 +232,25 @@ def transform(self, X, y=None): Parameters ---------- - X : ndarray or list + X : ndarray or list of length n_samples Input data representing a collection of point clouds if `metric` was not set to ``'precomputed'``, and of distance matrices or adjacency matrices of weighted undirected graphs otherwise. Can be either a 3D ndarray whose zeroth dimension has size ``n_samples``, - or a list containing ``n_samples`` 2D ndarrays. If `metric` was - set to ``'precomputed'``, each entry of `X` must be a square - array and should be compatible with a filtration, i.e. the value - at index (i, j) should be no smaller than the values at diagonal - indices (i, i) and (j, j). + or a list containing ``n_samples`` 2D ndarrays/sparse matrices. + Point cloud arrays have shape ``(n_points, n_dimensions)``, and if + `X` is a list these shapes can vary between point clouds. If + `metric` was set to ``'precomputed'``, then: + + - if entries of `X` are dense, only their upper diagonal + portions (including the diagonal) are considered; + - if entries of `X` are sparse, they do not need to be upper + diagonal or symmetric, but correct results can only be + guaranteed when only one between entry (i, j) and entry + (j, i) is stored, or both are stored but they are equal. + - entries of `X` should be compatible with a filtration, i.e. + the value at index (i, j) should be no smaller than the + values at diagonal indices (i, i) and (j, j). 
y : None There is no need for a target in a transformer, yet the pipeline @@ -225,23 +267,26 @@ def transform(self, X, y=None): """ check_is_fitted(self) - X = check_point_clouds(X, distance_matrices=self._is_precomputed) + X = check_point_clouds(X, accept_sparse=True, + distance_matrices=self._is_precomputed) Xt = Parallel(n_jobs=self.n_jobs)( delayed(self._ripser_diagram)(x) for x in X) - Xt = _postprocess_diagrams(Xt, self._homology_dimensions, - self.infinity_values_, self.n_jobs) + Xt = _postprocess_diagrams( + Xt, "ripser", self._homology_dimensions, self.infinity_values_, + self.reduced_homology + ) return Xt @staticmethod - def plot(Xt, sample=0, homology_dimensions=None): + def plot(Xt, sample=0, homology_dimensions=None, plotly_params=None): """Plot a sample from a collection of persistence diagrams, with homology in multiple dimensions. Parameters ---------- - Xt : ndarray of shape (n_samples, n_points, 3) + Xt : ndarray of shape (n_samples, n_features, 3) Collection of persistence diagrams, such as returned by :meth:`transform`. @@ -252,9 +297,23 @@ def plot(Xt, sample=0, homology_dimensions=None): Which homology dimensions to include in the plot. ``None`` means plotting all dimensions present in ``Xt[sample]``. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"traces"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. + """ return plot_diagram( - Xt[sample], homology_dimensions=homology_dimensions) + Xt[sample], homology_dimensions=homology_dimensions, + plotly_params=plotly_params + ) @adapt_fit_transform_docs @@ -263,31 +322,35 @@ class SparseRipsPersistence(BaseEstimator, TransformerMixin, PlotterMixin): :ref:`Sparse Vietoris–Rips filtrations `. - Given a :ref:`point cloud ` in - Euclidean space, or an abstract - :ref:`metric space ` - encoded by a distance matrix, information about the appearance and - disappearance of topological features (technically, - :ref:`homology classes `) of various dimensions - and at different scales is summarised in the corresponding persistence - diagram. + Given a :ref:`point cloud ` in + Euclidean space, or an abstract :ref:`metric space + ` encoded by a distance matrix, + information about the appearance and disappearance of topological features + (technically, :ref:`homology classes `) of various + dimensions and at different scales is summarised in the corresponding + persistence diagram. + + **Important note**: + + - Persistence diagrams produced by this class must be interpreted with + care due to the presence of padding triples which carry no + information. See :meth:`transform` for additional information. Parameters ---------- metric : string or callable, optional, default: ``'euclidean'`` If set to ``'precomputed'``, input data is to be interpreted as a collection of distance matrices. Otherwise, input data is to be - interpreted as a collection of point clouds (i.e. feature arrays), - and `metric` determines a rule with which to calculate distances - between pairs of instances (i.e. rows) in these arrays. - If `metric` is a string, it must be one of the options allowed by + interpreted as a collection of point clouds (i.e. 
feature arrays), and + `metric` determines a rule with which to calculate distances between + pairs of instances (i.e. rows) in these arrays. If `metric` is a + string, it must be one of the options allowed by :func:`scipy.spatial.distance.pdist` for its metric parameter, or a metric listed in :obj:`sklearn.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`, - including "euclidean", "manhattan", or "cosine". - If `metric` is a callable function, it is called on each pair of - instances and the resulting value recorded. The callable should take - two arrays from the entry in `X` as input, and return a value - indicating the distance between them. + including "euclidean", "manhattan", or "cosine". If `metric` is a + callable, it is called on each pair of instances and the resulting + value recorded. The callable should take two arrays from the entry in + `X` as input, and return a value indicating the distance between them. homology_dimensions : list or tuple, optional, default: ``(0, 1)`` Dimensions (non-negative integers) of the topological features to be @@ -295,24 +358,29 @@ class SparseRipsPersistence(BaseEstimator, TransformerMixin, PlotterMixin): coeff : int prime, optional, default: ``2`` Compute homology with coefficients in the prime field - :math:`\\mathbb{F}_p = \\{ 0, \\ldots, p - 1 \\}` where - :math:`p` equals `coeff`. + :math:`\\mathbb{F}_p = \\{ 0, \\ldots, p - 1 \\}` where :math:`p` + equals `coeff`. epsilon : float between 0. and 1., optional, default: ``0.1`` Parameter controlling the approximation to the exact Vietoris–Rips - filtration. If set to `0.`, :class:`SparseRipsPersistence` leads to - the same results as :class:`VietorisRipsPersistence` but is slower. + filtration. If set to `0.`, :class:`SparseRipsPersistence` leads to the + same results as :class:`VietorisRipsPersistence` but is slower. max_edge_length : float, optional, default: ``numpy.inf`` - Upper bound on the maximum value of the Vietoris–Rips filtration - parameter. Points whose distance is greater than this value will - never be connected by an edge, and topological features at scales - larger than this value will not be detected. + Maximum value of the Sparse Rips filtration parameter. Points whose + distance is greater than this value will never be connected by an edge, + and topological features at scales larger than this value will not be + detected. - infinity_values : float or None, default : ``None`` + infinity_values : float or None, default: ``None`` Which death value to assign to features which are still alive at - filtration value `max_edge_length`. ``None`` means that this - death value is declared to be equal to `max_edge_length`. + filtration value `max_edge_length`. ``None`` means that this death + value is declared to be equal to `max_edge_length`. + + reduced_homology : bool, optional, default: ``True`` + If ``True``, the earliest-born triple in homology dimension 0 which has + infinite death is discarded from each diagram computed in + :meth:`transform`. n_jobs : int or None, optional, default: ``None`` The number of jobs to use for the computation. 
``None`` means 1 unless @@ -327,73 +395,62 @@ class SparseRipsPersistence(BaseEstimator, TransformerMixin, PlotterMixin): See also -------- - VietorisRipsPersistence, EuclideanCechPersistence, CubicalPersistence, - ConsistentRescaling + VietorisRipsPersistence, FlagserPersistence, WeakAlphaPersistence, \ + EuclideanCechPersistence, ConsistentRescaling, ConsecutiveRescaling Notes ----- `GUDHI `_ is used as a C++ backend - for computing sparse Vietoris–Rips persistent homology. Python bindings - were modified for performance. - - Persistence diagrams produced by this class must be interpreted with - care due to the presence of padding triples which carry no information. - See :meth:`transform` for additional information. + for computing sparse Vietoris–Rips persistent homology [1]_. Python + bindings were modified for performance. References ---------- - [1] C. Maria, "Persistent Cohomology", 2020; `GUDHI User and Reference \ - Manual `_. + .. [1] C. Maria, "Persistent Cohomology", 2020; `GUDHI User and Reference + Manual `_. """ _hyperparameters = { 'metric': {'type': (str, FunctionType)}, 'homology_dimensions': { - 'type': (list, tuple), 'of': { - 'type': int, 'in': Interval(0, np.inf, closed='left')}}, + 'type': (list, tuple), + 'of': {'type': int, 'in': Interval(0, np.inf, closed='left')} + }, 'coeff': {'type': int, 'in': Interval(2, np.inf, closed='left')}, 'epsilon': {'type': Real, 'in': Interval(0, 1, closed='both')}, 'max_edge_length': {'type': Real}, - 'infinity_values': {'type': (Real, type(None))} - } + 'infinity_values': {'type': (Real, type(None))}, + 'reduced_homology': {'type': bool} + } def __init__(self, metric='euclidean', homology_dimensions=(0, 1), coeff=2, epsilon=0.1, max_edge_length=np.inf, - infinity_values=None, n_jobs=None): + infinity_values=None, reduced_homology=True, n_jobs=None): self.metric = metric self.homology_dimensions = homology_dimensions self.coeff = coeff self.epsilon = epsilon self.max_edge_length = max_edge_length self.infinity_values = infinity_values + self.reduced_homology = reduced_homology self.n_jobs = n_jobs def _gudhi_diagram(self, X): - Xdgms = pairwise_distances(X, metric=self.metric) + Xdgm = pairwise_distances(X, metric=self.metric) sparse_rips_complex = SparseRipsComplex( - distance_matrix=Xdgms, max_edge_length=self.max_edge_length, - sparse=self.epsilon) + distance_matrix=Xdgm, max_edge_length=self.max_edge_length, + sparse=self.epsilon + ) simplex_tree = sparse_rips_complex.create_simplex_tree( - max_dimension=max(self._homology_dimensions) + 1) - Xdgms = simplex_tree.persistence( - homology_coeff_field=self.coeff, min_persistence=0) - - # Separate diagrams by homology dimensions - Xdgms = {dim: np.array([Xdgms[i][1] for i in range(len(Xdgms)) - if Xdgms[i][0] == dim]).reshape((-1, 2)) - for dim in self.homology_dimensions} - - if 0 in self._homology_dimensions: - Xdgms[0] = Xdgms[0][1:, :] # Remove one infinite bar - - # Add dimension as the third elements of each (b, d) tuple - Xdgms = {dim: np.hstack([Xdgms[dim], - dim * np.ones((Xdgms[dim].shape[0], 1), - dtype=Xdgms[dim].dtype)]) - for dim in self._homology_dimensions} - return Xdgms + max_dimension=max(self._homology_dimensions) + 1 + ) + Xdgm = simplex_tree.persistence( + homology_coeff_field=self.coeff, min_persistence=0 + ) + + return Xdgm def fit(self, X, y=None): """Calculate :attr:`infinity_values_`. Then, return the estimator. 
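Editorial sketch (not part of the patch): as a quick illustration of the `epsilon` approximation described above, setting ``epsilon=0.`` should reproduce the exact Vietoris–Rips diagrams, as also exercised by ``test_srp_transform`` further below; the point cloud here is made up.

import numpy as np
from gtda.homology import SparseRipsPersistence, VietorisRipsPersistence

# A single sample of 20 random points in the plane
point_cloud = np.random.default_rng(0).random((1, 20, 2))

exact = VietorisRipsPersistence(homology_dimensions=(0, 1)).fit_transform(point_cloud)
approx = SparseRipsPersistence(homology_dimensions=(0, 1), epsilon=0.).fit_transform(point_cloud)
# With epsilon=0. the two collections of diagrams should agree up to the
# ordering of triples within each diagram.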
@@ -403,15 +460,17 @@ def fit(self, X, y=None): Parameters ---------- - X : ndarray or list - Input data representing a collection of point clouds or of distance - matrices. Can be either a 3D ndarray whose zeroth dimension has + X : ndarray or list of length n_samples + Input data representing a collection of point clouds if `metric` + was not set to ``'precomputed'``, and of distance matrices + otherwise. Can be either a 3D ndarray whose zeroth dimension has size ``n_samples``, or a list containing ``n_samples`` 2D ndarrays. - If ``metric == 'precomputed'``, elements of `X` must be square - arrays representing distance matrices; otherwise, their rows are - interpreted as vectors in Euclidean space and, when `X` is a list, - warnings are issued when the number of columns (dimension of the - Euclidean space) differs among samples. + Point cloud arrays have shape ``(n_points, n_dimensions)``, and if + `X` is a list these shapes can vary between point clouds. If + `metric` was set to ``'precomputed'``, each entry of `X` should be + compatible with a filtration, i.e. the value at index (i, j) should + be no smaller than the values at diagonal indices (i, i) and + (j, j). y : None There is no need for a target in a transformer, yet the pipeline @@ -425,7 +484,8 @@ def fit(self, X, y=None): validate_params( self.get_params(), self._hyperparameters, exclude=['n_jobs']) self._is_precomputed = self.metric == 'precomputed' - check_point_clouds(X, distance_matrices=self._is_precomputed) + check_point_clouds(X, accept_sparse=True, + distance_matrices=self._is_precomputed) if self.infinity_values is None: self.infinity_values_ = self.max_edge_length @@ -450,15 +510,17 @@ def transform(self, X, y=None): Parameters ---------- - X : ndarray or list - Input data representing a collection of point clouds or of distance - matrices. Can be either a 3D ndarray whose zeroth dimension has + X : ndarray or list of length n_samples + Input data representing a collection of point clouds if `metric` + was not set to ``'precomputed'``, and of distance matrices + otherwise. Can be either a 3D ndarray whose zeroth dimension has size ``n_samples``, or a list containing ``n_samples`` 2D ndarrays. - If ``metric == 'precomputed'``, elements of `X` must be square - arrays representing distance matrices; otherwise, their rows are - interpreted as vectors in Euclidean space and, when `X` is a list, - warnings are issued when the number of columns (dimension of the - Euclidean space) differs among samples. + Point cloud arrays have shape ``(n_points, n_dimensions)``, and if + `X` is a list these shapes can vary between point clouds. If + `metric` was set to ``'precomputed'``, each entry of `X` should be + compatible with a filtration, i.e. the value at index (i, j) should + be no smaller than the values at diagonal indices (i, i) and + (j, j). 
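Editorial sketch (not part of the patch): ordinary distance matrices always satisfy the filtration-compatibility condition stated above, since their diagonal is zero; the assertion below simply makes the condition explicit before the matrix is passed with ``metric='precomputed'``.

import numpy as np
from scipy.spatial.distance import pdist, squareform
from gtda.homology import SparseRipsPersistence

points = np.array([[0., 0.], [1., 0.], [0., 1.], [1., 1.]])
dm = squareform(pdist(points))  # symmetric, zero diagonal

# Compatibility with a filtration: dm[i, j] >= max(dm[i, i], dm[j, j])
i, j = np.indices(dm.shape)
assert np.all(dm >= np.maximum(dm[i, i], dm[j, j]))

diagrams = SparseRipsPersistence(metric='precomputed').fit_transform([dm])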
y : None There is no need for a target in a transformer, yet the pipeline @@ -475,23 +537,277 @@ def transform(self, X, y=None): """ check_is_fitted(self) - X = check_point_clouds(X, distance_matrices=self._is_precomputed) + X = check_point_clouds(X, accept_sparse=True, + distance_matrices=self._is_precomputed) Xt = Parallel(n_jobs=self.n_jobs)( delayed(self._gudhi_diagram)(x) for x in X) - Xt = _postprocess_diagrams(Xt, self._homology_dimensions, - self.infinity_values_, self.n_jobs) + Xt = _postprocess_diagrams( + Xt, "gudhi", self._homology_dimensions, self.infinity_values_, + self.reduced_homology + ) + return Xt + + @staticmethod + def plot(Xt, sample=0, homology_dimensions=None, plotly_params=None): + """Plot a sample from a collection of persistence diagrams, with + homology in multiple dimensions. + + Parameters + ---------- + Xt : ndarray of shape (n_samples, n_features, 3) + Collection of persistence diagrams, such as returned by + :meth:`transform`. + + sample : int, optional, default: ``0`` + Index of the sample in `Xt` to be plotted. + + homology_dimensions : list, tuple or None, optional, default: ``None`` + Which homology dimensions to include in the plot. ``None`` means + plotting all dimensions present in ``Xt[sample]``. + + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"traces"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. + + """ + return plot_diagram( + Xt[sample], homology_dimensions=homology_dimensions, + plotly_params=plotly_params + ) + + +@adapt_fit_transform_docs +class WeakAlphaPersistence(BaseEstimator, TransformerMixin, PlotterMixin): + """:ref:`Persistence diagrams ` resulting from + :ref:`weak alpha filtrations `. + + Given a :ref:`point cloud ` in + Euclidean space, information about the appearance and disappearance of + topological features (technically, :ref:`homology classes + `) of various dimensions and at different scales + is summarised in the corresponding persistence diagram. + + The weak alpha filtration of a point cloud is defined to be the + :ref:`Vietoris–Rips filtration + ` of the sparse matrix + of Euclidean distances between neighbouring vertices in the Delaunay + triangulation of the point cloud. In low dimensions, computing the + persistent homology of this filtration can be much faster than computing + Vietoris–Rips persistent homology via :class:`VietorisRipsPersistence`. + + **Important note**: + + - Persistence diagrams produced by this class must be interpreted with + care due to the presence of padding triples which carry no + information. See :meth:`transform` for additional information. + + Parameters + ---------- + homology_dimensions : list or tuple, optional, default: ``(0, 1)`` + Dimensions (non-negative integers) of the topological features to be + detected. + + coeff : int prime, optional, default: ``2`` + Compute homology with coefficients in the prime field + :math:`\\mathbb{F}_p = \\{ 0, \\ldots, p - 1 \\}` where :math:`p` + equals `coeff`. + + max_edge_length : float, optional, default: ``numpy.inf`` + Maximum value of the Vietoris–Rips filtration parameter. 
Points whose + distance is greater than this value will never be connected by an edge, + and topological features at scales larger than this value will not be + detected. + + infinity_values : float or None, default: ``None`` + Which death value to assign to features which are still alive at + filtration value `max_edge_length`. ``None`` means that this death + value is declared to be equal to `max_edge_length`. + + reduced_homology : bool, optional, default: ``True`` + If ``True``, the earliest-born triple in homology dimension 0 which has + infinite death is discarded from each diagram computed in + :meth:`transform`. + + n_jobs : int or None, optional, default: ``None`` + The number of jobs to use for the computation. ``None`` means 1 unless + in a :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. + + Attributes + ---------- + infinity_values_ : float + Effective death value to assign to features which are still alive at + filtration value `max_edge_length`. + + See also + -------- + VietorisRipsPersistence, FlagserPersistence, SparseRipsPersistence, \ + EuclideanCechPersistence + + Notes + ----- + Delaunay triangulation are computed by :class:`scipy.spatial.Delaunay`. + `Ripser `_ [1]_ is used as a C++ backend + for computing Vietoris–Rips persistent homology. Python bindings were + modified for performance from the `ripser.py + `_ package. + + References + ---------- + .. [1] U. Bauer, "Ripser: efficient computation of Vietoris–Rips + persistence barcodes", 2019; `arXiv:1908.02518 + `_. + + """ + + _hyperparameters = { + 'homology_dimensions': { + 'type': (list, tuple), + 'of': {'type': int, 'in': Interval(0, np.inf, closed='left')} + }, + 'coeff': {'type': int, 'in': Interval(2, np.inf, closed='left')}, + 'max_edge_length': {'type': Real}, + 'infinity_values': {'type': (Real, type(None))}, + 'reduced_homology': {'type': bool} + } + + def __init__(self, homology_dimensions=(0, 1), coeff=2, + max_edge_length=np.inf, infinity_values=None, + reduced_homology=True, n_jobs=None): + self.homology_dimensions = homology_dimensions + self.coeff = coeff + self.max_edge_length = max_edge_length + self.infinity_values = infinity_values + self.reduced_homology = reduced_homology + self.n_jobs = n_jobs + + def _weak_alpha_diagram(self, X): + # `indices` will serve as the array of column indices + indptr, indices = Delaunay(X).vertex_neighbor_vertices + + # Compute the array of row indices + row = np.zeros_like(indices) + row[indptr[1:-1]] = 1 + np.cumsum(row, out=row) + + # We only need the upper diagonal + mask = indices > row + row, col = row[mask], indices[mask] + dists = np.linalg.norm(X[row] - X[col], axis=1) + # Note: passing the shape explicitly should not be needed in more + # recent versions of C++ ripser + n_points = len(X) + dm = coo_matrix((dists, (row, col)), shape=(n_points, n_points)) + + Xdgms = ripser(dm, maxdim=self._max_homology_dimension, + thresh=self.max_edge_length, coeff=self.coeff, + metric='precomputed')['dgms'] + + return Xdgms + + def fit(self, X, y=None): + """Calculate :attr:`infinity_values_`. Then, return the estimator. + + This method is here to implement the usual scikit-learn API and hence + work in pipelines. + + Parameters + ---------- + X : ndarray or list of length n_samples + Input data representing a collection of point clouds. Can be either + a 3D ndarray whose zeroth dimension has size ``n_samples``, or a + list containing ``n_samples`` 2D ndarrays. 
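Editorial sketch (not part of the patch): the sparse matrix of Euclidean distances between Delaunay neighbours described above can be assembled with SciPy alone, mirroring the ``_weak_alpha_diagram`` helper; the point cloud is made up.

import numpy as np
from scipy.sparse import coo_matrix
from scipy.spatial import Delaunay

points = np.random.default_rng(42).random((30, 2))  # a single 2D point cloud

indptr, indices = Delaunay(points).vertex_neighbor_vertices
row = np.zeros_like(indices)
row[indptr[1:-1]] = 1
np.cumsum(row, out=row)          # row[k] is the vertex whose neighbour is indices[k]

mask = indices > row             # keep the upper triangle only
row, col = row[mask], indices[mask]
dists = np.linalg.norm(points[row] - points[col], axis=1)
dm = coo_matrix((dists, (row, col)), shape=(len(points), len(points)))
# dm can then be fed to a Vietoris–Rips backend with metric='precomputed'.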
Point cloud arrays have + shape ``(n_points, n_dimensions)``, and if `X` is a list these + shapes can vary between point clouds. + + y : None + There is no need for a target in a transformer, yet the pipeline + API requires this parameter. + + Returns + ------- + self : object + + """ + validate_params( + self.get_params(), self._hyperparameters, exclude=['n_jobs']) + check_point_clouds(X) + + if self.infinity_values is None: + self.infinity_values_ = self.max_edge_length + else: + self.infinity_values_ = self.infinity_values + + self._homology_dimensions = sorted(self.homology_dimensions) + self._max_homology_dimension = self._homology_dimensions[-1] + + return self + + def transform(self, X, y=None): + """For each point cloud in `X`, compute the relevant persistence + diagram as an array of triples [b, d, q]. Each triple represents a + persistent topological feature in dimension q (belonging to + `homology_dimensions`) which is born at b and dies at d. Only triples + in which b < d are meaningful. Triples in which b and d are equal + ("diagonal elements") may be artificially introduced during the + computation for padding purposes, since the number of non-trivial + persistent topological features is typically not constant across + samples. They carry no information and hence should be effectively + ignored by any further computation. + + Parameters + ---------- + X : ndarray or list of length n_samples + Input data representing a collection of point clouds. Can be either + a 3D ndarray whose zeroth dimension has size ``n_samples``, or a + list containing ``n_samples`` 2D ndarrays. Point cloud arrays have + shape ``(n_points, n_dimensions)``, and if `X` is a list these + shapes can vary between point clouds. + + y : None + There is no need for a target in a transformer, yet the pipeline + API requires this parameter. + + Returns + ------- + Xt : ndarray of shape (n_samples, n_features, 3) + Array of persistence diagrams computed from the feature arrays or + distance matrices in `X`. ``n_features`` equals + :math:`\\sum_q n_q`, where :math:`n_q` is the maximum number of + topological features in dimension :math:`q` across all samples in + `X`. + + """ + check_is_fitted(self) + X = check_point_clouds(X) + + Xt = Parallel(n_jobs=self.n_jobs)( + delayed(self._weak_alpha_diagram)(x) for x in X) + + Xt = _postprocess_diagrams( + Xt, "ripser", self._homology_dimensions, self.infinity_values_, + self.reduced_homology + ) return Xt @staticmethod - def plot(Xt, sample=0, homology_dimensions=None): + def plot(Xt, sample=0, homology_dimensions=None, plotly_params=None): """Plot a sample from a collection of persistence diagrams, with homology in multiple dimensions. Parameters ---------- - Xt : ndarray of shape (n_samples, n_points, 3) + Xt : ndarray of shape (n_samples, n_features, 3) Collection of persistence diagrams, such as returned by :meth:`transform`. @@ -502,22 +818,41 @@ def plot(Xt, sample=0, homology_dimensions=None): Which homology dimensions to include in the plot. ``None`` means plotting all dimensions present in ``Xt[sample]``. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"traces"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. 
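Editorial sketch (not part of the patch): typical use of ``WeakAlphaPersistence`` on a small collection of 2D point clouds; the sample data is made up.

import numpy as np
from gtda.homology import WeakAlphaPersistence

rng = np.random.default_rng(0)
theta = rng.uniform(0., 2 * np.pi, size=(2, 50))
X = np.stack([np.cos(theta), np.sin(theta)], axis=-1)  # two circles, shape (2, 50, 2)
X += 0.05 * rng.normal(size=X.shape)                   # add a little noise

wap = WeakAlphaPersistence(homology_dimensions=(0, 1))
diagrams = wap.fit_transform(X)   # shape (2, n_features, 3)
fig = wap.plot(diagrams, sample=0)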
+ """ return plot_diagram( - Xt[sample], homology_dimensions=homology_dimensions) + Xt[sample], homology_dimensions=homology_dimensions, + plotly_params=plotly_params + ) @adapt_fit_transform_docs class EuclideanCechPersistence(BaseEstimator, TransformerMixin, PlotterMixin): """:ref:`Persistence diagrams ` resulting from - `Cech filtrations `_. + `Cech filtrations `_. - Given a :ref:`point cloud ` in + Given a :ref:`point cloud ` in Euclidean space, information about the appearance and disappearance of - topological features (technically, - :ref:`homology classes `) of various dimensions - and at different scales is summarised in the corresponding persistence - diagram. + topological features (technically, :ref:`homology classes + `) of various dimensions and at different scales + is summarised in the corresponding persistence diagram. + + **Important note**: + + - Persistence diagrams produced by this class must be interpreted with + care due to the presence of padding triples which carry no + information. See :meth:`transform` for additional information. Parameters ---------- @@ -527,20 +862,22 @@ class EuclideanCechPersistence(BaseEstimator, TransformerMixin, PlotterMixin): coeff : int prime, optional, default: ``2`` Compute homology with coefficients in the prime field - :math:`\\mathbb{F}_p = \\{ 0, \\ldots, p - 1 \\}` where - :math:`p` equals `coeff`. + :math:`\\mathbb{F}_p = \\{ 0, \\ldots, p - 1 \\}` where :math:`p` + equals `coeff`. max_edge_length : float, optional, default: ``numpy.inf`` - Upper bound on the maximum value of the Vietoris–Rips filtration - parameter. Points whose distance is greater than this value will - never be connected by an edge, and topological features at scales - larger than this value will not be detected. + Maximum value of the Cech filtration parameter. Topological features at + scales larger than this value will not be detected. infinity_values : float or None, default: ``None`` Which death value to assign to features which are still alive at filtration value `max_edge_length`. ``None`` means that this death value is declared to be equal to `max_edge_length`. + reduced_homology : bool, optional, default: ``True`` + If ``True``, the earliest-born triple in homology dimension 0 which has + infinite death is discarded in :meth:`transform`. + n_jobs : int or None, optional, default: ``None`` The number of jobs to use for the computation. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all @@ -554,68 +891,55 @@ class EuclideanCechPersistence(BaseEstimator, TransformerMixin, PlotterMixin): See also -------- - VietorisRipsPersistence, SparseRipsPersistence, CubicalPersistence, - ConsistentRescaling + VietorisRipsPersistence, FlagserPersistence, SparseRipsPersistence, + WeakAlphaPersistence Notes ----- `GUDHI `_ is used as a C++ backend - for computing Cech persistent homology. Python bindings were modified + for computing Cech persistent homology [1]_. Python bindings were modified for performance. - Persistence diagrams produced by this class must be interpreted with - care due to the presence of padding triples which carry no information. - See :meth:`transform` for additional information. - References ---------- - [1] C. Maria, "Persistent Cohomology", 2020; `GUDHI User and Reference \ - Manual `_. + .. [1] C. Maria, "Persistent Cohomology", 2020; `GUDHI User and Reference + Manual `_. 
""" _hyperparameters = { 'homology_dimensions': { - 'type': (list, tuple), 'of': { - 'type': int, 'in': Interval(0, np.inf, closed='left')}}, + 'type': (list, tuple), + 'of': {'type': int, 'in': Interval(0, np.inf, closed='left')} + }, 'coeff': {'type': int, 'in': Interval(2, np.inf, closed='left')}, - 'max_edge_length': { - 'type': Real, 'in': Interval(0, np.inf, closed='right')}, - 'infinity_values': { - 'type': (Real, type(None)), - 'in': Interval(0, np.inf, closed='neither')}, - } + 'max_edge_length': {'type': Real, + 'in': Interval(0, np.inf, closed='right')}, + 'infinity_values': {'type': (Real, type(None)), + 'in': Interval(0, np.inf, closed='neither')}, + 'reduced_homology': {'type': bool} + } def __init__(self, homology_dimensions=(0, 1), coeff=2, - max_edge_length=np.inf, infinity_values=None, n_jobs=None): + max_edge_length=np.inf, infinity_values=None, + reduced_homology=True, n_jobs=None): self.homology_dimensions = homology_dimensions self.coeff = coeff self.max_edge_length = max_edge_length self.infinity_values = infinity_values + self.reduced_homology = reduced_homology self.n_jobs = n_jobs def _gudhi_diagram(self, X): cech_complex = CechComplex(points=X, max_radius=self.max_edge_length) simplex_tree = cech_complex.create_simplex_tree( - max_dimension=max(self._homology_dimensions) + 1) - Xdgms = simplex_tree.persistence( - homology_coeff_field=self.coeff, min_persistence=0) - - # Separate diagrams by homology dimensions - Xdgms = {dim: np.array([Xdgms[i][1] for i in range(len(Xdgms)) - if Xdgms[i][0] == dim]).reshape((-1, 2)) - for dim in self.homology_dimensions} - - if 0 in self._homology_dimensions: - Xdgms[0] = Xdgms[0][1:, :] # Remove one infinite bar - - # Add dimension as the third elements of each (b, d) tuple - Xdgms = {dim: np.hstack([Xdgms[dim], - dim * np.ones((Xdgms[dim].shape[0], 1), - dtype=Xdgms[dim].dtype)]) - for dim in self._homology_dimensions} - return Xdgms + max_dimension=max(self._homology_dimensions) + 1 + ) + Xdgm = simplex_tree.persistence(homology_coeff_field=self.coeff, + min_persistence=0) + + return Xdgm def fit(self, X, y=None): """Calculate :attr:`infinity_values_`. Then, return the estimator. @@ -625,13 +949,12 @@ def fit(self, X, y=None): Parameters ---------- - X : ndarray or list - Input data representing a collection of point clouds. Can be - either a 3D ndarray whose zeroth dimension has size ``n_samples``, - or a list containing ``n_samples`` 2D ndarrays. The rows of - elements in `X` are interpreted as vectors in Euclidean space and. - and, when `X` is a list, warnings are issued when the number of - columns (dimension of the Euclidean space) differs among samples. + X : ndarray or list of length n_samples + Input data representing a collection of point clouds. Can be either + a 3D ndarray whose zeroth dimension has size ``n_samples``, or a + list containing ``n_samples`` 2D ndarrays. Point cloud arrays have + shape ``(n_points, n_dimensions)``, and if `X` is a list these + shapes can vary between point clouds. y : None There is no need for a target in a transformer, yet the pipeline @@ -653,6 +976,7 @@ def fit(self, X, y=None): self._homology_dimensions = sorted(self.homology_dimensions) self._max_homology_dimension = self._homology_dimensions[-1] + return self def transform(self, X, y=None): @@ -669,13 +993,12 @@ def transform(self, X, y=None): Parameters ---------- - X : ndarray of shape (n_samples, n_points, n_dimensions) - Input data representing a collection of point clouds. 
Can be - either a 3D ndarray whose zeroth dimension has size ``n_samples``, - or a list containing ``n_samples`` 2D ndarrays. The rows of - elements in `X` are interpreted as vectors in Euclidean space and. - and, when `X` is a list, warnings are issued when the number of - columns (dimension of the Euclidean space) differs among samples. + X : ndarray or list of length n_samples + Input data representing a collection of point clouds. Can be either + a 3D ndarray whose zeroth dimension has size ``n_samples``, or a + list containing ``n_samples`` 2D ndarrays. Point cloud arrays have + shape ``(n_points, n_dimensions)``, and if `X` is a list these + shapes can vary between point clouds. y : None There is no need for a target in a transformer, yet the pipeline @@ -693,21 +1016,312 @@ def transform(self, X, y=None): check_is_fitted(self) X = check_point_clouds(X) + Xt = Parallel(n_jobs=self.n_jobs)(delayed(self._gudhi_diagram)(x) + for x in X) + + Xt = _postprocess_diagrams( + Xt, "gudhi", self._homology_dimensions, self.infinity_values_, + self.reduced_homology + ) + return Xt + + @staticmethod + def plot(Xt, sample=0, homology_dimensions=None, plotly_params=None): + """Plot a sample from a collection of persistence diagrams, with + homology in multiple dimensions. + + Parameters + ---------- + Xt : ndarray of shape (n_samples, n_features, 3) + Collection of persistence diagrams, such as returned by + :meth:`transform`. + + sample : int, optional, default: ``0`` + Index of the sample in `Xt` to be plotted. + + homology_dimensions : list, tuple or None, optional, default: ``None`` + Which homology dimensions to include in the plot. ``None`` means + plotting all dimensions present in ``Xt[sample]``. + + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"traces"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. + + """ + return plot_diagram( + Xt[sample], homology_dimensions=homology_dimensions, + plotly_params=plotly_params + ) + + +@adapt_fit_transform_docs +class FlagserPersistence(BaseEstimator, TransformerMixin, PlotterMixin): + """:ref:`Persistence diagrams ` resulting from + :ref:`filtrations ` of :ref:`directed or undirected flag + complexes ` [1]_. + + Given a weighted directed or undirected graph, information about the + appearance and disappearance of topological features (technically, + :ref:`homology classes `) of various dimension and + at different scales is summarised in the corresponding persistence diagram. + + **Important note**: + + - Persistence diagrams produced by this class must be interpreted with + care due to the presence of padding triples which carry no + information. See :meth:`transform` for additional information. + + Parameters + ---------- + homology_dimensions : list or tuple, optional, default: ``(0, 1)`` + Dimensions (non-negative integers) of the topological features to be + detected. + + directed : bool, optional, default: ``True`` + If ``True``, :meth:`transform` computes the persistence diagrams of the + filtered directed flag complexes arising from the input collection of + weighted directed graphs. 
If ``False``, :meth:`transform` computes the + persistence diagrams of the filtered undirected flag complexes obtained + by regarding all input weighted graphs as undirected, and: + + - if `max_edge_weight` is ``numpy.inf``, it is sufficient to pass a + collection of (dense or sparse) upper-triangular matrices; + - if `max_edge_weight` is finite, it is recommended to pass either a + collection of symmetric dense matrices, or a collection of sparse + upper-triangular matrices. + + filtration : string, optional, default: ``'max'`` + Algorithm determining the filtration values of higher order simplices + from the weights of the vertices and edges. Possible values are: + ['dimension', 'zero', 'max', 'max3', 'max_plus_one', 'product', 'sum', + 'pmean', 'pmoment', 'remove_edges', 'vertex_degree'] + + coeff : int prime, optional, default: ``2`` + Compute homology with coefficients in the prime field + :math:`\\mathbb{F}_p = \\{ 0, \\ldots, p - 1 \\}` where :math:`p` + equals `coeff`. + + max_edge_weight : float, optional, default: ``numpy.inf`` + Maximum edge weight to be considered in the filtration. All edge + weights greater than this value will be considered as absent from the + filtration and topological features at scales larger than this value + will not be detected. + + infinity_values : float or None, default: ``None`` + Which death value to assign to features which are still alive at + filtration value `max_edge_weight`. ``None`` means that this death + value is declared to be equal to `max_edge_weight`. + + reduced_homology : bool, optional, default: ``True`` + If ``True``, the earliest-born triple in homology dimension 0 which has + infinite death is discarded from each diagram computed in + :meth:`transform`. + + max_entries : int, optional, default: ``-1`` + Number controlling the degree of precision in the matrix reductions + performed by the the backend. Corresponds to the parameter + ``approximation`` in :func:`pyflagser.flagser_weighted` and + :func:`pyflagser.flagser_unweighted`. Increase for higher precision, + decrease for faster computation. A good value is often ``100000`` in + hard problems. A negative value computes highest possible precision. + + n_jobs : int or None, optional, default: ``None`` + The number of jobs to use for the computation. ``None`` means 1 unless + in a :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. + + Attributes + ---------- + infinity_values_ : float + Effective death value to assign to features which are still alive at + filtration value `max_edge_weight`. + + See also + -------- + VietorisRipsPersistence, SparseRipsPersistence, WeakAlphaPersistence, + EuclideanCechPersistence, ConsistentRescaling, ConsecutiveRescaling + + Notes + ----- + The `pyflagser `_ Python package + is used for binding `Flagser `_, a C++ + backend for computing the (persistent) homology of (filtered) directed + flag complexes. For more details, please refer to the `flagser \ + documentation `_. + + References + ---------- + .. [1] D. Luetgehetmann, D. Govc, J. P. Smith, and R. Levi, "Computing + persistent homology of directed flag complexes", *Algorithms*, + 13(1), 2020. 
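Editorial sketch (not part of the patch): persistence of a single small weighted directed graph; here, as in the test inputs further below, ``np.inf`` entries of a dense array mark absent edges.

import numpy as np
from gtda.homology import FlagserPersistence

# Diagonal entries are vertex weights, finite off-diagonal entries are edge
# weights, np.inf marks absent edges
adj = np.array([[[0., 0.5, np.inf],
                 [np.inf, 0., 0.7],
                 [0.9, np.inf, 0.]]])

fp = FlagserPersistence(directed=True, homology_dimensions=(0, 1))
diagrams = fp.fit_transform(adj)  # shape (1, n_features, 3)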
+ + """ + + _hyperparameters = { + 'homology_dimensions': { + 'type': (list, tuple), + 'of': {'type': int, 'in': Interval(0, np.inf, closed='left')} + }, + 'directed': {'type': bool}, + 'coeff': {'type': int, 'in': Interval(2, np.inf, closed='left')}, + 'max_edge_weight': {'type': Real}, + 'infinity_values': {'type': (Real, type(None))}, + 'reduced_homology': {'type': bool}, + 'max_entries': {'type': int} + } + + def __init__(self, homology_dimensions=(0, 1), directed=True, + filtration='max', coeff=2, max_edge_weight=np.inf, + infinity_values=None, reduced_homology=True, max_entries=-1, + n_jobs=None): + self.homology_dimensions = homology_dimensions + self.directed = directed + self.filtration = filtration + self.coeff = coeff + self.max_edge_weight = max_edge_weight + self.infinity_values = infinity_values + self.reduced_homology = reduced_homology + self.max_entries = max_entries + self.n_jobs = n_jobs + + def _flagser_diagram(self, X): + Xdgms = [np.empty((0, 2), dtype=float)] * self._min_homology_dimension + Xdgms += flagser_weighted(X, max_edge_weight=self.max_edge_weight, + min_dimension=self._min_homology_dimension, + max_dimension=self._max_homology_dimension, + directed=self.directed, + filtration=self.filtration, coeff=self.coeff, + approximation=self.max_entries)['dgms'] + n_missing_dims = self._max_homology_dimension + 1 - len(Xdgms) + if n_missing_dims: + Xdgms += [np.empty((0, 2), dtype=float)] * n_missing_dims + + return Xdgms + + def fit(self, X, y=None): + """Calculate :attr:`infinity_values_`. Then, return the estimator. + + This method is here to implement the usual scikit-learn API and hence + work in pipelines. + + Parameters + ---------- + X : ndarray or list of length n_samples + Input collection of adjacency matrices of weighted directed or + undirected graphs. Can be either a 3D ndarray whose zeroth + dimension has size ``n_samples``, or a list containing + ``n_samples`` 2D ndarrays/sparse matrices. In each adjacency + matrix, diagonal elements are vertex weights and off-diagonal + elements are edges weights. It is assumed that a vertex weight + cannot be larger than the weight of the edges it + forms. The way zero values are handled depends on the format of the + matrix. If the matrix is a dense ``numpy.ndarray``, zero values + denote zero-weighted edges. If the matrix is a sparse + ``scipy.sparse`` matrix, explicitly stored off-diagonal zeros and + all diagonal zeros denote zero-weighted edges. Off-diagonal values + that have not been explicitly stored are treated by + ``scipy.sparse`` as zeros but will be understood as + infinitely-valued edges, i.e., edges absent from the filtration. + + y : None + There is no need for a target in a transformer, yet the pipeline + API requires this parameter. + + Returns + ------- + self : object + + """ + check_point_clouds(X, accept_sparse=True, distance_matrices=True) + validate_params( + self.get_params(), self._hyperparameters, exclude=['n_jobs', + 'filtration']) + + if self.infinity_values is None: + self.infinity_values_ = self.max_edge_weight + else: + self.infinity_values_ = self.infinity_values + + self._homology_dimensions = sorted(self.homology_dimensions) + self._min_homology_dimension = self._homology_dimensions[0] + self._max_homology_dimension = self._homology_dimensions[-1] + + return self + + def transform(self, X, y=None): + """For each adjacency matrix in `X`, compute the relevant persistence + diagram as an array of triples [b, d, q]. 
Each triple represents a + persistent topological feature in dimension q (belonging to + `homology_dimensions`) which is born at b and dies at d. Only triples + in which b < d are meaningful. Triples in which b and d are equal + ("diagonal elements") may be artificially introduced during the + computation for padding purposes, since the number of non-trivial + persistent topological features is typically not constant across + samples. They carry no information and hence should be effectively + ignored by any further computation. + + Parameters + ---------- + X : ndarray or list of length n_samples + Input collection of adjacency matrices of weighted directed or + undirected graphs. Can be either a 3D ndarray whose zeroth + dimension has size ``n_samples``, or a list containing + ``n_samples`` 2D ndarrays/sparse matrices. In each adjacency + matrix, diagonal elements are vertex weights and off-diagonal + elements are edges weights. It is assumed that a vertex weight + cannot be larger than the weight of the edges it + forms. The way zero values are handled depends on the format of the + matrix. If the matrix is a dense ``numpy.ndarray``, zero values + denote zero-weighted edges. If the matrix is a sparse + ``scipy.sparse`` matrix, explicitly stored off-diagonal zeros and + all diagonal zeros denote zero-weighted edges. Off-diagonal values + that have not been explicitly stored are treated by + ``scipy.sparse`` as zeros but will be understood as + infinitely-valued edges, i.e., edges absent from the filtration. + + y : None + There is no need for a target in a transformer, yet the pipeline + API requires this parameter. + + Returns + ------- + Xt : ndarray of shape (n_samples, n_features, 3) + Array of persistence diagrams computed from the feature arrays or + distance matrices in `X`. ``n_features`` equals + :math:`\\sum_q n_q`, where :math:`n_q` is the maximum number of + topological features in dimension :math:`q` across all samples in + `X`. + + """ + check_is_fitted(self) + X = check_point_clouds(X, accept_sparse=True, distance_matrices=True) + Xt = Parallel(n_jobs=self.n_jobs)( - delayed(self._gudhi_diagram)(x) for x in X) + delayed(self._flagser_diagram)(x) for x in X) - Xt = _postprocess_diagrams(Xt, self._homology_dimensions, - self.infinity_values_, self.n_jobs) + Xt = _postprocess_diagrams( + Xt, "flagser", self._homology_dimensions, self.infinity_values_, + self.reduced_homology + ) return Xt @staticmethod - def plot(Xt, sample=0, homology_dimensions=None): + def plot(Xt, sample=0, homology_dimensions=None, plotly_params=None): """Plot a sample from a collection of persistence diagrams, with homology in multiple dimensions. Parameters ---------- - Xt : ndarray of shape (n_samples, n_points, 3) + Xt : ndarray of shape (n_samples, n_features, 3) Collection of persistence diagrams, such as returned by :meth:`transform`. @@ -718,6 +1332,20 @@ def plot(Xt, sample=0, homology_dimensions=None): Which homology dimensions to include in the plot. ``None`` means plotting all dimensions present in ``Xt[sample]``. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"traces"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. 
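Editorial sketch (not part of the patch): the ``plotly_params`` mechanism is the same across the ``plot`` methods in this module; the particular trace and layout keywords below are illustrative only.

import numpy as np
from gtda.homology import VietorisRipsPersistence

X = np.random.default_rng(1).random((1, 30, 2))
vrp = VietorisRipsPersistence()
diagrams = vrp.fit_transform(X)
fig = vrp.plot(
    diagrams, sample=0,
    plotly_params={"traces": {"marker_size": 8},
                   "layout": {"title": "Persistence diagram", "width": 600}}
)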
+ """ return plot_diagram( - Xt[sample], homology_dimensions=homology_dimensions) + Xt[sample], homology_dimensions=homology_dimensions, + plotly_params=plotly_params + ) diff --git a/gtda/homology/tests/test_cubical.py b/gtda/homology/tests/test_cubical.py index 85b9fcc3d..a87f2f107 100644 --- a/gtda/homology/tests/test_cubical.py +++ b/gtda/homology/tests/test_cubical.py @@ -16,15 +16,15 @@ [2.98935825, 2.79848711], [2.79848711, 2.41211849], [2.41211849, 1.92484888]]]) +X_list = list(X) -X_cp_res = np.array([[[1.9248489, 2.9893582, 0.], - [2., 2.79848711, 0], +X_cp_res = np.array([[[2., 2.79848711, 0], [0., 0., 1]]]) -X_cp_res_periodic = np.array([[[1.9248489, 2.9893582, 0.], - [2., 2.9893582, 1.], - [2.7984871, 2.9893582, 1.], - [2.7984871, 2.841471, 1.]]]) +X_cp_res_periodic = np.array([[[0., 0., 0.], + [2., 2.98935825, 1.], + [2.79848711, 2.98935825, 1.], + [2.79848711, 2.84147098, 1.]]]) def test_cp_not_fitted(): @@ -36,14 +36,15 @@ def test_cp_not_fitted(): @pytest.mark.parametrize('hom_dims', [None, (0,), (1,), (0, 1)]) def test_cp_fit_transform_plot(hom_dims): - CubicalPersistence().fit_transform_plot( - X, sample=0, homology_dimensions=hom_dims) + CubicalPersistence().fit_transform_plot(X, sample=0, + homology_dimensions=hom_dims) +@pytest.mark.parametrize("X", [X, X_list]) @pytest.mark.parametrize("periodic_dimensions, expected", [(None, X_cp_res), (np.array([False, False]), X_cp_res), (np.array([True, True]), X_cp_res_periodic)]) -def test_cp_transform(periodic_dimensions, expected): +def test_cp_transform(X, periodic_dimensions, expected): cp = CubicalPersistence(periodic_dimensions=periodic_dimensions) assert_almost_equal(cp.fit_transform(X), expected) diff --git a/gtda/homology/tests/test_simplicial.py b/gtda/homology/tests/test_simplicial.py index e93472e7b..76be686fa 100644 --- a/gtda/homology/tests/test_simplicial.py +++ b/gtda/homology/tests/test_simplicial.py @@ -5,16 +5,46 @@ import plotly.io as pio import pytest from numpy.testing import assert_almost_equal +from scipy.sparse import csr_matrix +from scipy.spatial.distance import pdist, squareform +from scipy.spatial.qhull import QhullError from sklearn.exceptions import NotFittedError from gtda.homology import VietorisRipsPersistence, SparseRipsPersistence, \ - EuclideanCechPersistence + WeakAlphaPersistence, EuclideanCechPersistence, FlagserPersistence pio.renderers.default = 'plotly_mimetype' -X = np.array([[[2., 2.47942554], [2.47942554, 2.84147098], - [2.98935825, 2.79848711], [2.79848711, 2.41211849], - [2.41211849, 1.92484888]]]) +X_pc = np.array([ + [[2., 2.47942554], + [2.47942554, 2.84147098], + [2.98935825, 2.79848711], + [2.79848711, 2.41211849], + [2.41211849, 1.92484888]] + ]) +X_pc_list = list(X_pc) + +X_dist = np.array([ + squareform(pdist(x)) for x in X_pc + ]) +X_dist_list = list(X_dist) + +X_pc_sparse = [csr_matrix(x) for x in X_pc] +X_dist_sparse = [csr_matrix(x) for x in X_dist] + +X_dist_disconnected = np.array([[[0, np.inf], [np.inf, 0]]]) + +# 8-point sampling of a noisy circle +X_circle = np.array([ + [[1.00399159, -0.00797583], + [0.70821787, 0.68571714], + [-0.73369765, -0.71298056], + [0.01110395, -1.03739883], + [-0.64968271, 0.7011624], + [0.03895963, 0.94494511], + [0.76291108, -0.68774373], + [-1.01932365, -0.05793851]] + ]) def test_vrp_params(): @@ -22,38 +52,75 @@ def test_vrp_params(): vrp = VietorisRipsPersistence(metric=metric) with pytest.raises(ValueError): - vrp.fit_transform(X) + vrp.fit_transform(X_pc) def test_vrp_not_fitted(): vrp = VietorisRipsPersistence() with 
pytest.raises(NotFittedError): - vrp.transform(X) - - -X_vrp_res = np.array([[[0., 0.43094373, 0], [0., 0.5117411, 0], - [0., 0.60077095, 0], [0., 0.62186205, 0], - [0.69093919, 0.80131882, 1]]]) - - -def test_vrp_transform(): - vrp = VietorisRipsPersistence() - - assert_almost_equal(vrp.fit_transform(X), X_vrp_res) - - -def test_vrp_list_of_arrays(): + vrp.transform(X_pc) + + +X_vrp_exp = np.array([ + [[0., 0.43094373, 0.], + [0., 0.5117411, 0.], + [0., 0.60077095, 0.], + [0., 0.62186205, 0.], + [0.69093919, 0.80131882, 1.]] + ]) + + +@pytest.mark.parametrize('X, metric', [(X_pc, 'euclidean'), + (X_pc_list, 'euclidean'), + (X_pc_sparse, 'euclidean'), + (X_dist, 'precomputed'), + (X_dist_list, 'precomputed'), + (X_dist_sparse, 'precomputed')]) +@pytest.mark.parametrize('collapse_edges', [True, False]) +@pytest.mark.parametrize('max_edge_length', [np.inf, 0.8]) +@pytest.mark.parametrize('infinity_values', [10, 30]) +def test_vrp_transform(X, metric, collapse_edges, max_edge_length, + infinity_values): + vrp = VietorisRipsPersistence(metric=metric, + collapse_edges=collapse_edges, + max_edge_length=max_edge_length, + infinity_values=infinity_values) + # This is not generally true, it is only a way to obtain the res array + # in this specific case + X_exp = X_vrp_exp.copy() + X_exp[:, :, :2][X_exp[:, :, :2] >= max_edge_length] = infinity_values + assert_almost_equal(vrp.fit_transform(X), X_exp) + + +def test_vrp_list_of_arrays_different_size(): X_2 = np.array([[0., 1.], [1., 2.]]) - X_list = [X[0].copy(), X_2] vrp = VietorisRipsPersistence() - vrp.fit(X_list) - - + assert_almost_equal(vrp.fit_transform([X_pc[0], X_2])[0], X_vrp_exp[0]) + + +@pytest.mark.parametrize('X, metric', [(X_pc, 'euclidean'), + (X_pc_list, 'euclidean'), + (X_pc_sparse, 'euclidean'), + (X_dist, 'precomputed'), + (X_dist_list, 'precomputed'), + (X_dist_sparse, 'precomputed')]) +def test_vrp_low_infinity_values(X, metric): + vrp = VietorisRipsPersistence(max_edge_length=0.001, + metric=metric, + infinity_values=-1) + assert_almost_equal(vrp.fit_transform(X)[:, :, :2], + np.zeros((1, 2, 2))) + + +@pytest.mark.parametrize('X, metric', [(X_pc, 'euclidean'), + (X_pc_list, 'euclidean'), + (X_dist_disconnected, 'precomputed')]) @pytest.mark.parametrize('hom_dims', [None, (0,), (1,), (0, 1)]) -def test_vrp_fit_transform_plot(hom_dims): - VietorisRipsPersistence().fit_transform_plot( - X, sample=0, homology_dimensions=hom_dims) +def test_vrp_fit_transform_plot(X, metric, hom_dims): + VietorisRipsPersistence(metric=metric).fit_transform_plot( + X, sample=0, homology_dimensions=hom_dims + ) def test_srp_params(): @@ -61,35 +128,124 @@ def test_srp_params(): vrp = SparseRipsPersistence(metric=metric) with pytest.raises(ValueError): - vrp.fit_transform(X) + vrp.fit_transform(X_pc) def test_srp_not_fitted(): srp = SparseRipsPersistence() with pytest.raises(NotFittedError): - srp.transform(X) + srp.transform(X_pc) + + +X_srp_exp = np.array([ + [[0., 0.43094373, 0.], + [0., 0.5117411, 0.], + [0., 0.60077095, 0.], + [0., 0.62186205, 0.], + [0.69093919, 0.80131882, 1.]] + ]) + + +@pytest.mark.parametrize('X, metric', [(X_pc, 'euclidean'), + (X_pc_list, 'euclidean'), + (X_pc_sparse, 'euclidean'), + (X_dist, 'precomputed'), + (X_dist_list, 'precomputed')]) +@pytest.mark.parametrize("epsilon, diagrams", + [(0.0, X_vrp_exp), (1.0, X_srp_exp)]) +def test_srp_transform(X, metric, epsilon, diagrams): + srp = SparseRipsPersistence(metric=metric, epsilon=epsilon) + + assert_almost_equal(np.sort(srp.fit_transform(X), axis=1), + np.sort(diagrams, 
axis=1)) -X_srp_res_2 = np.array([[[0., 0.43094373, 0], [0., 0.5117411, 0], - [0., 0.60077095, 0], [0., 0.62186205, 0], - [0.69093919, 0.80131882, 1]]]) +@pytest.mark.parametrize('X', [X_pc, X_pc_list]) +@pytest.mark.parametrize('hom_dims', [None, (0,), (1,), (0, 1)]) +def test_srp_fit_transform_plot(X, hom_dims): + SparseRipsPersistence().fit_transform_plot(X, sample=0, + homology_dimensions=hom_dims) + +def test_wap_params(): + coeff = 'not_defined' + wap = WeakAlphaPersistence(coeff=coeff) -@pytest.mark.parametrize("epsilon, point_clouds, expected", - [(0.0, X, X_vrp_res), - (1.0, X, X_srp_res_2)]) -def test_srp_transform(epsilon, point_clouds, expected): - srp = SparseRipsPersistence(epsilon=epsilon) + with pytest.raises(TypeError): + wap.fit_transform(X_pc) - assert_almost_equal(np.sort(srp.fit_transform(point_clouds), axis=1), - np.sort(expected, axis=1)) +def test_wap_not_fitted(): + wap = WeakAlphaPersistence() + with pytest.raises(NotFittedError): + wap.transform(X_pc) + + +# On this particular X_pc, WeakAlpha and VietorisRips should give the exact +# same result +X_wap_exp = X_vrp_exp + + +@pytest.mark.parametrize('X', [X_pc, X_pc_list]) +@pytest.mark.parametrize('max_edge_length', [np.inf, 0.8]) +@pytest.mark.parametrize('infinity_values', [10, 30]) +def test_wap_transform(X, max_edge_length, infinity_values): + wap = WeakAlphaPersistence(max_edge_length=max_edge_length, + infinity_values=infinity_values) + # This is not generally true, it is only a way to obtain the res array + # in this specific case + X_exp = X_wap_exp.copy() + X_exp[:, :, :2][X_exp[:, :, :2] >= max_edge_length] = infinity_values + assert_almost_equal(wap.fit_transform(X), X_exp) + + +@pytest.mark.parametrize("transformer_cls", [VietorisRipsPersistence, + WeakAlphaPersistence]) +def test_vrp_wap_transform_circle(transformer_cls): + """Test that, on a sampled noisy circle, both VietorisRipsPersistence and + WeakAlphaPersistence lead to reasonable barcodes""" + transformer = transformer_cls() + X_res = transformer.fit_transform(X_circle) + subdiagram_0 = X_res[X_res[:, :, 2] == 0] + subdiagram_1 = X_res[X_res[:, :, 2] == 1] + length_reg_pol = 2 * np.sin(np.pi / X_circle.shape[1]) + last_conn_comp_param = np.max(subdiagram_0[:, 1]) + assert last_conn_comp_param < length_reg_pol + 0.1 + assert len(subdiagram_1) == 1 + assert subdiagram_1[0, 0] > last_conn_comp_param + assert subdiagram_1[0, 1] > np.sqrt(3) + + +def test_wap_qhullerror(): + """"Test that SciPy raises a QhullError when there are too few points (at + least 4 are needed)""" + X_pc_2 = np.array([[[0., 1.], [1., 2.], [2., 3.]]]) + wap = WeakAlphaPersistence() + with pytest.raises(QhullError): + wap.fit_transform(X_pc_2) + + +def test_wap_list_of_arrays_different_size(): + X = [X_pc[0], X_pc[0][:-1]] + wap = WeakAlphaPersistence() + assert_almost_equal(wap.fit_transform(X)[0], X_wap_exp[0]) + + +@pytest.mark.parametrize('X', [X_pc, X_pc_list]) +def test_wap_low_infinity_values(X): + wap = WeakAlphaPersistence(max_edge_length=0.001, infinity_values=-1) + assert_almost_equal(wap.fit_transform(X)[:, :, :2], + np.zeros((1, 2, 2))) + + +@pytest.mark.parametrize('X', [X_pc, X_pc_list]) @pytest.mark.parametrize('hom_dims', [None, (0,), (1,), (0, 1)]) -def test_srp_fit_transform_plot(hom_dims): - SparseRipsPersistence().fit_transform_plot( - X, sample=0, homology_dimensions=hom_dims) +def test_wap_fit_transform_plot(X, hom_dims): + WeakAlphaPersistence().fit_transform_plot( + X, sample=0, homology_dimensions=hom_dims + ) def test_cp_params(): @@ -97,31 +253,127 
@@ def test_cp_params(): cp = EuclideanCechPersistence(coeff=coeff) with pytest.raises(TypeError): - cp.fit_transform(X) + cp.fit_transform(X_pc) def test_cp_not_fitted(): cp = EuclideanCechPersistence() with pytest.raises(NotFittedError): - cp.transform(X) + cp.transform(X_pc) -X_cp_res = np.array( - [[[0., 0.31093103, 0.], [0., 0.30038548, 0.], - [0., 0.25587055, 0.], [0., 0.21547186, 0.], - [0.34546959, 0.41473758, 1.], [0.51976681, 0.55287585, 1.], - [0.26746207, 0.28740871, 1.], [0.52355742, 0.52358794, 1.], - [0.40065941, 0.40067135, 1.], [0.45954496, 0.45954497, 1.]]]) +X_cp_exp = np.array([ + [[0., 0.31093103, 0.], + [0., 0.30038548, 0.], + [0., 0.25587055, 0.], + [0., 0.21547186, 0.], + [0.34546959, 0.41473758, 1.], + [0.51976681, 0.55287585, 1.], + [0.26746207, 0.28740871, 1.], + [0.52355742, 0.52358794, 1.], + [0.40065941, 0.40067135, 1.], + [0.45954496, 0.45954497, 1.]] + ]) -def test_cp_transform(): +@pytest.mark.parametrize('X', [X_pc, X_pc_list]) +def test_cp_transform(X): cp = EuclideanCechPersistence() - assert_almost_equal(cp.fit_transform(X), X_cp_res) + assert_almost_equal(cp.fit_transform(X), X_cp_exp) +@pytest.mark.parametrize('X', [X_pc, X_pc_list]) @pytest.mark.parametrize('hom_dims', [None, (0,), (1,), (0, 1)]) -def test_cp_fit_transform_plot(hom_dims): +def test_cp_fit_transform_plot(X, hom_dims): EuclideanCechPersistence().fit_transform_plot( - X, sample=0, homology_dimensions=hom_dims) + X, sample=0, homology_dimensions=hom_dims + ) + + +def test_fp_params(): + coeff = 'not_defined' + fp = FlagserPersistence(coeff=coeff) + + with pytest.raises(TypeError): + fp.fit_transform(X_dist) + + +def test_fp_not_fitted(): + fp = FlagserPersistence() + + with pytest.raises(NotFittedError): + fp.transform(X_dist) + + +X_dir_graph = X_dist.copy() +X_dir_graph[0, 0, :] = X_dir_graph[0, 0, :] / 2. 
+X_dir_graph[0][np.tril_indices(5, k=-1)] = np.inf + +X_dir_graph_list = [x for x in X_dir_graph] + +X_dir_graph_sparse = [csr_matrix(x) for x in X_dir_graph] + +X_fp_dir_exp = np.array([ + [[0., 0.30038548, 0.], + [0., 0.34546959, 0.], + [0., 0.40065941, 0.], + [0., 0.43094373, 0.], + [0.5117411, 0.51976681, 1.]] + ]) + + +@pytest.mark.parametrize('X', + [X_dir_graph, X_dir_graph_list, X_dir_graph_sparse]) +@pytest.mark.parametrize('max_edge_weight', [np.inf, 0.8]) +@pytest.mark.parametrize('infinity_values', [10, 30]) +def test_fp_transform_directed(X, max_edge_weight, infinity_values): + fp = FlagserPersistence(directed=True, max_edge_weight=max_edge_weight, + infinity_values=infinity_values) + # In the undirected case with "max" filtration, the results are always the + # same as the one of VietorisRipsPersistence + X_exp = X_fp_dir_exp.copy() + # This is not generally true, it is only a way to obtain the res array + # in this specific case + X_exp[:, :, :2][X_exp[:, :, :2] >= max_edge_weight] = infinity_values + assert_almost_equal(fp.fit_transform(X), X_exp) + + +@pytest.mark.parametrize('X', [X_dist, X_dist_list, X_dist_sparse]) +@pytest.mark.parametrize('max_edge_weight', [np.inf, 0.8, 0.6]) +@pytest.mark.parametrize('infinity_values', [10, 30]) +def test_fp_transform_undirected(X, max_edge_weight, infinity_values): + fp = FlagserPersistence(directed=False, max_edge_weight=max_edge_weight, + infinity_values=infinity_values) + # In the undirected case with "max" filtration, the results are always the + # same as the one of VietorisRipsPersistence + X_exp = X_vrp_exp.copy() + + # In that case, the subdiagram of dimension 1 is empty + if max_edge_weight == 0.6: + X_exp[0, -1, :] = [0., 0., 1.] + + # This is not generally true, it is only a way to obtain the res array + # in this specific case + X_exp[:, :, :2][X_exp[:, :, :2] >= max_edge_weight] = infinity_values + assert_almost_equal(fp.fit_transform(X), X_exp) + + +@pytest.mark.parametrize('delta', range(1, 4)) +def test_fp_transform_high_hom_dim(delta): + """Test that if the maximum homology dimension is greater than or equal to + the number of points, we do not produce errors.""" + n_points = 3 + X = X_dist[:, :n_points, :n_points] + fp = FlagserPersistence(homology_dimensions=list(range(n_points + delta))) + assert_almost_equal(fp.fit_transform(X)[0, -1], + np.array([0., 0., n_points + delta - 1], dtype=float)) + + +@pytest.mark.parametrize('X', [X_dist, X_dist_list, X_dist_disconnected]) +@pytest.mark.parametrize('hom_dims', [None, (0,), (1,), (0, 1)]) +def test_fp_fit_transform_plot(X, hom_dims): + FlagserPersistence(directed=False).fit_transform_plot( + X_dist, sample=0, homology_dimensions=hom_dims + ) diff --git a/gtda/images/__init__.py b/gtda/images/__init__.py index 0dd1def91..f4b700e91 100644 --- a/gtda/images/__init__.py +++ b/gtda/images/__init__.py @@ -5,7 +5,8 @@ from .preprocessing import Binarizer, Inverter, Padder, ImageToPointCloud from .filtrations import HeightFiltration, RadialFiltration, \ - DilationFiltration, ErosionFiltration, SignedDistanceFiltration + DilationFiltration, ErosionFiltration, SignedDistanceFiltration, \ + DensityFiltration __all__ = [ 'Binarizer', @@ -17,4 +18,5 @@ 'DilationFiltration', 'ErosionFiltration', 'SignedDistanceFiltration', -] + 'DensityFiltration' + ] diff --git a/gtda/images/filtrations.py b/gtda/images/filtrations.py index 624da3f4e..68b8e8652 100644 --- a/gtda/images/filtrations.py +++ b/gtda/images/filtrations.py @@ -1,9 +1,9 @@ """Filtrations of 2D/3D binary images.""" # 
License: GNU AGPLv3 -from numbers import Real +from numbers import Real, Integral from types import FunctionType -from warnings import warn +import itertools import numpy as np from joblib import Parallel, delayed, effective_n_jobs @@ -13,6 +13,7 @@ from sklearn.utils.validation import check_array, check_is_fitted from ._utils import _dilate, _erode +from .preprocessing import Padder from ..base import PlotterMixin from ..plotting import plot_heatmap from ..utils._docs import adapt_fit_transform_docs @@ -63,21 +64,22 @@ class HeightFiltration(BaseEstimator, TransformerMixin, PlotterMixin): See also -------- - gtda.homology.CubicalPersistence, Binarizer + RadialFiltration, DilationFiltration, ErosionFiltration, \ + SignedDistanceFiltration, DensityFiltration, \ + gtda.homology.CubicalPersistence References ---------- - [1] A. Garin and G. Tauzin, "A topological reading lesson: Classification - of MNIST using TDA"; 19th International IEEE Conference on Machine - Learning and Applications (ICMLA 2020), 2019; arXiv: `1910.08345 \ - `_. + .. [1] A. Garin and G. Tauzin, "A topological reading lesson: + Classification of MNIST using TDA"; 19th International IEEE + Conference on Machine Learning and Applications (ICMLA 2020), 2019; + `arXiv:1910.08345 `_. """ _hyperparameters = { - 'direction': { - 'type': (np.ndarray, type(None)), 'of': {'type': Real}} - } + 'direction': {'type': (np.ndarray, type(None)), 'of': {'type': Real}} + } def __init__(self, direction=None, n_jobs=None): self.direction = direction @@ -93,7 +95,7 @@ def _calculate_height(self, X): return Xh def fit(self, X, y=None): - """Calculate :attr:`direction_`, :attr:`n_dimensions_`, :attr:`mesh_` + """Calculate :attr:`n_dimensions_`, :attr:`direction_`, :attr:`mesh_` and :attr:`max_value_` from a collection of binary images. Then, return the estimator. @@ -117,9 +119,9 @@ def fit(self, X, y=None): """ X = check_array(X, allow_nd=True) self.n_dimensions_ = X.ndim - 1 - if (self.n_dimensions_ < 2) or (self.n_dimensions_ > 3): - warn(f"Input of `fit` contains arrays of dimension " - f"{self.n_dimensions_}.") + if self.n_dimensions_ > 3: + raise ValueError(f"Input of `fit` contains arrays of dimension " + f"{self.n_dimensions_}.") validate_params( self.get_params(), self._hyperparameters, exclude=['n_jobs']) @@ -163,8 +165,8 @@ def transform(self, X, y=None): Returns ------- - Xt : ndarray of shape (n_samples, n_pixels_x, - n_pixels_y [, n_pixels_z]) + Xt : ndarray of shape (n_samples, n_pixels_x, n_pixels_y \ + [, n_pixels_z]) Transformed collection of images. Each entry along axis 0 is a 2D or 3D greyscale image. @@ -180,7 +182,8 @@ def transform(self, X, y=None): return Xt @staticmethod - def plot(Xt, sample=0, colorscale='greys', origin='upper'): + def plot(Xt, sample=0, colorscale='greys', origin='upper', + plotly_params=None): """Plot a sample from a collection of 2D greyscale images. Parameters @@ -201,8 +204,24 @@ def plot(Xt, sample=0, colorscale='greys', origin='upper'): left corner. The convention ``'upper'`` is typically used for matrices and images. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"trace"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. 
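Editorial sketch (not part of the patch): a height filtration of a small collection of binary images, filtered along the direction [1, 0]; the image is made up.

import numpy as np
from gtda.images import HeightFiltration

images = np.zeros((1, 10, 10))
images[0, 2:8, 2:8] = 1.             # one binary image containing a filled square

hf = HeightFiltration(direction=np.array([1., 0.]))
filtered = hf.fit_transform(images)  # greyscale images of the same shape
fig = hf.plot(filtered, sample=0)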
+ """ - return plot_heatmap(Xt[sample], colorscale=colorscale, origin=origin) + return plot_heatmap( + Xt[sample], colorscale=colorscale, origin=origin, + title=f"Height filtration of image {sample}", + plotly_params=plotly_params + ) @adapt_fit_transform_docs @@ -245,7 +264,7 @@ class RadialFiltration(BaseEstimator, TransformerMixin, PlotterMixin): two arrays from the entry in `X` as input, and return a value indicating the distance between them. - metric_params : dict or None, optional, default: ``None`` + metric_params : dict or None, optional, default: ``{}`` Additional keyword arguments for the metric function. n_jobs : int or None, optional, default: ``None`` @@ -261,11 +280,6 @@ class RadialFiltration(BaseEstimator, TransformerMixin, PlotterMixin): center_ : ndarray of shape (:attr:`n_dimensions_`,) Effective center of the radial filtration. Set in :meth:`fit`. - effective_metric_params_ : dict - Dictionary containing all information present in - `metric_params`. If `metric_params` is ``None``, it is set to - the empty dictionary. - mesh_ : ndarray of shape ( n_pixels_x, n_pixels_y [, n_pixels_z]) greyscale image corresponding to the radial filtration of a binary image where each pixel is activated. Set in :meth:`fit`. @@ -276,27 +290,28 @@ class RadialFiltration(BaseEstimator, TransformerMixin, PlotterMixin): See also -------- - gtda.homology.CubicalPersistence, Binarizer + HeightFiltration, DilationFiltration, ErosionFiltration, \ + SignedDistanceFiltration, DensityFiltration, \ + gtda.homology.CubicalPersistence References ---------- - [1] A. Garin and G. Tauzin, "A topological reading lesson: Classification - of MNIST using TDA"; 19th International IEEE Conference on Machine - Learning and Applications (ICMLA 2020), 2019; arXiv: `1910.08345 \ - `_. + .. [1] A. Garin and G. Tauzin, "A topological reading lesson: + Classification of MNIST using TDA"; 19th International IEEE + Conference on Machine Learning and Applications (ICMLA 2020), 2019; + `arXiv:1910.08345 `_. """ _hyperparameters = { - 'center': { - 'type': (np.ndarray, type(None)), 'of': {'type': int}}, + 'center': {'type': (np.ndarray, type(None)), 'of': {'type': Integral}}, 'radius': {'type': Real, 'in': Interval(0, np.inf, closed='right')}, 'metric': {'type': (str, FunctionType)}, - 'metric_params': {'type': (dict, type(None))} - } + 'metric_params': {'type': dict} + } def __init__(self, center=None, radius=np.inf, metric='euclidean', - metric_params=None, n_jobs=None): + metric_params={}, n_jobs=None): self.center = center self.radius = radius self.metric = metric @@ -313,9 +328,9 @@ def _calculate_radial(self, X): return Xr def fit(self, X, y=None): - """Calculate :attr:`center_`, :attr:`effective_metric_params_`, - :attr:`n_dimensions_`, :attr:`mesh_` and :attr:`max_value_` from a - collection of binary images. Then, return the estimator. + """Calculate :attr:`center_`, :attr:`n_dimensions_`, :attr:`mesh_` and + :attr:`max_value_` from a collection of binary images. Then, return the + estimator. This method is here to implement the usual scikit-learn API and hence work in pipelines. 
@@ -337,9 +352,9 @@ def fit(self, X, y=None): """ X = check_array(X, allow_nd=True) self.n_dimensions_ = X.ndim - 1 - if (self.n_dimensions_ < 2) or (self.n_dimensions_ > 3): - warn(f"Input of `fit` contains arrays of dimension " - f"{self.n_dimensions_}.") + if self.n_dimensions_ > 3: + raise ValueError(f"Input of `fit` contains arrays of dimension " + f"{self.n_dimensions_}.") validate_params( self.get_params(), self._hyperparameters, exclude=['n_jobs']) @@ -349,11 +364,6 @@ def fit(self, X, y=None): self.center_ = np.copy(self.center) self.center_ = self.center_.reshape((1, -1)) - if self.metric_params is None: - self.effective_metric_params_ = {} - else: - self.effective_metric_params_ = self.metric_params.copy() - axis_order = [2, 1, 3] mesh_range_list = [np.arange(0, X.shape[i]) for i in axis_order[:self.n_dimensions_]] @@ -363,12 +373,12 @@ def fit(self, X, y=None): axis=self.n_dimensions_).reshape((-1, self.n_dimensions_)) self.mesh_ = pairwise_distances( self.center_, self.mesh_, metric=self.metric, - n_jobs=1, **self.effective_metric_params_).reshape(X.shape[1:]) + n_jobs=1, **self.metric_params).reshape(X.shape[1:]) self.mesh_[self.mesh_ > self.radius] = np.inf self.max_value_ = 0. - self.max_value_ = np.max(self._calculate_radial( - np.ones((1, *X.shape[1:])))) + 1 + self.max_value_ = \ + np.max(self._calculate_radial(np.ones((1, *X.shape[1:])))) + 1 return self @@ -406,7 +416,8 @@ def transform(self, X, y=None): return Xt @staticmethod - def plot(Xt, sample=0, colorscale='greys', origin='upper'): + def plot(Xt, sample=0, colorscale='greys', origin='upper', + plotly_params=None): """Plot a sample from a collection of 2D greyscale images. Parameters @@ -427,8 +438,24 @@ def plot(Xt, sample=0, colorscale='greys', origin='upper'): left corner. The convention ``'upper'`` is typically used for matrices and images. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"trace"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. + """ - return plot_heatmap(Xt[sample], colorscale=colorscale, origin=origin) + return plot_heatmap( + Xt[sample], colorscale=colorscale, origin=origin, + title=f"Radial filtration of image {sample}", + plotly_params=plotly_params + ) @adapt_fit_transform_docs @@ -465,6 +492,9 @@ class DilationFiltration(BaseEstimator, TransformerMixin, PlotterMixin): Attributes ---------- + n_dimensions_ : ``2`` or ``3`` + Dimension of the images. Set in :meth:`fit`. + n_iterations_ : int Effective number of iterations in the dilation process. Set in :meth:`fit`. @@ -475,22 +505,23 @@ class DilationFiltration(BaseEstimator, TransformerMixin, PlotterMixin): See also -------- - gtda.homology.CubicalPersistence, Binarizer + HeightFiltration, RadialFiltration, ErosionFiltration, \ + SignedDistanceFiltration, DensityFiltration, \ + gtda.homology.CubicalPersistence References ---------- - [1] A. Garin and G. Tauzin, "A topological reading lesson: Classification - of MNIST using TDA"; 19th International IEEE Conference on Machine - Learning and Applications (ICMLA 2020), 2019; arXiv: `1910.08345 \ - `_. + .. [1] A. Garin and G. 
Tauzin, "A topological reading lesson: + Classification of MNIST using TDA"; 19th International IEEE + Conference on Machine Learning and Applications (ICMLA 2020), 2019; + `arXiv:1910.08345 `_. """ _hyperparameters = { - 'n_iterations': { - 'type': (int, type(None)), - 'in': Interval(1, np.inf, closed='left')} - } + 'n_iterations': {'type': (int, type(None)), + 'in': Interval(1, np.inf, closed='left')} + } def __init__(self, n_iterations=None, n_jobs=None): self.n_iterations = n_iterations @@ -505,8 +536,9 @@ def _calculate_dilation(self, X): return Xd def fit(self, X, y=None): - """Calculate :attr:`n_iterations_` and :attr:`max_value_` from a - collection of binary images. Then, return the estimator. + """Calculate :attr:`n_dimensions_`, :attr:`n_iterations_` and + :attr:`max_value_` from a collection of binary images. Then, return the + estimator. This method is here to implement the usual scikit-learn API and hence work in pipelines. @@ -527,11 +559,10 @@ def fit(self, X, y=None): """ X = check_array(X, allow_nd=True) - - n_dimensions = X.ndim - 1 - if (n_dimensions < 2) or (n_dimensions > 3): - warn(f"Input of `fit` contains arrays of dimension " - f"{self.n_dimensions_}.") + self.n_dimensions_ = X.ndim - 1 + if self.n_dimensions_ > 3: + raise ValueError(f"Input of `fit` contains arrays of dimension " + f"{self.n_dimensions_}.") validate_params( self.get_params(), self._hyperparameters, exclude=['n_jobs']) @@ -579,7 +610,8 @@ def transform(self, X, y=None): return Xt @staticmethod - def plot(Xt, sample=0, colorscale='greys', origin='upper'): + def plot(Xt, sample=0, colorscale='greys', origin='upper', + plotly_params=None): """Plot a sample from a collection of 2D greyscale images. Parameters @@ -600,8 +632,24 @@ def plot(Xt, sample=0, colorscale='greys', origin='upper'): left corner. The convention ``'upper'`` is typically used for matrices and images. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"trace"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. + """ - return plot_heatmap(Xt[sample], colorscale=colorscale, origin=origin) + return plot_heatmap( + Xt[sample], colorscale=colorscale, origin=origin, + title=f"Dilation filtration of image {sample}", + plotly_params=plotly_params + ) @adapt_fit_transform_docs @@ -638,6 +686,9 @@ class ErosionFiltration(BaseEstimator, TransformerMixin, PlotterMixin): Attributes ---------- + n_dimensions_ : ``2`` or ``3`` + Dimension of the images. Set in :meth:`fit`. + n_iterations_ : int Effective number of iterations in the erosion process. Set in :meth:`fit`. @@ -648,22 +699,23 @@ class ErosionFiltration(BaseEstimator, TransformerMixin, PlotterMixin): See also -------- - gtda.homology.CubicalPersistence, Binarizer + HeightFiltration, RadialFiltration, DilationFiltration, \ + SignedDistanceFiltration, DensityFiltration, \ + gtda.homology.CubicalPersistence References ---------- - [1] A. Garin and G. Tauzin, "A topological reading lesson: Classification - of MNIST using TDA"; 19th International IEEE Conference on Machine - Learning and Applications (ICMLA 2020), 2019; arXiv: `1910.08345 \ - `_. + .. [1] A. Garin and G. 
Tauzin, "A topological reading lesson: + Classification of MNIST using TDA"; 19th International IEEE + Conference on Machine Learning and Applications (ICMLA 2020), 2019; + `arXiv:1910.08345 `_. """ _hyperparameters = { - 'n_iterations': { - 'type': (int, type(None)), - 'in': Interval(1, np.inf, closed='left')} - } + 'n_iterations': {'type': (int, type(None)), + 'in': Interval(1, np.inf, closed='left')} + } def __init__(self, n_iterations=None, n_jobs=None): self.n_iterations = n_iterations @@ -678,8 +730,9 @@ def _calculate_erosion(self, X): return Xe def fit(self, X, y=None): - """Calculate :attr:`n_iterations_` and :attr:`max_value_` from a - collection of binary images. Then, return the estimator. + """Calculate :attr:`n_dimensions_`, :attr:`n_iterations_` and + :attr:`max_value_` from a collection of binary images. Then, return the + estimator. This method is here to implement the usual scikit-learn API and hence work in pipelines. @@ -700,10 +753,10 @@ def fit(self, X, y=None): """ X = check_array(X, allow_nd=True) - n_dimensions = X.ndim - 1 - if (n_dimensions < 2) or (n_dimensions > 3): - warn(f"Input of `fit` contains arrays of dimension " - f"{self.n_dimensions_}.") + self.n_dimensions_ = X.ndim - 1 + if self.n_dimensions_ > 3: + raise ValueError(f"Input of `fit` contains arrays of dimension " + f"{self.n_dimensions_}.") validate_params( self.get_params(), self._hyperparameters, exclude=['n_jobs']) @@ -751,7 +804,8 @@ def transform(self, X, y=None): return Xt @staticmethod - def plot(Xt, sample=0, colorscale='greys', origin='upper'): + def plot(Xt, sample=0, colorscale='greys', origin='upper', + plotly_params=None): """Plot a sample from a collection of 2D greyscale images. Parameters @@ -772,8 +826,24 @@ def plot(Xt, sample=0, colorscale='greys', origin='upper'): left corner. The convention ``'upper'`` is typically used for matrices and images. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"trace"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. + """ - return plot_heatmap(Xt[sample], colorscale=colorscale, origin=origin) + return plot_heatmap( + Xt[sample], colorscale=colorscale, origin=origin, + title=f"Erosion filtration of image {sample}", + plotly_params=plotly_params + ) @adapt_fit_transform_docs @@ -812,6 +882,9 @@ class SignedDistanceFiltration(BaseEstimator, TransformerMixin, PlotterMixin): Attributes ---------- + n_dimensions_ : ``2`` or ``3`` + Dimension of the images. Set in :meth:`fit`. + n_iterations_ : int Effective number of iterations in the dilation process. Set in :meth:`fit`. @@ -822,23 +895,22 @@ class SignedDistanceFiltration(BaseEstimator, TransformerMixin, PlotterMixin): See also -------- - gtda.homology.CubicalPersistence, Binarizer, ErosionFiltration, \ - DilationFiltration + HeightFiltration, RadialFiltration, DilationFiltration, \ + ErosionFiltration, DensityFiltration, gtda.homology.CubicalPersistence References ---------- - [1] A. Garin and G. Tauzin, "A topological reading lesson: Classification - of MNIST using TDA"; 19th International IEEE Conference on Machine - Learning and Applications (ICMLA 2020), 2019; arXiv: `1910.08345 \ - `_. + .. [1] A. Garin and G. 
Tauzin, "A topological reading lesson: + Classification of MNIST using TDA"; 19th International IEEE + Conference on Machine Learning and Applications (ICMLA 2020), 2019; + `arXiv:1910.08345 `_. """ _hyperparameters = { - 'n_iterations': { - 'type': (int, type(None)), - 'in': Interval(1, np.inf, closed='left')} - } + 'n_iterations': {'type': (int, type(None)), + 'in': Interval(1, np.inf, closed='left')} + } def __init__(self, n_iterations=None, n_jobs=None): self.n_iterations = n_iterations @@ -860,8 +932,9 @@ def _calculate_signed_distance(self, X): return (Xd + Xe) def fit(self, X, y=None): - """Calculate :attr:`n_iterations_` and :attr:`max_value_` from a - collection of binary images. Then, return the estimator. + """Calculate :attr:`n_dimensions_`, :attr:`n_iterations_` and + :attr:`max_value_` from a collection of binary images. Then, return the + estimator. This method is here to implement the usual scikit-learn API and hence work in pipelines. @@ -882,10 +955,10 @@ def fit(self, X, y=None): """ X = check_array(X, allow_nd=True) - n_dimensions = X.ndim - 1 - if (n_dimensions < 2) or (n_dimensions > 3): - warn(f"Input of `fit` contains arrays of dimension " - f"{self.n_dimensions_}.") + self.n_dimensions_ = X.ndim - 1 + if self.n_dimensions_ > 3: + raise ValueError(f"Input of `fit` contains arrays of dimension " + f"{self.n_dimensions_}.") validate_params( self.get_params(), self._hyperparameters, exclude=['n_jobs']) @@ -933,7 +1006,8 @@ def transform(self, X, y=None): return Xt @staticmethod - def plot(Xt, sample=0, colorscale='greys', origin='upper'): + def plot(Xt, sample=0, colorscale='greys', origin='upper', + plotly_params=None): """Plot a sample from a collection of 2D greyscale images. Parameters @@ -954,5 +1028,263 @@ def plot(Xt, sample=0, colorscale='greys', origin='upper'): left corner. The convention ``'upper'`` is typically used for matrices and images. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"trace"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. + + """ + return plot_heatmap( + Xt[sample], colorscale=colorscale, origin=origin, + title=f"Signed-distance filtration of image {sample}", + plotly_params=plotly_params + ) + + +@adapt_fit_transform_docs +class DensityFiltration(BaseEstimator, TransformerMixin, PlotterMixin): + """Filtrations of 2D/3D binary images based on the number of activated + neighboring pixels. + + The density filtration assigns to each pixel of a binary image a greyscale + value equal to the number of activated pixels within a ball centered around + it. + + Parameters + ---------- + radius : float, optional, default: ``1.`` + The radius of the ball within which the number of activated pixels is + considered. + + metric : string or callable, optional, default: ``'euclidean'`` + Determines a rule with which to calculate distances between + pairs of pixels. + If ``metric`` is a string, it must be one of the options allowed by + ``scipy.spatial.distance.pdist`` for its metric parameter, or a metric + listed in ``sklearn.pairwise.PAIRWISE_DISTANCE_FUNCTIONS``, including + "euclidean", "manhattan", or "cosine". 
+ If ``metric`` is a callable function, it is called on each pair of + instances and the resulting value recorded. The callable should take + two arrays from the entry in `X` as input, and return a value + indicating the distance between them. + + metric_params : dict, optional, default: ``{}`` + Additional keyword arguments for the metric function. + + n_jobs : int or None, optional, default: ``None`` + The number of jobs to use for the computation. ``None`` means 1 unless + in a :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. + + Attributes + ---------- + n_dimensions_ : ``2`` or ``3`` + Dimension of the images. Set in :meth:`fit`. + + mask_ : ndarray of shape (radius, radius [, radius]) + The mask applied around each pixel to calculate the weighted number of + its activated neighbors. Set in :meth:`fit`. + + See also + -------- + HeightFiltration, RadialFiltration, DilationFiltration, \ + ErosionFiltration, SignedDistanceFiltration, \ + gtda.homology.CubicalPersistence + + References + ---------- + [1] A. Garin and G. Tauzin, "A topological reading lesson: Classification + of MNIST using TDA"; 19th International IEEE Conference on Machine + Learning and Applications (ICMLA 2020), 2019; arXiv: `1910.08345 \ + `_. + + """ + + _hyperparameters = { + 'radius': {'type': Real, 'in': Interval(0, np.inf, closed='right')}, + 'metric': {'type': (str, FunctionType)}, + 'metric_params': {'type': dict}, + } + + def __init__(self, radius=3, metric='euclidean', metric_params={}, + n_jobs=None): + self.radius = radius + self.metric = metric + self.metric_params = metric_params + self.n_jobs = n_jobs + + def _calculate_density(self, X): + Xd = np.zeros(X.shape) + + # The idea behind this is to sum up pixel values of the image + # rolled according to the 3D mask + for i, j, k in self._iterator: + Xd += np.roll(np.roll( + np.roll(X, k, axis=3), j, axis=2), i, axis=1) \ + * self.mask_[self._size + i, self._size + j, + self._size + k] + return Xd + + def fit(self, X, y=None): + """Calculate :attr:`n_dimensions_` and :attr:`mask_` from a collection + of binary images. Then, return the estimator. + + This method is here to implement the usual scikit-learn API and hence + work in pipelines. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z]) + Input data. Each entry along axis 0 is interpreted as a 2D or 3D + binary image. + + y : None + There is no need of a target in a transformer, yet the pipeline API + requires this parameter. + + Returns + ------- + self : object + + """ + X = check_array(X, allow_nd=True) + self.n_dimensions_ = X.ndim - 1 + if self.n_dimensions_ > 3: + raise ValueError(f"Input of `fit` contains arrays of dimension " + f"{self.n_dimensions_}.") + validate_params( + self.get_params(), self._hyperparameters, exclude=['n_jobs']) + + # Determine the size of the mask based on the radius and metric + self._size = int(np.ceil( + pairwise_distances([[0]], [[self.radius]], metric=self.metric, + **self.metric_params) + )) + # The mask is always 3D but not the iterator. 
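+        # For example, with ``radius=2`` and the Euclidean metric,
+        # ``self._size`` is 2, so the mask built below is a (5, 5, 5) array
+        # which is subsequently set to ``True`` exactly on the voxels at
+        # distance at most 2 from its centre, i.e. on a discretised ball.
+        # For 2D images the iterator defined below only takes the value 0
+        # along the third axis, so only the central slice of that ball is
+        # actually used when rolling the images.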
+ self.mask_ = np.ones(tuple(2 * self._size + 1 for _ in range(3)), + dtype=np.bool) + + # Create an iterator for applying the mask to every pixel at once + iterator_size_list = \ + [range(-self._size, self._size + 1)] * self.n_dimensions_ + \ + [[0] for _ in range(3 - self.n_dimensions_)] + self._iterator = tuple(itertools.product(*iterator_size_list)) + + # We create a mesh so that we have an array with coordinates and we can + # calculate the distance of each point to the center + mesh_size_list = [np.arange(0, 2 * self._size + 1)] * 3 + self.mesh_ = np.stack( + np.meshgrid(*mesh_size_list), axis=3).reshape((-1, 3)) + + # Calculate those distances to the center and use them to set the mask + # values so that it corresponds to a ball + center = self._size * np.ones((1, 3)) + self.mask_ = pairwise_distances( + center, self.mesh_, metric=self.metric, + n_jobs=1, **self.metric_params).reshape(self.mask_.shape) + + self.mask_ = self.mask_ <= self.radius + + # Instantiate a padder to pad all images with 0 so that the rolling of + # the mask also works at the boundary of the images + padding = np.asarray([*[self._size] * self.n_dimensions_, + *[0] * (3 - self.n_dimensions_)]) + self._padder = Padder(padding=padding) + self._padder.fit(X.reshape((*X.shape[:3], -1))) + + return self + + def transform(self, X, y=None): + """For each binary image in the collection `X`, calculate a + corresponding greyscale image based on the density of its pixels. + Return the collection of greyscale images. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z]) + Input data. Each entry along axis 0 is interpreted as a 2D or 3D + binary image. + + y : None + There is no need of a target in a transformer, yet the pipeline API + requires this parameter. + + Returns + ------- + Xt : ndarray of shape (n_samples, n_pixels_x, n_pixels_y \ + [, n_pixels_z]) + Transformed collection of images. Each entry along axis 0 is a + 2D or 3D greyscale image. + + """ + check_is_fitted(self) + Xt = check_array(X, allow_nd=True, copy=True) + + # Reshape the images to 3D so that they can be rolled according to the + # 3D mask + Xt = Xt.reshape((*X.shape[:3], -1)) + Xt = self._padder.transform(Xt) + + Xt = Parallel(n_jobs=self.n_jobs)( + delayed(self._calculate_density)(Xt[s]) + for s in gen_even_slices(Xt.shape[0], + effective_n_jobs(self.n_jobs))) + Xt = np.concatenate(Xt) + + Xt = Xt[:, self._size: -self._size, self._size: -self._size] + + if self.n_dimensions_ == 3: + Xt = Xt[:, :, :, self._size: -self._size] + + Xt = Xt.reshape(X.shape) + + return Xt + + @staticmethod + def plot(Xt, sample=0, colorscale='greys', origin='upper', + plotly_params=None): + """Plot a sample from a collection of 2D greyscale images. + + Parameters + ---------- + Xt : ndarray of shape (n_samples, n_pixels_x, n_pixels_y) + Collection of 2D greyscale images, such as returned by + :meth:`transform`. + + sample : int, optional, default: ``0`` + Index of the sample in `Xt` to be plotted. + + colorscale : str, optional, default: ``'greys'`` + Color scale to be used in the heat map. Can be anything allowed by + :class:`plotly.graph_objects.Heatmap`. + + origin : ``'upper'`` | ``'lower'``, optional, default: ``'upper'`` + Position of the [0, 0] pixel of `data`, in the upper left or lower + left corner. The convention ``'upper'`` is typically used for + matrices and images. + + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. 
Allowed keys are + ``"trace"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. + """ - return plot_heatmap(Xt[sample], colorscale=colorscale, origin=origin) + return plot_heatmap( + Xt[sample], colorscale=colorscale, origin=origin, + title=f"Density filtration of image {sample}", + plotly_params=plotly_params + ) diff --git a/gtda/images/preprocessing.py b/gtda/images/preprocessing.py index dc996f8d5..526646cf3 100644 --- a/gtda/images/preprocessing.py +++ b/gtda/images/preprocessing.py @@ -3,8 +3,7 @@ from functools import reduce from operator import iconcat -from numbers import Real -from warnings import warn +from numbers import Real, Integral import numpy as np from joblib import Parallel, delayed, effective_n_jobs @@ -36,7 +35,7 @@ class Binarizer(BaseEstimator, TransformerMixin, PlotterMixin): Attributes ---------- - n_dimensions_ : int + n_dimensions_ : ``2`` or ``3`` Dimension of the images. Set in :meth:`fit`. max_value_ : float @@ -49,16 +48,16 @@ class Binarizer(BaseEstimator, TransformerMixin, PlotterMixin): References ---------- - [1] A. Garin and G. Tauzin, "A topological reading lesson: Classification - of MNIST using TDA"; 19th International IEEE Conference on Machine - Learning and Applications (ICMLA 2020), 2019; arXiv: `1910.08345 \ - `_. + .. [1] A. Garin and G. Tauzin, "A topological reading lesson: + Classification of MNIST using TDA"; 19th International IEEE + Conference on Machine Learning and Applications (ICMLA 2020), 2019; + `arXiv:1910.08345 `_. """ _hyperparameters = { 'threshold': {'type': Real, 'in': Interval(0, 1, closed='right')} - } + } def __init__(self, threshold=0.5, n_jobs=None): self.threshold = threshold @@ -94,9 +93,9 @@ def fit(self, X, y=None): """ X = check_array(X, allow_nd=True) self.n_dimensions_ = X.ndim - 1 - if (self.n_dimensions_ < 2) or (self.n_dimensions_ > 3): - warn(f"Input of `fit` contains arrays of dimension " - f"{self.n_dimensions_}.") + if self.n_dimensions_ > 3: + raise ValueError(f"Input of `fit` contains arrays of dimension " + f"{self.n_dimensions_}.") validate_params( self.get_params(), self._hyperparameters, exclude=['n_jobs']) @@ -141,7 +140,8 @@ def transform(self, X, y=None): return Xt @staticmethod - def plot(Xt, sample=0, colorscale='greys', origin='upper'): + def plot(Xt, sample=0, colorscale='greys', origin='upper', + plotly_params=None): """Plot a sample from a collection of 2D binary images. Parameters @@ -162,36 +162,83 @@ def plot(Xt, sample=0, colorscale='greys', origin='upper'): left corner. The convention ``'upper'`` is typically used for matrices and images. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"trace"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure.
+ """ return plot_heatmap( - Xt[sample] * 1, colorscale=colorscale, origin=origin) + Xt[sample] * 1, colorscale=colorscale, origin=origin, + title=f"Binarization of image {sample}", + plotly_params=plotly_params + ) @adapt_fit_transform_docs class Inverter(BaseEstimator, TransformerMixin, PlotterMixin): - """Invert all 2D/3D binary images in a collection. + """Invert all 2D/3D images in a collection. + + Applies an inversion function to the value of all pixels of all images in + the input collection. If the images are binary, the inversion function is + defined as the logical NOT function. Otherwise, it is the function + :math:`f(x) = M - x`, where `x` is a pixel value and `M` is + :attr:`max_value_`. Parameters ---------- + max_value : bool, int, float or None, optional, default: ``None`` + Maximum possible pixel value in the images. It should be a boolean if + input images are binary and an int or a float if they are greyscale. + If ``None``, it is calculated from the collection of images passed in + :meth:`fit`. + n_jobs : int or None, optional, default: ``None`` The number of jobs to use for the computation. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. + Attributes + ---------- + n_dimensions_ : ``2`` or ``3`` + Dimension of the images. Set in :meth:`fit`. + + max_value_ : int, float or bool + Effective maximum value of the images' pixels. Set in :meth:`fit`. + References ---------- - [1] A. Garin and G. Tauzin, "A topological reading lesson: \ - Classification of MNIST using TDA"; 19th International \ - IEEE Conference on Machine Learning and Applications (ICMLA 2020), \ - 2019; arXiv: `1910.08345 `_. + .. [1] A. Garin and G. Tauzin, "A topological reading lesson: + Classification of MNIST using TDA"; 19th International IEEE + Conference on Machine Learning and Applications (ICMLA 2020), 2019; + `arXiv:1910.08345 `_. """ - def __init__(self, n_jobs=None): + _hyperparameters = { + 'max_value': {'type': (bool, Real, type(None))} + } + + def __init__(self, max_value=None, n_jobs=None): + self.max_value = max_value self.n_jobs = n_jobs + def _invert(self, X): + if self.max_value_ is True: + return np.logical_not(X) + else: + return self.max_value_ - X + def fit(self, X, y=None): - """Do nothing and return the estimator unchanged. + """Calculate :attr:`n_dimensions_` and :attr:`max_value_` from the + collection of images. Then, return the estimator. This method is here to implement the usual scikit-learn API and hence work in pipelines. @@ -200,7 +247,7 @@ def fit(self, X, y=None): ---------- X : ndarray of shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z]) Input data. Each entry along axis 0 is interpreted as a 2D or 3D - binary image. + image. y : None There is no need of a target in a transformer, yet the pipeline API @@ -212,8 +259,21 @@ def fit(self, X, y=None): """ check_array(X, allow_nd=True) + self.n_dimensions_ = X.ndim - 1 + if self.n_dimensions_ > 3: + raise ValueError(f"Input of `fit` contains arrays of dimension " + f"{self.n_dimensions_}.") + validate_params(self.get_params(), self._hyperparameters, + exclude=['n_jobs']) + + if self.max_value is None: + if X.dtype == np.bool: + self.max_value_ = True + else: + self.max_value_ = np.max(X) + else: + self.max_value_ = self.max_value - self._is_fitted = True return self def transform(self, X, y=None): @@ -238,18 +298,19 @@ def transform(self, X, y=None): 2D or 3D binary image. 
""" - check_is_fitted(self, ['_is_fitted']) + check_is_fitted(self) Xt = check_array(X, allow_nd=True) Xt = Parallel(n_jobs=self.n_jobs)(delayed( - np.logical_not)(Xt[s]) + self._invert)(Xt[s]) for s in gen_even_slices(len(Xt), effective_n_jobs(self.n_jobs))) Xt = np.concatenate(Xt) return Xt @staticmethod - def plot(Xt, sample=0, colorscale='greys', origin='upper'): + def plot(Xt, sample=0, colorscale='greys', origin='upper', + plotly_params=None): """Plot a sample from a collection of 2D binary images. Parameters @@ -270,26 +331,41 @@ def plot(Xt, sample=0, colorscale='greys', origin='upper'): left corner. The convention ``'upper'`` is typically used for matrices and images. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"trace"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. + """ return plot_heatmap( - Xt[sample] * 1, colorscale=colorscale, origin=origin) + Xt[sample] * 1, colorscale=colorscale, origin=origin, + title=f"Inversion of image {sample}", + plotly_params=plotly_params + ) @adapt_fit_transform_docs class Padder(BaseEstimator, TransformerMixin, PlotterMixin): - """Pad all 2D/3D binary images in a collection. + """Pad all 2D/3D images in a collection. Parameters ---------- - paddings : int ndarray of shape (padding_x, padding_y [, padding_z]) or \ + padding : int ndarray of shape (padding_x, padding_y [, padding_z]) or \ None, optional, default: ``None`` Number of pixels to pad the images along each axis and on both side of the images. By default, a frame of a single pixel width is added around the image (``1 = padding_x = padding_y [= padding_z]``). - activated : bool, optional, default: ``False`` - If ``True``, the padded pixels are activated. If ``False``, they are - deactivated. + value : bool, int, or float, optional, default: ``0`` + Value given to the padded pixels. It should be a boolean if the input + images are binary and an int or float if they are greyscale. n_jobs : int or None, optional, default: ``None`` The number of jobs to use for the computation. ``None`` means 1 unless @@ -298,33 +374,35 @@ class Padder(BaseEstimator, TransformerMixin, PlotterMixin): Attributes ---------- - paddings_ : int ndarray of shape (padding_x, padding_y [, padding_z]) - Effective padding along each of the axis. Set in :meth:`fit`. + n_dimensions_ : ``2`` or ``3`` + Dimension of the images. Set in :meth:`fit`. + + padding_ : int ndarray of shape (padding_x, padding_y [, padding_z]) + Effective padding along each of the axes. Set in :meth:`fit`. References ---------- - [1] A. Garin and G. Tauzin, "A topological reading lesson: Classification - of MNIST using TDA"; 19th International IEEE Conference on Machine - Learning and Applications (ICMLA 2020), 2019; arXiv: `1910.08345 \ - `_. + .. [1] A. Garin and G. Tauzin, "A topological reading lesson: + Classification of MNIST using TDA"; 19th International IEEE + Conference on Machine Learning and Applications (ICMLA 2020), 2019; + `arXiv:1910.08345 `_. 
""" _hyperparameters = { - 'paddings': { - 'type': (np.ndarray, type(None)), - 'of': {'type': int}}, - 'activated': {'type': bool} - } - - def __init__(self, paddings=None, activated=False, n_jobs=None): - self.paddings = paddings - self.activated = activated + 'padding': {'type': (np.ndarray, type(None)), + 'of': {'type': Integral}}, + 'value': {'type': (bool, Real)} + } + + def __init__(self, padding=None, value=False, n_jobs=None): + self.padding = padding + self.value = value self.n_jobs = n_jobs def fit(self, X, y=None): - """Calculate :attr:`paddings_` from a collection of binary images. - Then, return the estimator. + """Calculate :attr:`n_dimensions_` and :attr:`padding_` from a + collection of images. Then, return the estimator. This method is here to implement the usual scikit-learn API and hence work in pipelines. @@ -333,7 +411,7 @@ def fit(self, X, y=None): ---------- X : ndarray of shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z]) Input data. Each entry along axis 0 is interpreted as a 2D or 3D - binary image. + image. y : None There is no need of a target in a transformer, yet the pipeline API @@ -345,25 +423,26 @@ def fit(self, X, y=None): """ X = check_array(X, allow_nd=True) - n_dimensions = X.ndim - 1 - if n_dimensions < 2 or n_dimensions > 3: - warn(f"Input of `fit` contains arrays of dimension " - f"{self.n_dimensions_}.") - validate_params( - self.get_params(), self._hyperparameters, exclude=['n_jobs']) - - if self.paddings is None: - self.paddings_ = np.ones((n_dimensions,), dtype=np.int) - elif len(self.paddings) != n_dimensions: + self.n_dimensions_ = X.ndim - 1 + if self.n_dimensions_ > 3: + raise ValueError(f"Input of `fit` contains arrays of dimension " + f"{self.n_dimensions_}.") + validate_params(self.get_params(), self._hyperparameters, + exclude=['value', 'n_jobs']) + + if self.padding is None: + self.padding_ = np.ones((self.n_dimensions_,), dtype=np.int) + elif len(self.padding) != self.n_dimensions_: raise ValueError( - f"`paddings` has length {self.paddings} while the input " - f"data requires it to have length equal to {n_dimensions}.") + f"`padding` has length {self.padding} while the input " + f"data requires it to have length equal to " + f"{self.n_dimensions_}.") else: - self.paddings_ = self.paddings + self.padding_ = self.padding self._pad_width = ((0, 0), - *[(self.paddings_[axis], self.paddings_[axis]) - for axis in range(n_dimensions)]) + *[(self.padding_[axis], self.padding_[axis]) + for axis in range(self.n_dimensions_)]) return self @@ -375,7 +454,7 @@ def transform(self, X, y=None): ---------- X : ndarray of shape (n_samples, n_pixels_x, n_pixels_y [, n_pixels_z]) Input data. Each entry along axis 0 is interpreted as a 2D or 3D - binary image. + image. y : None There is no need of a target in a transformer, yet the pipeline API @@ -394,14 +473,15 @@ def transform(self, X, y=None): Xt = Parallel(n_jobs=self.n_jobs)(delayed( np.pad)(Xt[s], pad_width=self._pad_width, - constant_values=self.activated) + constant_values=self.value) for s in gen_even_slices(len(Xt), effective_n_jobs(self.n_jobs))) Xt = np.concatenate(Xt) return Xt @staticmethod - def plot(Xt, sample=0, colorscale='greys', origin='upper'): + def plot(Xt, sample=0, colorscale='greys', origin='upper', + plotly_params=None): """Plot a sample from a collection of 2D binary images. Parameters @@ -422,9 +502,24 @@ def plot(Xt, sample=0, colorscale='greys', origin='upper'): left corner. The convention ``'upper'`` is typically used for matrices and images. 
+ plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"trace"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. + """ return plot_heatmap( - Xt[sample] * 1, colorscale=colorscale, origin=origin) + Xt[sample] * 1, colorscale=colorscale, origin=origin, + title=f"Padded version of image {sample}", + plotly_params=plotly_params + ) @adapt_fit_transform_docs @@ -446,6 +541,11 @@ class ImageToPointCloud(BaseEstimator, TransformerMixin, PlotterMixin): in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. + Attributes + ---------- + n_dimensions_ : ``2`` or ``3`` + Dimension of the images. Set in :meth:`fit`. + See also -------- gtda.homology.VietorisRipsPersistence, gtda.homology.SparseRipsPersistence, @@ -453,21 +553,24 @@ class ImageToPointCloud(BaseEstimator, TransformerMixin, PlotterMixin): References ---------- - [1] A. Garin and G. Tauzin, "A topological reading lesson: Classification - of MNIST using TDA"; 19th International IEEE Conference on Machine - Learning and Applications (ICMLA 2020), 2019; arXiv: `1910.08345 \ - `_. + .. [1] A. Garin and G. Tauzin, "A topological reading lesson: + Classification of MNIST using TDA"; 19th International IEEE + Conference on Machine Learning and Applications (ICMLA 2020), 2019; + `arXiv:1910.08345 `_. """ def __init__(self, n_jobs=None): self.n_jobs = n_jobs - def _embed(self, X): + @staticmethod + def _embed(X): return [np.argwhere(x) for x in X] def fit(self, X, y=None): - """Do nothing and return the estimator unchanged. + """Calculate :attr:`n_dimensions_` from a collection of binary images. + Then, return the estimator. + This method is here to implement the usual scikit-learn API and hence work in pipelines. @@ -487,13 +590,11 @@ def fit(self, X, y=None): """ check_array(X, allow_nd=True) + self.n_dimensions_ = X.ndim - 1 + if self.n_dimensions_ > 3: + raise ValueError(f"Input of `fit` contains arrays of dimension " + f"{self.n_dimensions_}.") - n_dimensions = X.ndim - 1 - if n_dimensions < 2 or n_dimensions > 3: - warn(f"Input of `fit` contains arrays of dimension " - f"{self.n_dimensions_}.") - - self._is_fitted = True return self def transform(self, X, y=None): @@ -514,13 +615,12 @@ def transform(self, X, y=None): Returns ------- Xt : ndarray of shape (n_samples, n_pixels_x * n_pixels_y [* \ - n_pixels_z], - n_dimensions) + n_pixels_z], n_dimensions) Transformed collection of images. Each entry along axis 0 is a point cloud in ``n_dimensions``-dimensional space. """ - check_is_fitted(self, '_is_fitted') + check_is_fitted(self) Xt = check_array(X, allow_nd=True) Xt = np.swapaxes(np.flip(Xt, axis=1), 1, 2) @@ -531,7 +631,7 @@ def transform(self, X, y=None): return Xt @staticmethod - def plot(Xt, sample=0): + def plot(Xt, sample=0, plotly_params=None): """Plot a sample from a collection of point clouds. If the point cloud is in more than three dimensions, only the first three are plotted. @@ -544,5 +644,17 @@ def plot(Xt, sample=0): sample : int, optional, default: ``0`` Index of the sample in `Xt` to be plotted. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. 
Allowed keys are + ``"trace"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. + """ - return plot_point_cloud(Xt[sample]) + return plot_point_cloud(Xt[sample], plotly_params=plotly_params) diff --git a/gtda/images/tests/test_filtrations.py b/gtda/images/tests/test_filtrations.py index 022ebe020..359fa1e61 100644 --- a/gtda/images/tests/test_filtrations.py +++ b/gtda/images/tests/test_filtrations.py @@ -8,7 +8,8 @@ from sklearn.exceptions import NotFittedError from gtda.images import HeightFiltration, RadialFiltration, \ - DilationFiltration, ErosionFiltration, SignedDistanceFiltration + DilationFiltration, ErosionFiltration, SignedDistanceFiltration, \ + DensityFiltration pio.renderers.default = 'plotly_mimetype' @@ -23,6 +24,16 @@ np.zeros((3, 4, 2))], axis=0) +@pytest.mark.parametrize("transformer", + [HeightFiltration(), RadialFiltration(), + DilationFiltration(), ErosionFiltration(), + SignedDistanceFiltration(), DensityFiltration()]) +def test_invalid_input_shape(transformer): + X = np.ones((1, 1, 1, 1, 1)) + with pytest.raises(ValueError, match="Input of `fit`"): + transformer.fit(X) + + def test_height_not_fitted(): height = HeightFiltration() with pytest.raises(NotFittedError): @@ -276,3 +287,49 @@ def test_signed_transform(n_iterations, images, expected): def test_signed_fit_transform_plot(): SignedDistanceFiltration().fit_transform_plot(images_2D, sample=0) + + +def test_density_not_fitted(): + density = DensityFiltration() + with pytest.raises(NotFittedError): + density.transform(images_2D) + + +def test_density_errors(): + radius = 'a' + density = DensityFiltration(radius=radius) + with pytest.raises(TypeError): + density.fit(images_2D) + + +images_2D_density = np.array( + [[[6., 8., 8., 6.], [7., 10., 10., 7.], [6., 8., 8., 6.]], + [[5., 5., 3., 1.], [6., 6., 4., 1.], [5., 5., 3., 1.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.]]]) + + +images_3D_density = np.array( + [[[[10., 10.], [14., 14.], [14., 14.], [10., 10.]], + [[13., 13.], [19., 19.], [19., 19.], [13., 13.]], + [[10., 10.], [14., 14.], [14., 14.], [10., 10.]]], + [[[9., 9.], [9., 9.], [5., 5.], [1., 1.]], + [[12., 12.], [12., 12.], [7., 7.], [1., 1.]], + [[9., 9.], [9., 9.], [5., 5.], [1., 1.]]], + [[[0., 0.], [0., 0.], [0., 0.], [0., 0.]], + [[0., 0.], [0., 0.], [0., 0.], [0., 0.]], + [[0., 0.], [0., 0.], [0., 0.], [0., 0.]]]]) + + +@pytest.mark.parametrize("radius, images, expected", + [(2., images_2D, images_2D_density), + (2.2, images_2D, images_2D_density), + (2., images_3D, images_3D_density)]) +def test_density_transform(radius, images, expected): + density = DensityFiltration(radius=radius) + + assert_almost_equal(density.fit_transform(images), + expected) + + +def test_density_fit_transform_plot(): + DensityFiltration().fit_transform_plot(images_2D, sample=0) diff --git a/gtda/images/tests/test_preprocessing.py b/gtda/images/tests/test_preprocessing.py index 8fb9fb2eb..2e8087461 100644 --- a/gtda/images/tests/test_preprocessing.py +++ b/gtda/images/tests/test_preprocessing.py @@ -1,4 +1,4 @@ -"""Testing for images preprocessors.""" +"""Testing for image preprocessors.""" # License: GNU AGPLv3 import numpy as np @@ -21,6 +21,20 @@ np.concatenate([np.ones((7, 4, 4)), np.zeros((7, 4, 4))], axis=1), np.zeros((7, 8, 4))], axis=0) 
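The float-valued fixture added just below exercises the new greyscale behaviour of Inverter and Padder; as a rough sketch of that behaviour outside the test suite (array values are illustrative only):

import numpy as np
from gtda.images import Inverter, Padder

# Two 4x4 greyscale images whose maximum pixel value is 3
X = np.array([[[0., 1., 2., 3.]] * 4,
              [[3., 2., 1., 0.]] * 4])

# With max_value=None, the maximum (here 3.0) is inferred in fit and every
# pixel x is mapped to max_value_ - x; on boolean images a logical NOT is
# applied instead
X_inv = Inverter().fit_transform(X)

# Pad each image with a two-pixel frame of zeros along both axes
X_pad = Padder(padding=np.array([2, 2]), value=0.).fit_transform(X)
print(X_pad.shape)   # (2, 8, 8)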
+images_3D_float = np.stack([ + 2.5*np.ones((7, 8, 4)), + 3.*np.concatenate([np.ones((7, 4, 4)), np.zeros((7, 4, 4))], axis=1), + np.zeros((7, 8, 4))], axis=0) + + +@pytest.mark.parametrize("transformer", + [Binarizer(), Inverter(), Padder(), + ImageToPointCloud()]) +def test_invalid_input_shape(transformer): + X = np.ones((1, 1, 1, 1, 1)) + with pytest.raises(ValueError, match="Input of `fit`"): + transformer.fit(X) + def test_binarizer_not_fitted(): binarizer = Binarizer() @@ -68,7 +82,9 @@ def test_inverter_not_fitted(): @pytest.mark.parametrize("images, expected", [(images_2D, images_2D_inverted), - (images_3D, images_3D_inverted)]) + (images_3D, images_3D_inverted), + (images_3D.astype(bool), + images_3D_inverted.astype(bool))]) def test_inverter_transform(images, expected): inverter = Inverter() @@ -86,17 +102,19 @@ def test_padder_not_fitted(): padder.transform(images_2D) -@pytest.mark.parametrize("images, paddings", +@pytest.mark.parametrize("images, padding", [(images_2D, np.array([1, 1], dtype=np.int)), (images_2D, None), - (images_3D, np.array([2, 2, 2], dtype=np.int))]) -def test_padder_transform(images, paddings): - padder = Padder(paddings=paddings) + (images_3D, np.array([2, 2, 2], dtype=np.int)), + (images_3D_float, + np.array([2, 2, 2], dtype=np.int))]) +def test_padder_transform(images, padding): + padder = Padder(padding=padding) - if paddings is None: + if padding is None: expected_shape = np.asarray(images.shape[1:]) + 2 else: - expected_shape = images.shape[1:] + 2 * paddings + expected_shape = images.shape[1:] + 2 * padding assert_equal(padder.fit_transform(images).shape[1:], expected_shape) diff --git a/gtda/mapper/__init__.py b/gtda/mapper/__init__.py index 30f27d94b..02161795b 100644 --- a/gtda/mapper/__init__.py +++ b/gtda/mapper/__init__.py @@ -1,9 +1,10 @@ """The module :mod:`gtda.mapper` implements the Mapper algorithm for topological clustering and visualisation.""" -from .cluster import FirstHistogramGap, FirstSimpleGap +from .cluster import FirstHistogramGap, FirstSimpleGap, ParallelClustering from .cover import CubicalCover, OneDimensionalCover from .filter import Eccentricity, Entropy, Projection +from .nerve import Nerve from .pipeline import make_mapper_pipeline from .utils.decorators import method_to_transform from .utils.pipeline import transformer_from_callable_on_rows @@ -18,9 +19,11 @@ 'CubicalCover', 'FirstSimpleGap', 'FirstHistogramGap', + 'ParallelClustering', + 'Nerve', 'make_mapper_pipeline', 'plot_static_mapper_graph', 'plot_interactive_mapper_graph', 'method_to_transform', 'transformer_from_callable_on_rows' -] + ] diff --git a/gtda/mapper/cluster.py b/gtda/mapper/cluster.py index 65cd5a56c..2e24ccb7b 100644 --- a/gtda/mapper/cluster.py +++ b/gtda/mapper/cluster.py @@ -7,18 +7,13 @@ import numpy as np from joblib import Parallel, delayed from sklearn.base import BaseEstimator, ClusterMixin, clone -from sklearn.cluster import DBSCAN - -try: # scikit-learn >= 0.22.1 - from sklearn.cluster._agglomerative import _TREE_BUILDERS, _hc_cut -except ImportError: - from sklearn.cluster._hierarchical import _TREE_BUILDERS, _hc_cut +from sklearn.cluster._agglomerative import _TREE_BUILDERS, _hc_cut from sklearn.utils import check_array from sklearn.utils.validation import check_memory +from .utils._cluster import _num_clusters_histogram, _num_clusters_simple from ..utils.intervals import Interval from ..utils.validation import validate_params -from .utils._cluster import _num_clusters_histogram, _num_clusters_simple class 
ParallelClustering(BaseEstimator): @@ -34,28 +29,25 @@ class ParallelClustering(BaseEstimator): Parameters ---------- - clusterer : object, optional, default: ``None`` - Clustering object such as derived from - :class:`sklearn.base.ClusterMixin`. ``None`` means that the default - :class:`sklearn.cluster.DBSCAN` is used. + clusterer : object + Clustering object derived from :class:`sklearn.base.ClusterMixin`. n_jobs : int or None, optional, default: ``None`` - The number of jobs to use for the computation. ``None`` means 1 - unless in a :obj:`joblib.parallel_backend` context. ``-1`` means - using all processors. + The number of jobs to use for the computation. ``None`` means 1 unless + in a :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. - parallel_backend_prefer : ``'processes'`` | ``'threads'``, optional, \ - default: ``'threads'`` - Selects the default joblib backend. The default process-based backend - is 'loky' and the default thread-based backend is 'threading'. + parallel_backend_prefer : ``"processes"`` | ``"threads"`` | ``None``, \ + optional, default: ``None`` + Soft hint for the selection of the default joblib backend. The default + process-based backend is 'loky' and the default thread-based backend is + 'threading'. See [1]_. Attributes ---------- clusterers_ : tuple of object - If `clusterer` is not ``None``, clones of `clusterer` fitted - to the portions of the full data array specified in :meth:`fit`. - Otherwise, clones of a default instance of - :class:`sklearn.cluster.DBSCAN`, fitted in the same way. + Clones of `clusterer` fitted to the portions of the full data array + specified in :meth:`fit`. clusters_ : list of list of tuple Labels and indices of each cluster found in :meth:`fit`. The i-th @@ -64,29 +56,32 @@ class ParallelClustering(BaseEstimator): cluster label and ``indices`` is the array of indices of points belonging to cluster ``(i, label)``. + References + ---------- + .. [1] "Thread-based parallelism vs process-based parallelism", in + `joblib documentation + `_. + """ - def __init__(self, clusterer=None, - n_jobs=None, - parallel_backend_prefer='threads'): + def __init__(self, clusterer, n_jobs=None, parallel_backend_prefer=None): self.clusterer = clusterer self.n_jobs = n_jobs self.parallel_backend_prefer = parallel_backend_prefer - def _validate_clusterer(self, default=DBSCAN()): - """Set :attr:`clusterer_` depending on the value of `clusterer`. + def _validate_clusterer(self): + """Set :attr:`clusterer_` depending on the value of `clusterer`. Also verify whether calculations are to be based on precomputed metric/affinity information or not. 
""" - if self.clusterer is not None: - self._clusterer = self.clusterer - else: - self._clusterer = default + if not isinstance(self.clusterer, ClusterMixin): + raise TypeError("`clusterer` must be an instance of " + "sklearn.base.ClusterMixin.") params = [param for param in ['metric', 'affinity'] - if param in signature(self._clusterer.__init__).parameters] - precomputed = [(getattr(self._clusterer, param) == 'precomputed') + if param in signature(self.clusterer.__init__).parameters] + precomputed = [(getattr(self.clusterer, param) == 'precomputed') for param in params] if not precomputed: self._precomputed = False @@ -94,7 +89,7 @@ def _validate_clusterer(self, default=DBSCAN()): self._precomputed = precomputed[0] else: raise NotImplementedError("Behaviour when metric and affinity " - "are both set to 'precomputed' not yet" + "are both set to 'precomputed' not yet " "implemented by ParallelClustering.") def fit(self, X, y=None, sample_weight=None): @@ -126,8 +121,16 @@ def fit(self, X, y=None, sample_weight=None): self : object """ - self._validate_clusterer() X_tot, masks = X + check_array(X_tot, ensure_2d=True) + check_array(masks, ensure_2d=True) + if masks.dtype != np.bool_: + raise TypeError("`masks` must be a boolean array.") + if len(X_tot) != len(masks): + raise ValueError("`X_tot` and `masks` must have the same number " + "of rows.") + self._validate_clusterer() + if sample_weight is not None: sample_weights = [sample_weight[masks[:, i]] for i in range(masks.shape[1])] @@ -139,12 +142,13 @@ def fit(self, X, y=None, sample_weight=None): else: single_fitter = self._fit_single_abs_labels - self.clusterers_ = Parallel(n_jobs=self.n_jobs, - prefer=self.parallel_backend_prefer)( - delayed(single_fitter)( - X_tot, np.flatnonzero(mask), - mask_num, sample_weight=sample_weights[mask_num]) - for mask_num, mask in enumerate(masks.T)) + self.clusterers_ = Parallel( + n_jobs=self.n_jobs, prefer=self.parallel_backend_prefer + )(delayed(single_fitter)(X_tot, + np.flatnonzero(mask), + mask_num, + sample_weight=sample_weights[mask_num]) + for mask_num, mask in enumerate(masks.T)) self.clusters_ = [clusterer.abs_labels_ for clusterer in self.clusterers_] return self @@ -167,7 +171,7 @@ def _fit_single_abs_labels_precomputed(self, X, relative_indices, mask_num, return cloned_clusterer def _fit_single(self, X, relative_indices, sample_weight): - cloned_clusterer = clone(self._clusterer) + cloned_clusterer = clone(self.clusterer) X_sub = X[relative_indices] fit_params = signature(cloned_clusterer.fit).parameters @@ -236,12 +240,14 @@ def transform(self, X, y=None): """ raise NotImplementedError( "Transforming new data with a fitted ParallelClustering object " - "not yet implemented, use fit_transform instead.") + "not yet implemented, use fit_transform instead." + ) def fit_transform(self, X, y=None, **fit_params): """Alias for :meth:`fit_predict`. - Allows for this class to be used as a step in a scikit-learn pipeline. + Allows for this class to be used as an intermediate step in a + scikit-learn pipeline. Parameters ---------- @@ -332,29 +338,6 @@ class FirstSimpleGap(ClusterMixin, BaseEstimator, Agglomerative): Parameters ---------- - relative_gap_size : float, optional, default: ``0.3`` - The fraction of the largest linkage in the dendrogram to be used as - a threshold for determining a large enough gap. 
- - max_fraction : float or None, optional, default: ``None`` - When not ``None``, the algorithm is constrained to produce no more - than ``max_fraction * (n_samples - 1)`` clusters, even if a - candidate gap is observed in the iterative process which would produce - a greater number of clusters. - - affinity : str, optional, default: ``'euclidean'`` - Metric used to compute the linkage. Can be ``'euclidean'``, ``'l1'``, - ``'l2'``, ``'manhattan'``, ``'cosine'``, or ``'precomputed'``. - If linkage is ``'ward'``, only ``'euclidean'`` is accepted. - If ``'precomputed'``, a distance matrix (instead of a similarity - matrix) is needed as input for :meth:`fit`. - - memory : None, str or object with the joblib.Memory interface, \ - optional, default: ``None`` - Used to cache the output of the computation of the tree. - By default, no caching is done. If a string is given, it is the - path to the caching directory. - linkage : ``'ward'`` | ``'complete'`` | ``'average'`` | ``'single'``, \ optional, default: ``'single'`` Which linkage criterion to use. The linkage criterion determines which @@ -369,6 +352,29 @@ class FirstSimpleGap(ClusterMixin, BaseEstimator, Agglomerative): - ``'single'`` uses the minimum of the distances between all observations of the two sets. + affinity : str, optional, default: ``'euclidean'`` + Metric used to compute the linkage. Can be ``'euclidean'``, ``'l1'``, + ``'l2'``, ``'manhattan'``, ``'cosine'``, or ``'precomputed'``. + If linkage is ``'ward'``, only ``'euclidean'`` is accepted. + If ``'precomputed'``, a distance matrix (instead of a similarity + matrix) is needed as input for :meth:`fit`. + + relative_gap_size : float, optional, default: ``0.3`` + The fraction of the largest linkage in the dendrogram to be used as + a threshold for determining a large enough gap. + + max_fraction : float, optional, default: ``1.`` + When not ``None``, the algorithm is constrained to produce no more + than ``max_fraction * n_samples`` clusters, even if a candidate gap is + observed in the iterative process which would produce a greater number + of clusters. + + memory : None, str or object with the joblib.Memory interface, \ + optional, default: ``None`` + Used to cache the output of the computation of the tree. By default, no + caching is performed. If a string is given, it is the path to the + caching directory. + Attributes ---------- n_clusters_ : int @@ -399,21 +405,20 @@ class FirstSimpleGap(ClusterMixin, BaseEstimator, Agglomerative): """ _hyperparameters = { - 'relative_gap_size': { - 'type': Real, 'in': Interval(0, 1, closed='right')}, - 'max_fraction': { - 'type': (Real, type(None)), 'in': Interval(0, 1, closed='right')}, + 'linkage': {'type': str}, 'affinity': {'type': str}, - 'linkage': {'type': str} - } + 'relative_gap_size': {'type': Real, + 'in': Interval(0, 1, closed='right')}, + 'max_fraction': {'type': Real, 'in': Interval(0, 1, closed='right')} + } - def __init__(self, relative_gap_size=0.3, max_fraction=None, - affinity='euclidean', memory=None, linkage='single'): + def __init__(self, linkage='single', affinity='euclidean', + relative_gap_size=0.3, max_fraction=1., memory=None): + self.linkage = linkage + self.affinity = affinity self.relative_gap_size = relative_gap_size self.max_fraction = max_fraction - self.affinity = affinity self.memory = memory - self.linkage = linkage def fit(self, X, y=None): """Fit the agglomerative clustering from features or distance matrix. 
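A small synthetic sketch of FirstSimpleGap with the reordered constructor arguments above (the data and the expected cluster count are illustrative, not a documented guarantee):

import numpy as np
from gtda.mapper import FirstSimpleGap

# Two well-separated blobs of points in the plane
rng = np.random.default_rng(42)
X = np.vstack([rng.normal(0., 0.1, size=(20, 2)),
               rng.normal(5., 0.1, size=(20, 2))])

# The first gap wider than 30% of the longest merge in the single-linkage
# dendrogram separates the blobs, so two clusters should be found
clusterer = FirstSimpleGap(linkage='single', affinity='euclidean',
                           relative_gap_size=0.3, max_fraction=1.)
labels = clusterer.fit_predict(X)
print(clusterer.n_clusters_)   # expected: 2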
@@ -439,8 +444,6 @@ def fit(self, X, y=None): validate_params( self.get_params(), self._hyperparameters, exclude=['memory']) - _max_fraction = 1. if self.max_fraction is None else self.max_fraction - if X.shape[0] == 1: self.labels_ = np.array([0]) self.n_clusters_ = 1 @@ -450,7 +453,7 @@ def fit(self, X, y=None): min_gap_size = self.relative_gap_size * self.distances_[-1] self.n_clusters_ = _num_clusters_simple( - self.distances_, min_gap_size, _max_fraction) + self.distances_, min_gap_size, self.max_fraction) # Cut the tree to find labels # TODO: Verify whether Daniel Mullner's implementation of this step @@ -466,42 +469,15 @@ class FirstHistogramGap(ClusterMixin, BaseEstimator, Agglomerative): Given a frequency threshold f and an initial integer k: 1) create a histogram of k equally spaced bins of the number of merges in the - dendrogram, as a function of the linkage parameter; 2) the value of - linkage at which the tree is to be cut is the first one after which a - bin of height no greater than f (i.e. a "gap") is observed; 3) if no gap is - observed, increase k and repeat 1) and 2) until termination. The algorithm - can be partially overridden to ensure that the final number of clusters - does not exceed a certain threshold, by passing a parameter `max_fraction`. + dendrogram, as a function of the linkage parameter; 2) the value of linkage + at which the tree is to be cut is the first one after which a bin of height + no greater than f (i.e. a "gap") is observed; 3) if no gap is observed, + increase k and repeat 1) and 2) until termination. The algorithm can be + partially overridden to ensure that the final number of clusters does not + exceed a certain threshold, by passing a parameter `max_fraction`. Parameters ---------- - freq_threshold : int, optional, default: ``0`` - The frequency threshold for declaring that a gap in the histogram of - merges is present. - - max_fraction : float or None, optional, default: ``None`` - When not ``None``, the algorithm is constrained to produce no more - than ``max_fraction * (n_samples - 1)`` clusters, even if a - candidate gap is observed in the iterative process which would produce - a greater number of clusters. - - n_bins_start : int, optional, default: ``5`` - The initial number of bins in the iterative process for finding a - gap in the histogram of merges. - - affinity : str, optional, default: ``'euclidean'`` - Metric used to compute the linkage. Can be ``'euclidean'``, ``'l1'``, - ``'l2'``, ``'manhattan'``, ``'cosine'``, or ``'precomputed'``. - If linkage is ``'ward'``, only ``'euclidean'`` is accepted. - If ``'precomputed'``, a distance matrix (instead of a similarity - matrix) is needed as input for :meth:`fit`. - - memory : None, str or object with the joblib.Memory interface, \ - optional, default: ``None`` - Used to cache the output of the computation of the tree. - By default, no caching is done. If a string is given, it is the - path to the caching directory. - linkage : ``'ward'`` | ``'complete'`` | ``'average'`` | ``'single'``, \ optional, default: ``'single'`` Which linkage criterion to use. The linkage criterion determines which @@ -511,11 +487,38 @@ class FirstHistogramGap(ClusterMixin, BaseEstimator, Agglomerative): - ``'ward'`` minimizes the variance of the clusters being merged. - ``'average'`` uses the average of the distances of each observation of the two sets. - - ``'complete'`` linkage uses the maximum distances between - all observations of the two sets. 
+ - ``'complete'`` linkage uses the maximum distances between all + observations of the two sets. - ``'single'`` uses the minimum of the distances between all observations of the two sets. + affinity : str, optional, default: ``'euclidean'`` + Metric used to compute the linkage. Can be ``'euclidean'``, ``'l1'``, + ``'l2'``, ``'manhattan'``, ``'cosine'``, or ``'precomputed'``. + If linkage is ``'ward'``, only ``'euclidean'`` is accepted. + If ``'precomputed'``, a distance matrix (instead of a similarity + matrix) is needed as input for :meth:`fit`. + + freq_threshold : int, optional, default: ``0`` + The frequency threshold for declaring that a gap in the histogram of + merges is present. + + max_fraction : float, optional, default: ``1.`` + When not ``None``, the algorithm is constrained to produce no more + than ``max_fraction * n_samples`` clusters, even if a candidate gap is + observed in the iterative process which would produce a greater number + of clusters. + + n_bins_start : int, optional, default: ``5`` + The initial number of bins in the iterative process for finding a gap + in the histogram of merges. + + memory : None, str or object with the joblib.Memory interface, \ + optional, default: ``None`` + Used to cache the output of the computation of the tree. By default, no + caching is performed. If a string is given, it is the path to the + caching directory. + Attributes ---------- n_clusters_ : int @@ -552,24 +555,24 @@ class FirstHistogramGap(ClusterMixin, BaseEstimator, Agglomerative): """ _hyperparameters = { - 'freq_threshold': { - 'type': int, 'in': Interval(0, np.inf, closed='left')}, - 'max_fraction': { - 'type': (Real, type(None)), 'in': Interval(0, 1, closed='right')}, - 'n_bins_start': { - 'type': int, 'in': Interval(1, np.inf, closed='left')}, + 'linkage': {'type': str}, 'affinity': {'type': str}, - 'linkage': {'type': str} - } - - def __init__(self, freq_threshold=0, max_fraction=None, n_bins_start=5, - affinity='euclidean', memory=None, linkage='single'): + 'freq_threshold': {'type': int, + 'in': Interval(0, np.inf, closed='left')}, + 'max_fraction': {'type': Real, 'in': Interval(0, 1, closed='right')}, + 'n_bins_start': {'type': int, + 'in': Interval(1, np.inf, closed='left')}, + } + + def __init__(self, linkage='single', affinity='euclidean', + freq_threshold=0, max_fraction=1., n_bins_start=5, + memory=None): + self.linkage = linkage + self.affinity = affinity self.freq_threshold = freq_threshold self.max_fraction = max_fraction self.n_bins_start = n_bins_start - self.affinity = affinity self.memory = memory - self.linkage = linkage def fit(self, X, y=None): """Fit the agglomerative clustering from features or distance matrix. @@ -595,8 +598,6 @@ def fit(self, X, y=None): validate_params( self.get_params(), self._hyperparameters, exclude=['memory']) - _max_fraction = 1. 
if self.max_fraction is None else self.max_fraction - if X.shape[0] == 1: self.labels_ = np.array([0]) self.n_clusters_ = 1 @@ -606,7 +607,7 @@ def fit(self, X, y=None): self.n_clusters_ = _num_clusters_histogram( self.distances_, self.freq_threshold, self.n_bins_start, - _max_fraction) + self.max_fraction) # Cut the tree to find labels # TODO: Verify whether Daniel Mullner's implementation of this step diff --git a/gtda/mapper/cover.py b/gtda/mapper/cover.py index 9fd85b977..b7c13a10b 100644 --- a/gtda/mapper/cover.py +++ b/gtda/mapper/cover.py @@ -88,7 +88,7 @@ class OneDimensionalCover(BaseEstimator, TransformerMixin): 'kind': {'type': str, 'in': ['uniform', 'balanced']}, 'n_intervals': {'type': int, 'in': Interval(1, np.inf, closed='left')}, 'overlap_frac': {'type': float, 'in': Interval(0, 1, closed='neither')} - } + } def __init__(self, kind='uniform', n_intervals=10, overlap_frac=0.1): self.kind = kind @@ -314,11 +314,11 @@ def _limits_from_ranks(self, X_rank, X, left_ranks, right_ranks): left_limits = np.array([ X[nonzero_indices[0]] if nonzero_indices.size else -np.inf for nonzero_indices in left_indices - ]) + ]) right_limits = np.array([ X[nonzero_indices[0]] if nonzero_indices.size else np.inf for nonzero_indices in right_indices - ]) + ]) left_limits[0] = -np.inf right_limits[-1] = np.inf return left_limits, right_limits @@ -384,7 +384,7 @@ class CubicalCover(BaseEstimator, TransformerMixin): 'kind': {'type': str, 'in': ['uniform', 'balanced']}, 'n_intervals': {'type': int, 'in': Interval(1, np.inf, closed='left')}, 'overlap_frac': {'type': float, 'in': Interval(0, 1, closed='neither')} - } + } def __init__(self, kind='uniform', n_intervals=10, overlap_frac=0.1): self.kind = kind @@ -414,7 +414,7 @@ def _fit(self, X): self._coverers = [ partial(self._clone_and_apply_to_column, X, coverer, fitter)(i) for i in range(X.shape[1]) - ] + ] self._n_features_fit = X.shape[1] return self @@ -537,13 +537,13 @@ def fit_transform(self, X, y=None, **fit_params): coverer = OneDimensionalCover(kind=self.kind, n_intervals=self.n_intervals, overlap_frac=self.overlap_frac) - coverers = [clone(coverer) for i in range(Xt.shape[1])] + coverers = [clone(coverer) for _ in range(Xt.shape[1])] fit_transformer = '_fit_transform_balanced' covers = [ partial(self._clone_and_apply_to_column, Xt, coverer, fit_transformer)(i) for i, coverer in enumerate(coverers) - ] + ] # Only store attributes if above succeeds self._coverers = coverers self._n_features_fit = Xt.shape[1] @@ -555,7 +555,7 @@ def _combine_one_dim_covers(covers): # Stack intervals for each cover intervals = ( [cover[:, i] for i in range(cover.shape[1])] for cover in covers - ) + ) # Calculate masks for pullback cover Xt = np.array([np.logical_and.reduce(t) diff --git a/gtda/mapper/filter.py b/gtda/mapper/filter.py index 0b6cc2cf1..7bca5c96a 100644 --- a/gtda/mapper/filter.py +++ b/gtda/mapper/filter.py @@ -5,7 +5,7 @@ import numpy as np from scipy.spatial.distance import pdist, squareform -from scipy.special import entr +from scipy.stats import entropy from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.validation import check_array, check_is_fitted @@ -33,12 +33,12 @@ class Eccentricity(BaseEstimator, TransformerMixin): already a distance matrix. If not ``'precomputed'``, it may be anything allowed by :func:`scipy.spatial.distance.pdist`. - metric_params : dict or None, optional, default: ``None`` + metric_params : dict, optional, default: ``{}`` Additional keyword arguments for the metric function. 
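A short sketch of the intended call pattern for the new ``metric_params`` default, whose contents are now forwarded directly to :func:`scipy.spatial.distance.pdist`; the point cloud and the choice of Minkowski order are illustrative assumptions:

import numpy as np
from gtda.mapper import Eccentricity

X = np.random.random((20, 3))  # illustrative point cloud

# metric_params is passed verbatim to pdist; here it sets the Minkowski order
ecc = Eccentricity(exponent=2, metric="minkowski", metric_params={"p": 3})
Xt = ecc.fit_transform(X)
print(Xt.shape)  # (20, 1): one eccentricity value per sample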
""" - def __init__(self, exponent=2, metric='euclidean', metric_params=None): + def __init__(self, exponent=2, metric='euclidean', metric_params={}): self.exponent = exponent self.metric = metric self.metric_params = metric_params @@ -70,11 +70,7 @@ def fit(self, X, y=None): # Evaluate performance impact of doing this. check_array(X) - if self.metric_params is None: - self.effective_metric_params_ = dict() - else: - self.effective_metric_params_ = self.metric_params.copy() - + self._is_fitted = True return self def transform(self, X, y=None): @@ -96,12 +92,13 @@ def transform(self, X, y=None): Column vector of eccentricities of points in `X`. """ - check_is_fitted(self) + check_is_fitted(self, '_is_fitted') Xt = check_array(X) if self.metric != 'precomputed': Xt = squareform( - pdist(Xt, metric=self.metric, **self.effective_metric_params_)) + pdist(Xt, metric=self.metric, **self.metric_params) + ) Xt = np.linalg.norm(Xt, axis=1, ord=self.exponent, keepdims=True) return Xt @@ -111,9 +108,9 @@ def transform(self, X, y=None): class Entropy(BaseEstimator, TransformerMixin): """Entropy of rows in a two-dimensional array. - The rows of the array are interpreted as probability vectors, - after taking absolute values if necessary and normalizing. Then, - their Shannon entropies are computed and returned. + The rows of the array are interpreted as probability vectors, after taking + absolute values if necessary and normalizing. Then, their (base 2) Shannon + entropies are computed and returned. """ @@ -175,8 +172,7 @@ def transform(self, X, y=None): "value to calculate probabilities.") Xt = np.abs(Xt) - Xt = Xt / Xt.sum(axis=1, keepdims=True) - Xt = entr(Xt).sum(axis=1, keepdims=True) / np.log(2) + Xt = entropy(Xt, base=2, axis=1)[:, None] return Xt diff --git a/gtda/mapper/nerve.py b/gtda/mapper/nerve.py index 87e3d54be..6c0289070 100644 --- a/gtda/mapper/nerve.py +++ b/gtda/mapper/nerve.py @@ -1,8 +1,8 @@ -"""Construct the nerve of a Mapper cover.""" +"""Construct the nerve of a refined Mapper cover.""" # License: GNU AGPLv3 from functools import reduce -from itertools import combinations +from itertools import combinations, filterfalse from operator import iconcat import igraph as ig @@ -10,60 +10,74 @@ from sklearn.base import BaseEstimator, TransformerMixin +def _limit_mapping(mapping): + """Given a 1D array interpreted as a function + :math:`f : \\{0, \\ldots, n - 1\\}} \to \\{0, \\ldots, n - 1\\}}`, such + that :math:`f^{(k)} = f^{(k + 1)}` for some :math:`k`, find the 1D array + corresponding to :math:`f^{(k)}`.""" + terminal_states = np.empty_like(mapping) + for i, initial_target_idx in enumerate(mapping): + temp_target_idx = i + next_target_idx = initial_target_idx + while temp_target_idx != next_target_idx: + temp_target_idx = mapping[temp_target_idx] + next_target_idx = mapping[mapping[temp_target_idx]] + terminal_states[i] = temp_target_idx + + return terminal_states + + class Nerve(BaseEstimator, TransformerMixin): - """One-dimensional skeleton of the nerve of a Mapper cover, i.e. the - Mapper graph. + """1-skeleton of the nerve of a refined Mapper cover, i.e. the Mapper + graph. This transformer is the final step in the :class:`gtda.mapper.pipeline.MapperPipeline` objects created - by :func:`gtda.mapper.make_mapper_pipeline`. It is not intended for - direct use. + by :func:`gtda.mapper.make_mapper_pipeline`. It corresponds the last two + arrows in `this diagram <../../../../_images/mapper_pipeline.svg>`_. + + This transformer is not intended for direct use. 
Parameters ---------- min_intersection : int, optional, default: ``1`` - The minimum size of the intersection between Mapper cover sets - required to create an edge in the Mapper graph. + Minimum size of the intersection, between data subsets associated to + any two Mapper nodes, required to create an edge between the nodes in + the Mapper graph. Must be positive. + + store_edge_elements : bool, optional, default: ``False`` + Whether the indices of data elements associated to Mapper edges (i.e. + in the intersections allowed by `min_intersection`) should be stored in + the :class:`igraph.Graph` object output by :meth:`fit_transform`. When + ``True``, might lead to a large :class:`igraph.Graph` object. + + contract_nodes : bool, optional, default: ``False`` + If ``True``, any node representing a cluster which is a strict subset + of the cluster corresponding to another node is eliminated, and only + one maximal node is kept. Attributes ---------- - X_ : list of tuple - Nodes of the Mapper graph obtained from the input data for - :meth:`fit`. It is a flattened version of the input Mapper cover, - with the addition of a globally unique node ID as the first entry in - each tuple. Created only when :meth:`fit` is called. - - edges_ : list of dict - Edges of the Mapper graph obtained from the input data for - :meth:`fit`. Each edge is a dictionary with two keys: - ``'node_indices'`` is mapped to a pair of triples characterising the - two adjacent nodes; ``'intersection'`` is mapped to the array of - indices of points in the intersection between the two nodes. Created - only when :meth:`fit` is called. + graph_ : :class:`igraph.Graph` object + Mapper graph obtained from the input data. Created when :meth:`fit` is + called. """ - def __init__(self, min_intersection=1): + def __init__(self, min_intersection=1, store_edge_elements=False, + contract_nodes=False): self.min_intersection = min_intersection + self.store_edge_elements = store_edge_elements + self.contract_nodes = contract_nodes def fit(self, X, y=None): - """Compute and store the nodes and edges of the Mapper graph, - and return the estimator. + """Compute the Mapper graph as in :meth:`fit_transform`, but store the + graph as :attr:`graph_` and return the estimator. Parameters ---------- X : list of list of tuple - Input data structure describing an abstract Mapper cover. Each - sublist corresponds to a (non-empty) pullback cover set -- - equivalently, to a cover set in the filter range which has - non-empty preimage -- and contains triples of the form ``( \ - pullback_set_label, partial_cluster_label, indices)`` where - ``partial_cluster_label`` is a cluster label within the pullback - cover set identified by ``pullback_set_label``, and ``indices`` - is the array of indices of points belonging to cluster ``( \ - pullback_set_label, partial_cluster_label)``. In the context of a - :class:`gtda.mapper.MapperPipeline`, this is the output of the - clustering step. + See :meth:`fit_transform`. y : None There is no need for a target in a transformer, yet the pipeline @@ -74,27 +88,28 @@ def fit(self, X, y=None): self : object """ - # TODO: Include a validation step for X - self.X_, self.edges_ = self._graph_data_creation(X) + self.graph_ = self.fit_transform(X, y=y) return self - def fit_transform(self, X, y=None, **fit_params): - """Construct a Mapper graph from an abstract Mapper cover `X`. + def fit_transform(self, X, y=None): + """Construct a Mapper graph from a refined Mapper cover. 
Parameters ---------- X : list of list of tuple - Input data structure describing an abstract Mapper cover. Each - sublist corresponds to a (non-empty) pullback cover set -- + Data structure describing a cover of a dataset (e.g. as depicted in + `this diagram <../../../../_images/mapper_pipeline.svg>`_) produced + by the clustering step of a :class:`gtda.mapper.MapperPipeline`. + Each sublist corresponds to a (non-empty) pullback cover set -- equivalently, to a cover set in the filter range which has - non-empty preimage -- and contains triples of the form ``( \ - pullback_set_label, partial_cluster_label, indices)`` where - ``partial_cluster_label`` is a cluster label within the pullback - cover set identified by ``pullback_set_label``, and ``indices`` - is the array of indices of points belonging to cluster ``( \ - pullback_set_label, partial_cluster_label)``. In the context of a - :class:`gtda.mapper.MapperPipeline`, this is the output of the - clustering step. + non-empty preimage. It contains triples of the form + ``(pullback_set_label, partial_cluster_label, node_elements)`` + where ``partial_cluster_label`` is a cluster label within the + pullback cover set identified by ``pullback_set_label``, and + ``node_elements`` is an array of integer indices. To each pair + ``(pullback_set_label, partial_cluster_label)`` there corresponds + a unique node in the output Mapper graph. This node represents + the data subset defined by the indices in ``node_elements``. y : None There is no need for a target in a transformer, yet the pipeline @@ -103,47 +118,128 @@ def fit_transform(self, X, y=None, **fit_params): Returns ------- graph : :class:`igraph.Graph` object - Mapper graph. Edges exist between two Mapper cover sets if and - only if the size of the intersection between the two sets is no - less than `min_intersection`. + Undirected Mapper graph according to `X` and `min_intersection`. + Each node is an :class:`igraph.Vertex` object with attributes + ``"pullback_set_label"``, ``"partial_cluster_label"`` and + ``"node_elements"``. Each edge is an :class:`igraph.Edge` object + with a ``"weight"`` attribute which is equal to the size of the + intersection between the data subsets associated to its two nodes. + If `store_edge_elements` is ``True`` each edge also has an + additional attribute ``"edge_elements"``. """ # TODO: Include a validation step for X - _X, _edges = self._graph_data_creation(X) - - # Graph construction - graph = ig.Graph() - graph.add_vertices([vertex[0] for vertex in _X]) - graph.add_edges([ - (edge['node_indices'][0][0], edge['node_indices'][1][0]) - for edge in _edges - ]) - graph['node_metadata'] = dict( - zip(['node_id', 'pullback_set_label', 'partial_cluster_label', - 'node_elements'], - zip(*_X))) + # Graph construction -- vertices with their metadata + nodes = reduce(iconcat, X, []) + graph = ig.Graph(len(nodes)) + + # Since `nodes` is a list, say of length N, of triples of the form + # (pullback_set_label, partial_cluster_label, node_elements), + # zip(*nodes) generates three tuples of length N, each corresponding to + # a type of node attribute. + node_attributes = zip(*nodes) + attribute_names = ["pullback_set_label", "partial_cluster_label", + "node_elements"] + for i, node_attribute in enumerate(node_attributes): + graph.vs[attribute_names[i]] = node_attribute + + # Graph construction -- edges with weights given by intersection sizes. 
+ # In general, we need all information in `nodes` to narrow down the set + # of combinations to check when `contract_nodes` is True + node_index_pairs, weights, intersections, mapping = \ + self._generate_edge_data(nodes) + graph.es["weight"] = 1 + graph.add_edges(node_index_pairs) + graph.es["weight"] = weights + if self.store_edge_elements: + graph.es["edge_elements"] = intersections + if self.contract_nodes: + # Due to the order in which itertools.combinations produces pairs, + # and to the preference given to node 1 in the if-elif-else clause + # in `_subset_check_metadata_append`, `mapping` is guaranteed to + # send everything to one of its fixed points after sufficiently + # many repeated applications and, by construction, no two pairs of + # indices in `_limit_mapping(mapping)` can correspond to data + # subsets which are in a subset relation. Thus the nodes are + # correctly contracted by `_limit_mapping(mapping)`. + limit_mapping = _limit_mapping(mapping) + graph.contract_vertices(limit_mapping, + combine_attrs="first") + graph.delete_vertices([i for i in graph.vs.indices + if i != limit_mapping[i]]) + return graph - def _graph_data_creation(self, X): - X_ = reduce(iconcat, X, []) - # Preprocess X by 1) flattening and 2) extending each tuple - X_ = [(node_info[0], *node_info[1]) - for node_info in zip(range(len(X_)), X_)] - edges_ = self._generate_edges(X_) - return X_, edges_ - - @staticmethod - def _pairwise_intersections(min_intersection, node_pair): - data = dict() - node_1, node_2 = node_pair - data['node_indices'] = tuple((node_1[0:3], node_2[0:3])) - data['intersection'] = np.intersect1d(node_1[3], node_2[3]) - if data['intersection'].size >= min_intersection: - yield data - - def _generate_edges(self, nodes): - node_tuples = combinations(nodes, 2) - for pair in node_tuples: - for intersection in \ - self._pairwise_intersections(self.min_intersection, pair): - yield intersection + def _generate_edge_data(self, nodes): + def _in_same_pullback_set(_node_tuple): + return _node_tuple[0][1][0] == _node_tuple[1][1][0] + + def _do_nothing(*args): + pass + + def _intersections_append(_intersection): + return intersections.append(_intersection) + + def _metadata_append( + _node_1_idx, _node_2_idx, _intersection_size, _intersection, + *args + ): + if _intersection_size >= self.min_intersection: + # Add edge (as a node tuple) to list of node index pairs + node_index_pairs.append((_node_1_idx, _node_2_idx)) + weights.append(_intersection_size) + intersection_behavior(_intersection) + + def _subset_check_metadata_append( + _node_1_idx, _node_2_idx, _intersection_size, _intersection, + _node_1_elements, _node_2_elements + ): + if _intersection_size == len(_node_2_elements): + # Node 2 is contained in node 1 and we remove it in favour of + # node 1. + mapping[_node_2_idx] = _node_1_idx + elif _intersection_size == len(_node_1_elements): + # Node 1 is strictly contained in node 2 and we remove it in + # favour of node 2. + mapping[_node_1_idx] = _node_2_idx + else: + # Edge exists provided `_intersection_size` is large enough + _metadata_append(_node_1_idx, _node_2_idx, _intersection_size, + _intersection) + + node_tuples = combinations(enumerate(nodes), 2) + + node_index_pairs = [] + weights = [] + intersections = [] + + # Choose whether intersections are stored or not. + # `intersection_behavior` is in scope for `_metadata_append` and + # `_subset_check_metadata_append`. 
+ if self.store_edge_elements: + intersection_behavior = _intersections_append + else: + intersection_behavior = _do_nothing + + if self.contract_nodes: + mapping = np.arange(len(nodes)) + behavior = _subset_check_metadata_append + else: + mapping = None + behavior = _metadata_append + + # No need to check for intersections within each pullback set as the + # input is assumed to be a refined Mapper cover + for node_tuple in filterfalse(_in_same_pullback_set, node_tuples): + ((node_1_idx, (_, _, node_1_elements)), + (node_2_idx, (_, _, node_2_elements))) = node_tuple + intersection = np.intersect1d(node_1_elements, node_2_elements) + intersection_size = len(intersection) + + if intersection_size: + behavior(node_1_idx, node_2_idx, intersection_size, + intersection, node_1_elements, node_2_elements) + else: + continue + + return node_index_pairs, weights, intersections, mapping diff --git a/gtda/mapper/pipeline.py b/gtda/mapper/pipeline.py index 5f65d5b73..8dc48f89a 100644 --- a/gtda/mapper/pipeline.py +++ b/gtda/mapper/pipeline.py @@ -8,16 +8,16 @@ from .utils._list_feature_union import ListFeatureUnion from .utils.pipeline import transformer_from_callable_on_rows, identity -global_pipeline_params = ('memory', 'verbose') -nodes_params = ('scaler', 'filter_func', 'cover') -clust_prepr_params = ('clustering_preprocessing',) -clust_params = ('clusterer', 'n_jobs', - 'parallel_backend_prefer') -nerve_params = ('min_intersection',) -clust_prepr_params_prefix = 'pullback_cover__' -nodes_params_prefix = 'pullback_cover__map_and_cover__' -clust_params_prefix = 'clustering__' -nerve_params_prefix = 'nerve__' +global_pipeline_params = ("memory", "verbose") +nodes_params = ("scaler", "filter_func", "cover") +clust_prepr_params = ("clustering_preprocessing",) +clust_params = ("clusterer", "n_jobs", + "parallel_backend_prefer") +nerve_params = ("min_intersection", "store_edge_elements", "contract_nodes") +clust_prepr_params_prefix = "pullback_cover__" +nodes_params_prefix = "pullback_cover__map_and_cover__" +clust_params_prefix = "clustering__" +nerve_params_prefix = "nerve__" class MapperPipeline(Pipeline): @@ -43,10 +43,10 @@ class MapperPipeline(Pipeline): >>> pipe = make_mapper_pipeline(filter_func=filter_func, ... cover=cover, ... 
clusterer=clusterer) - >>> print(pipe.get_mapper_params()['clusterer__eps']) + >>> print(pipe.get_mapper_params()["clusterer__eps"]) 0.5 >>> pipe.set_params(clusterer___eps=0.1) - >>> print(pipe.get_mapper_params()['clusterer__eps']) + >>> print(pipe.get_mapper_params()["clusterer__eps"]) 0.1 See also @@ -130,14 +130,14 @@ def _clean_dict_keys(kwargs, prefix): key[len(prefix):]: kwargs[key] for key in kwargs if (key.startswith(prefix) - and not key.startswith(prefix + 'steps') - and not key.startswith(prefix + 'memory') - and not key.startswith(prefix + 'verbose') - and not key.startswith(prefix + 'transformer_list') - and not key.startswith(prefix + 'n_jobs') - and not key.startswith(prefix + 'transformer_weights') - and not key.startswith(prefix + 'map_and_cover')) - } + and not key.startswith(prefix + "steps") + and not key.startswith(prefix + "memory") + and not key.startswith(prefix + "verbose") + and not key.startswith(prefix + "transformer_list") + and not key.startswith(prefix + "n_jobs") + and not key.startswith(prefix + "transformer_weights") + and not key.startswith(prefix + "map_and_cover")) + } def make_mapper_pipeline(scaler=None, @@ -146,13 +146,15 @@ def make_mapper_pipeline(scaler=None, clustering_preprocessing=None, clusterer=None, n_jobs=None, - parallel_backend_prefer='threads', + parallel_backend_prefer=None, graph_step=True, min_intersection=1, + store_edge_elements=False, + contract_nodes=False, memory=None, verbose=False): """Construct a MapperPipeline object according to the specified Mapper - steps. [1]_ + steps [1]_. The role of this function's main parameters is illustrated in `this diagram <../../../../_images/mapper_pipeline.svg>`_. All computational steps may @@ -165,7 +167,7 @@ def make_mapper_pipeline(scaler=None, object with a ``fit_transform`` method. filter_func : object, callable or None, optional, default: ``None`` - If `None``, PCA (:class:`sklearn.decomposition.PCA`) with 2 + If ``None``, PCA (:class:`sklearn.decomposition.PCA`) with 2 components and default parameters is used as a default filter function. Otherwise, it may be an object with a ``fit_transform`` method, or a callable acting on one-dimensional arrays -- in which @@ -198,22 +200,35 @@ def make_mapper_pipeline(scaler=None, in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. - parallel_backend_prefer : ``'processes'`` | ``'threads'``, optional, \ - default: ``'threads'`` + parallel_backend_prefer : ``"processes"`` | ``"threads"`` | ``None``, \ + optional, default: ``None`` Soft hint for the default joblib backend to use in a joblib-parallel application of the clustering step across pullback cover sets. To be used in conjunction with `n_jobs`. The default process-based backend is - 'loky' and the default thread-based backend is 'threading'. See [2]_. + "loky" and the default thread-based backend is "threading". See [2]_. graph_step : bool, optional, default: ``True`` Whether the resulting pipeline should stop at the calculation of the - Mapper cover, or include the construction of the Mapper graph. + (refined) Mapper cover, or include the construction of the Mapper + graph. min_intersection : int, optional, default: ``1`` Minimum size of the intersection between clusters required for creating an edge in the Mapper graph. Ignored if `graph_step` is set to ``False``. + store_edge_elements : bool, optional, default: ``False`` + Whether the indices of data elements associated to Mapper edges (i.e. 
+ in the intersections allowed by `min_intersection`) should be stored in + the :class:`igraph.Graph` object output by the pipeline's + :meth:`fit_transform`. When ``True``, might lead to large + :class:`igraph.Graph` objects. + + contract_nodes : bool, optional, default: ``False`` + If ``True``, any node representing a cluster which is a strict subset + of the cluster corresponding to another node is eliminated, and only + one maximal node is kept. + memory : None, str or object with the joblib.Memory interface, \ optional, default: ``None`` Used to cache the fitted transformers which make up the pipeline. This @@ -231,7 +246,12 @@ def make_mapper_pipeline(scaler=None, Returns ------- mapper_pipeline : :class:`~gtda.mapper.pipeline.MapperPipeline` object - Output Mapper pipeline. + Output Mapper pipeline. The output of `mapper_pipeline`'s + :meth:`fit_transform` is: a) an :class:`igraph.Graph` object as per the + output of :class:`~gtda.mapper.nerve.Nerve`, when `graph_step` is + ``True``; b) a list of lists of tuples as per the output of + :class:`~gtda.mapper.ParallelClustering` (or input of + :class:`~gtda.mapper.Nerve`), otherwise. Examples -------- @@ -243,31 +263,30 @@ def make_mapper_pipeline(scaler=None, >>> print(mapper.__class__) >>> mapper_params = mapper.get_mapper_params() - >>> print(mapper_params['filter_func'].__class__) + >>> print(mapper_params["filter_func"].__class__) - >>> print(mapper_params['cover'].__class__) + >>> print(mapper_params["cover"].__class__) - >>> print(mapper_params['clusterer'].__class__) + >>> print(mapper_params["clusterer"].__class__) >>> X = np.random.random((10000, 4)) # 10000 points in 4-dimensional space >>> mapper_graph = mapper.fit_transform(X) # Create the mapper graph >>> print(type(mapper_graph)) igraph.Graph - >>> # Node metadata stored as dict in graph object - >>> print(mapper_graph['node_metadata'].keys()) - dict_keys(['node_id', 'pullback_set_label', 'partial_cluster_label', - 'node_elements']) + >>> # Node metadata stored as vertex attributes in graph object + >>> print(mapper_graph.vs.attributes()) + ['pullback_set_label', 'partial_cluster_label', 'node_elements'] >>> # Find which points belong to first node of graph - >>> node_id = mapper_graph['node_metadata']['node_id'] - >>> node_elements = mapper_graph['node_metadata']['node_elements'] - >>> print(f"Node ID: {node_id[0]}, Node elements: {node_elements[0]}, " - ... f"Data points: {X[node_elements[0]]}") + >>> node_id = 0 + >>> node_elements = mapper_graph.vs["node_elements"] + >>> print(f"Node ID: {node_id}, Node elements: {node_elements[node_id]}, " + ... 
f"Data points: {X[node_elements[node_id]") Node Id: 0, Node elements: [8768], Data points: [[0.01838998 0.76928754 0.98199244 0.0074299 ]] Using a scaler from scikit-learn, a filter function from - gtda.mapper.filter, and a clusterer from gtda.mapper.cluster + ``gtda.mapper.filter``, and a clusterer from ``gtda.mapper.cluster`` >>> from sklearn.preprocessing import MinMaxScaler >>> from gtda.mapper import Projection, FirstHistogramGap @@ -328,7 +347,7 @@ def make_mapper_pipeline(scaler=None, See also -------- - MapperPipeline, :func:`~gtda.mapper.utils.decorators.method_to_transform` + MapperPipeline, method_to_transform References ---------- @@ -359,7 +378,7 @@ def make_mapper_pipeline(scaler=None, if filter_func is None: from sklearn.decomposition import PCA _filter_func = PCA(n_components=2) - elif not hasattr(filter_func, 'fit_transform'): + elif not hasattr(filter_func, "fit_transform"): _filter_func = transformer_from_callable_on_rows(filter_func) else: _filter_func = filter_func @@ -382,23 +401,27 @@ def make_mapper_pipeline(scaler=None, _clusterer = clusterer map_and_cover = Pipeline( - steps=[('scaler', _scaler), - ('filter_func', _filter_func), - ('cover', _cover)], + steps=[("scaler", _scaler), + ("filter_func", _filter_func), + ("cover", _cover)], verbose=verbose) all_steps = [ - ('pullback_cover', ListFeatureUnion( - [('clustering_preprocessing', _clustering_preprocessing), - ('map_and_cover', map_and_cover)])), - ('clustering', ParallelClustering( - clusterer=_clusterer, + ("pullback_cover", ListFeatureUnion( + [("clustering_preprocessing", _clustering_preprocessing), + ("map_and_cover", map_and_cover)])), + ("clustering", ParallelClustering( + _clusterer, n_jobs=n_jobs, parallel_backend_prefer=parallel_backend_prefer)) - ] + ] if graph_step: - all_steps.append(('nerve', Nerve(min_intersection=min_intersection))) + all_steps.append( + ("nerve", Nerve(min_intersection=min_intersection, + store_edge_elements=store_edge_elements, + contract_nodes=contract_nodes)) + ) mapper_pipeline = MapperPipeline( steps=all_steps, memory=memory, verbose=verbose) diff --git a/gtda/mapper/tests/test_cluster.py b/gtda/mapper/tests/test_cluster.py index e7ca521f4..1898e8061 100644 --- a/gtda/mapper/tests/test_cluster.py +++ b/gtda/mapper/tests/test_cluster.py @@ -1,24 +1,108 @@ +"""Testing for FirstHistogramGap and FirstSimpleGap clusterers, and testing +for ParallelClustering.""" +# License: GNU AGPLv3 + import numpy as np +import pytest +import sklearn as sk from hypothesis import given from hypothesis.extra.numpy import arrays from hypothesis.strategies import floats, integers, composite from numpy.testing import assert_almost_equal from scipy.spatial import distance_matrix -from gtda.mapper import FirstHistogramGap, FirstSimpleGap +from gtda.mapper import ParallelClustering, FirstHistogramGap, FirstSimpleGap + + +def test_parallel_clustering_bad_input(): + pc = ParallelClustering(sk.cluster.DBSCAN()) + X = [np.random.random((5, 4)), np.random.random((5, 4))] + + with pytest.raises(TypeError, match="`masks` must be a boolean array."): + pc.fit(X) + + X[1] = np.ones((6, 4), dtype=bool) + with pytest.raises(ValueError, + match="`X_tot` and `masks` must have the same number"): + pc.fit(X) + + +def test_parallel_clustering_bad_clusterer(): + pc = ParallelClustering(sk.decomposition.PCA()) + X = [np.random.random((5, 4)), np.ones((5, 4), dtype=bool)] + + with pytest.raises(TypeError, match="`clusterer` must be an instance of"): + pc.fit(X) + + +def 
test_parallel_clustering_transform_not_implemented(): + pc = ParallelClustering(sk.cluster.DBSCAN()) + X = [np.random.random((5, 4)), np.ones((5, 4), dtype=bool)] + + with pytest.raises(NotImplementedError): + pc.transform(X) + + +@pytest.mark.parametrize("sample_weight", [None, np.random.random(5)]) +def test_parallel_clustering_kmeans(sample_weight): + kmeans = sk.cluster.KMeans(n_clusters=2, random_state=0) + pc = ParallelClustering(kmeans) + X = [np.random.random((5, 4)), np.ones((5, 4), dtype=bool)] + single_labels = kmeans.fit_predict(X[0], sample_weight=sample_weight) + unique_labels, inverse = np.unique(single_labels, return_inverse=True) + + res = pc.fit_predict(X, sample_weight=sample_weight) + res = [[(i, label, list(indices)) for [i, label, indices] in sublist] + for sublist in res] + exp = [[(i, label, list(np.flatnonzero(inverse == label))) + for label in unique_labels] + for i in range(X[1].shape[1])] + + assert res == exp + + +def test_parallel_clustering_metric_affinity_precomputed_not_implemented(): + class DummyClusterer(sk.base.BaseEstimator, sk.base.ClusterMixin): + def __init__(self, metric="precomputed", affinity="precomputed"): + self.metric = metric + self.affinity = affinity + + pc = ParallelClustering(DummyClusterer()) + X = [np.random.random((5, 4)), np.ones((5, 4), dtype=bool)] + + with pytest.raises(NotImplementedError, + match="Behaviour when metric and affinity"): + pc.fit(X) + + +def test_parallel_clustering_precomputed(): + pc = ParallelClustering(sk.cluster.DBSCAN()) + masks = np.random.choice([True, False], size=20).reshape((10, 2)) + X = [np.random.random((10, 4)), masks] + pc_precomp = ParallelClustering(sk.cluster.DBSCAN(metric="precomputed")) + X_precomp = [sk.metrics.pairwise_distances(X[0]), masks] + + res = pc.fit_predict(X) + res_precomp = pc_precomp.fit_predict(X_precomp) + res = [[(i, label, list(indices)) for [i, label, indices] in sublist] + for sublist in res] + res_precomp = [[(i, label, list(indices)) + for [i, label, indices] in sublist] + for sublist in res_precomp] + + assert res == res_precomp @composite def get_one_cluster(draw, n_points, dim): - """Get an array of n_points in a dim-dimensional space, - in the [-1,1]-hypercube""" - f = draw(arrays(dtype=np.float, - elements=floats(allow_nan=False, - allow_infinity=False, - min_value=-1., - max_value=1.), - shape=(n_points, dim), unique=False)) - return f + """Get an array of n_points in a dim-dimensional space, in the + [-1, 1]-hypercube.""" + return draw(arrays(dtype=np.float, + elements=floats(allow_nan=False, + allow_infinity=False, + min_value=-1., + max_value=1.), + shape=(n_points, dim), unique=False)) @composite @@ -56,8 +140,8 @@ def get_input(draw, n_clusters=None, n_points_per_cluster=None, @given(inp=get_input(n_clusters=1, n_points_per_cluster=1, std=1)) def test_on_trivial_input(inp): - """Test that with one cluster, and one point, - we always get one cluster, regardless of its location.""" + """Test that with one cluster, and one point, we always get one cluster, + regardless of its location.""" n_points_per_cluster, n_clusters, dim, pts = inp fs = FirstSimpleGap() fs = fs.fit(pts) @@ -70,13 +154,12 @@ def test_on_trivial_input(inp): @given(inp=get_input(std=0.02)) def test_firstsimplegap(inp): - """For a multimodal distribution, check that the ``FirstSimpleGap`` - with appropriate parameters finds the right number of clusters, - and that each has the correct number of points - ``n_points_per_cluster``.""" + """For a multimodal distribution, check that ``FirstSimpleGap`` 
with + appropriate parameters finds the right number of clusters, and that each + has the correct number of points ``n_points_per_cluster``.""" n_points_per_cluster, n_clusters, _, pts = inp fs = FirstSimpleGap(relative_gap_size=0.5, - max_fraction=None, + max_fraction=1., affinity='euclidean', memory=None, linkage='single') preds = fs.fit_predict(pts).astype(int) unique, counts = np.unique(preds, return_counts=True) @@ -88,12 +171,11 @@ def test_firstsimplegap(inp): @given(inp=get_input(n_clusters=2, std=0.02)) def test_firsthistogramgap(inp): - """For a multimodal distribution, check that the ``FirstHistogramGap`` - with appropriate parameters finds the right number of clusters, - and that each has the correct number of points - ``n_points_per_cluster``.""" + """For a multimodal distribution, check that the ``FirstHistogramGap`` with + appropriate parameters finds the right number of clusters, and that each + has the correct number of points ``n_points_per_cluster``.""" n_points_per_cluster, n_clusters, _, pts = inp - fh = FirstHistogramGap(freq_threshold=0, max_fraction=None, n_bins_start=5, + fh = FirstHistogramGap(freq_threshold=0, max_fraction=1., n_bins_start=5, affinity='euclidean', memory=None, linkage='single') preds = fh.fit_predict(pts) unique, counts = np.unique(preds, return_counts=True) @@ -104,38 +186,35 @@ def test_firsthistogramgap(inp): @given(inp=get_input(), max_frac=floats(min_value=0., exclude_min=True, - max_value=1., exclude_max=True)) + max_value=1., exclude_max=False)) def test_max_fraction_clusters(inp, max_frac): - """ Check that the clusterers (``FirstSimpleGap``, - ``FirstHistogramGap``) respect the ``max_num_clusters`` constraint, - if it is set.""" + """ Check that ``FirstSimpleGap`` and ``FirstHistogramGap`` respect the + ``max_num_clusters`` constraint, if it is set.""" n_points_per_cluster, n_clusters, _, pts = inp - max_num_clusters = max_frac * (n_points_per_cluster * n_clusters - - 1) + max_num_clusters = max_frac * n_points_per_cluster * n_clusters fs = FirstSimpleGap(max_fraction=max_frac) _ = fs.fit_predict(pts) - assert fs.n_clusters_ <= np.ceil(max_num_clusters*n_clusters) + assert fs.n_clusters_ <= np.floor(max_num_clusters) fh = FirstHistogramGap(max_fraction=max_frac) _ = fh.fit_predict(pts) - assert fh.n_clusters_ <= np.ceil(max_num_clusters*n_clusters) + assert fh.n_clusters_ <= np.floor(max_num_clusters) @given(inp=get_input()) def test_precomputed_distances(inp): - """Verify that the clustering based on ``distance_matrix`` is the same - as the clustering on points, that were used to calculate - that distance matrix.""" + """Verify that the clustering based on a distance matrix is the same as + the clustering on points used to calculate that distance matrix.""" n_points_per_cluster, n_clusters, _, pts = inp dist_matrix = distance_matrix(pts, pts, p=2) - fh_matrix = FirstHistogramGap(freq_threshold=0, max_fraction=None, + fh_matrix = FirstHistogramGap(freq_threshold=0, max_fraction=1., n_bins_start=5, affinity='precomputed', memory=None, linkage='single') preds_mat = fh_matrix.fit_predict(dist_matrix) - fh = FirstHistogramGap(freq_threshold=0, max_fraction=None, + fh = FirstHistogramGap(freq_threshold=0, max_fraction=1., n_bins_start=5, affinity='euclidean', memory=None, linkage='single') preds = fh.fit_predict(pts) @@ -149,5 +228,5 @@ def get_partition_from_preds(preds): return set([frozenset(np.where(preds == c)[0]) for c in indices_cluster]) - assert(get_partition_from_preds(preds) - == get_partition_from_preds(preds_mat)) + assert 
get_partition_from_preds(preds) == \ + get_partition_from_preds(preds_mat) diff --git a/gtda/mapper/tests/test_cover.py b/gtda/mapper/tests/test_cover.py index 48dcd6f6d..c07bbaf22 100644 --- a/gtda/mapper/tests/test_cover.py +++ b/gtda/mapper/tests/test_cover.py @@ -1,64 +1,60 @@ +"""Testing for OneDimensionalCover and CubicalCover.""" +# License: GNU AGPLv3 + +from functools import reduce + import numpy as np +import pytest from hypothesis import given from hypothesis.extra.numpy import arrays, array_shapes from hypothesis.strategies import floats, integers, booleans, composite from numpy.testing import assert_almost_equal -from functools import reduce -import pytest - +from sklearn.base import clone from sklearn.exceptions import NotFittedError + from gtda.mapper import OneDimensionalCover, CubicalCover @composite -def get_filter(draw, shape=None): - """Generate a 1d array of floats, of a given shape. - If the shape is not given, generate a shape of at least (4,).""" +def get_filter_values(draw, shape=None): + """Generate a 1d array of floats, of a given shape. If the shape is not + given, generate a shape of at least (4,).""" if shape is None: - shape = array_shapes(min_dims=1, max_dims=1, - min_side=4) - points = draw(arrays(dtype=np.float, - elements=floats(allow_nan=False, - allow_infinity=False, - min_value=-1e10, - max_value=1e10), - shape=shape, unique=True)) - return points + shape = array_shapes(min_dims=1, max_dims=1, min_side=4) + return draw(arrays(dtype=np.float, + elements=floats(allow_nan=False, + allow_infinity=False, + min_value=-1e10, + max_value=1e10), + shape=shape, unique=True)) @composite def get_nb_intervals(draw): - nb_intervals = draw(integers(min_value=3, max_value=20)) - return nb_intervals + return draw(integers(min_value=3, max_value=20)) @composite def get_overlap_fraction(draw): - overlap = draw(floats(allow_nan=False, - allow_infinity=False, - min_value=1e-8, exclude_min=True, - max_value=1., exclude_max=True), - ) - return overlap + return draw(floats(allow_nan=False, + allow_infinity=False, + min_value=1e-8, exclude_min=True, + max_value=1., exclude_max=True)) @composite def get_kind(draw): is_uniform = draw(booleans()) - return 'uniform' if is_uniform else 'balanced' + return "uniform" if is_uniform else "balanced" -@given( - filter_values=get_filter(), - n_intervals=get_nb_intervals() -) +@given(filter_values=get_filter_values(), n_intervals=get_nb_intervals()) def test_one_dimensional_cover_shape(filter_values, n_intervals): - """Assert that the length of the mask ``unique_interval_masks`` - corresponds to the pre-specified ``n_samples`` and that there - are no more intervals in the cover than ``n_intervals``. - The case when the filter has only a unique value, in which - case the fit_transform should throw an error, is treated separately. - """ + """Assert that the length of the mask ``unique_interval_masks`` corresponds + to the pre-specified ``n_samples`` and that there are no more intervals in + the cover than ``n_intervals``. 
The case when the filter has only a unique + value, in which case fit_transform should throw an error, is treated + separately.""" # TODO: Extend to inputs with shape (n_samples, 1) cover = OneDimensionalCover(n_intervals=n_intervals) n_samples, n_intervals = len(filter_values), cover.n_intervals @@ -72,9 +68,7 @@ def test_one_dimensional_cover_shape(filter_values, n_intervals): assert (n_intervals > 1) and (len(np.unique(filter_values)) == 1) -@given( - filter_values=get_filter() -) +@given(filter_values=get_filter_values()) def test_filter_values_covered_by_single_interval(filter_values): """Verify that a single intervals covers all the values in ``filter_values``""" @@ -82,25 +76,22 @@ def test_filter_values_covered_by_single_interval(filter_values): cover = OneDimensionalCover(n_intervals=1) interval_masks = cover.fit_transform(filter_values) # TODO: Generate filter_values with desired shape - assert_almost_equal( - filter_values[:, None][interval_masks], filter_values) + assert_almost_equal(filter_values[:, None][interval_masks], filter_values) -@given( - filter_values=get_filter(), - n_intervals=get_nb_intervals(), - overlap_frac=get_overlap_fraction() -) +@given(filter_values=get_filter_values(), + n_intervals=get_nb_intervals(), + overlap_frac=get_overlap_fraction()) def test_equal_interval_length(filter_values, n_intervals, overlap_frac): """Test that all the intervals have the same length, up to an additive constant of 0.1.""" - cover = OneDimensionalCover(kind='uniform', - n_intervals=n_intervals, + cover = OneDimensionalCover(kind="uniform", n_intervals=n_intervals, overlap_frac=overlap_frac) cover = cover.fit(filter_values) lower_limits, upper_limits = np.array( - list(map(tuple, zip(*cover.get_fitted_intervals()[1:-1])))) + list(map(tuple, zip(*cover.get_fitted_intervals()[1:-1]))) + ) # rounding precision decimals = 10 @@ -113,31 +104,28 @@ def get_input_tests_balanced(draw): """Points, nb_in_each_interval and nb_intervals""" nb_intervals = draw(get_nb_intervals()) nb_in_each_interval = draw(integers(min_value=2, max_value=5)) - points = draw(get_filter(shape=(nb_in_each_interval * nb_intervals,))) + points = draw( + get_filter_values(shape=(nb_in_each_interval * nb_intervals,)) + ) return [points, nb_in_each_interval, nb_intervals] -@given(input=get_input_tests_balanced()) -def test_balanced_is_balanced(input): +@given(balanced_cover=get_input_tests_balanced()) +def test_balanced_is_balanced(balanced_cover): """Test that each point is in one interval, and that each interval has ``nb_in_each_interval`` points.""" - points, nb_in_each_interval, nb_intervals = input - oneD_cover = OneDimensionalCover(kind='balanced', - n_intervals=nb_intervals, - overlap_frac=0.01) - mask = oneD_cover.fit_transform(points) + points, nb_in_each_interval, nb_intervals = balanced_cover + cover = OneDimensionalCover(kind='balanced', n_intervals=nb_intervals, + overlap_frac=0.01) + mask = cover.fit_transform(points) # each interval contains nb_in_each_interval points assert all([s == nb_in_each_interval for s in np.sum(mask, axis=0)]) # each point is in exactly one interval assert all([s == 1 for s in np.sum(mask, axis=1)]) -@given( - filter_values=get_filter(), - n_intervals=get_nb_intervals() -) -def test_filter_values_covered_by_interval_union(filter_values, - n_intervals): +@given(filter_values=get_filter_values(), n_intervals=get_nb_intervals()) +def test_filter_values_covered_by_interval_union(filter_values, n_intervals): """Test that each value is at least in one interval. 
(that is, the cover is a true cover).""" # TODO: Extend to inputs with shape (n_samples, 1) @@ -151,14 +139,11 @@ def test_filter_values_covered_by_interval_union(filter_values, assert_almost_equal(filter_values_union, filter_values) -@given( - pts=get_filter(), - n_intervals=get_nb_intervals(), - overlap_frac=get_overlap_fraction(), - kind=get_kind() -) -def test_fit_transform_against_fit_and_transform(pts, n_intervals, - kind, overlap_frac): +@given(pts=get_filter_values(), n_intervals=get_nb_intervals(), + overlap_frac=get_overlap_fraction(), kind=get_kind()) +def test_fit_transform_against_fit_and_transform( + pts, n_intervals, kind, overlap_frac + ): """Fitting and transforming should give the same result as fit_transform""" cover = OneDimensionalCover(n_intervals=n_intervals, kind=kind, overlap_frac=overlap_frac) @@ -182,7 +167,9 @@ def test_fit_transform_limits_not_computed(): _ = cover.get_fitted_intervals() -@given(pts=get_filter(shape=array_shapes(min_dims=2, max_dims=2, min_side=2))) +@given(pts=get_filter_values( + shape=array_shapes(min_dims=2, max_dims=2, min_side=2) + )) def test_two_dimensional_tensor(pts): """Verify that the oneDimensionalCover fails for an input with more than one dimension, and that the CubicalCover @@ -194,34 +181,31 @@ def test_two_dimensional_tensor(pts): _ = cubical.fit(pts) -@given(filter=get_filter(), - kind=get_kind(), - n_intervals=get_nb_intervals(), - overlap_fraction=get_overlap_fraction(),) -def test_cubical_fit_transform_consistent_with_OneD(filter, kind, - n_intervals, - overlap_fraction): +@given(filter_values=get_filter_values(), kind=get_kind(), + n_intervals=get_nb_intervals(), overlap_fraction=get_overlap_fraction()) +@pytest.mark.parametrize("cover_cls", [OneDimensionalCover, CubicalCover]) +def test_fit_transform_equals_fittransform( + filter_values, kind, n_intervals, overlap_fraction, cover_cls + ): """Check that CubicalCover gives the same results as OneDimensionalCover, - on one-d data """ - one_d = OneDimensionalCover(kind, n_intervals, overlap_fraction) - cubical = CubicalCover(kind, n_intervals, overlap_fraction) - x_one_d = one_d.fit_transform(filter) - x_cubical = cubical.fit_transform(filter) - assert_almost_equal(x_one_d, x_cubical) - - -@given(filter=get_filter(), - kind=get_kind(), - n_intervals=get_nb_intervals(), - overlap_fraction=get_overlap_fraction(), - ) -def test_cubical_fit_A_transform_consistent_with_OneD(filter, kind, - n_intervals, - overlap_fraction): + on 1D data.""" + cover = cover_cls(kind, n_intervals, overlap_fraction) + cover_clone = clone(cover) + assert np.array_equal( + cover.fit_transform(filter_values), + cover_clone.fit(filter_values).transform(filter_values) + ) + + +@given(filter_values=get_filter_values(), kind=get_kind(), + n_intervals=get_nb_intervals(), overlap_fraction=get_overlap_fraction()) +def test_cubical_fit_transform_consistent_with_1D( + filter_values, kind, n_intervals, overlap_fraction + ): """Check that CubicalCover gives the same results as OneDimensionalCover, on one-d data """ one_d = OneDimensionalCover(kind, n_intervals, overlap_fraction) cubical = CubicalCover(kind, n_intervals, overlap_fraction) - x_one_d = one_d.fit(filter).transform(filter) - x_cubical = cubical.fit(filter).transform(filter) - assert_almost_equal(x_one_d, x_cubical) + x_one_d = one_d.fit(filter_values).transform(filter_values) + x_cubical = cubical.fit(filter_values).transform(filter_values) + assert np.array_equal(x_one_d, x_cubical) diff --git a/gtda/mapper/tests/test_filter.py 
b/gtda/mapper/tests/test_filter.py index 6c7815b90..527149310 100644 --- a/gtda/mapper/tests/test_filter.py +++ b/gtda/mapper/tests/test_filter.py @@ -1,24 +1,28 @@ +"""Testing for Mapper filter functions.""" +# License: GNU AGPLv3 + +import warnings + import numpy as np from hypothesis import given from hypothesis.extra.numpy import array_shapes, arrays from hypothesis.strategies import integers, floats from numpy.testing import assert_almost_equal from scipy.spatial.distance import pdist, squareform +from sklearn.neighbors import KernelDensity from gtda.mapper import Eccentricity, Entropy, Projection from gtda.mapper.utils._list_feature_union import ListFeatureUnion from gtda.mapper.utils.decorators import method_to_transform -from sklearn.neighbors import KernelDensity - -@given( - X=arrays(dtype=np.float, - elements=floats(allow_nan=False, - allow_infinity=False), - shape=array_shapes(min_dims=2, max_dims=2)), - exponent=integers(min_value=1, max_value=100) -) +@given(X=arrays(dtype=np.float, + elements=floats(allow_nan=False, + allow_infinity=False, + min_value=-1e3, + max_value=1e3), + shape=array_shapes(min_dims=2, max_dims=2)), + exponent=integers(min_value=1, max_value=10)) def test_eccentricity_shape_equals_number_of_samples(X, exponent): """Verify that eccentricity preserves the nb of samples in the input.""" eccentricity = Eccentricity(exponent=exponent) @@ -28,7 +32,9 @@ def test_eccentricity_shape_equals_number_of_samples(X, exponent): @given(X=arrays(dtype=np.float, elements=floats(allow_nan=False, - allow_infinity=False), + allow_infinity=False, + min_value=-1e3, + max_value=1e3), shape=array_shapes(min_dims=2, max_dims=2))) def test_eccentricity_values_with_infinity_norm_equals_max_row_values(X): eccentricity = Eccentricity(exponent=np.inf) @@ -37,35 +43,33 @@ def test_eccentricity_values_with_infinity_norm_equals_max_row_values(X): assert_almost_equal(Xt, np.max(distance_matrix, axis=1).reshape(-1, 1)) -@given(X=arrays( - dtype=np.float, - elements=floats(allow_nan=False, - allow_infinity=False, - min_value=-1e3, - max_value=-1), - shape=array_shapes(min_dims=2, max_dims=2, min_side=2) -)) +@given(X=arrays(dtype=np.float, + elements=floats(allow_nan=False, + allow_infinity=False, + min_value=-1e3, + max_value=-1), + shape=array_shapes(min_dims=2, max_dims=2, min_side=2))) def test_entropy_values_for_negative_inputs(X): """Verify the numerical results of entropy (does it have the correct logic), on a collection of **negative** inputs.""" entropy = Entropy() - Xt = entropy.fit_transform(X) - probs = X / X.sum(axis=1, keepdims=True) - entropies = - np.einsum('ij,ij->i', probs, - np.where(probs != 0, np.log2(probs), 0)) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + Xt = entropy.fit_transform(X) + probs = X / X.sum(axis=1, keepdims=True) + entropies = - np.einsum('ij,ij->i', probs, + np.where(probs != 0, np.log2(probs), 0)) assert_almost_equal(Xt, entropies[:, None]) -@given(X=arrays( - dtype=np.float, - elements=floats(allow_nan=False, - allow_infinity=False, - min_value=1, - max_value=1e3), - shape=array_shapes(min_dims=2, max_dims=2, min_side=2) -)) +@given(X=arrays(dtype=np.float, + elements=floats(allow_nan=False, + allow_infinity=False, + min_value=1, + max_value=1e3), + shape=array_shapes(min_dims=2, max_dims=2, min_side=2))) def test_entropy_values_for_positive_inputs(X): - """Verify the numerical results of entropy (does it have the correct logic), + """Verify the numerical results of entropy (does it have the correct logic) on a collection of 
**positive** inputs.""" entropy = Entropy() Xt = entropy.fit_transform(X) @@ -77,25 +81,25 @@ def test_entropy_values_for_positive_inputs(X): @given(X=arrays(dtype=np.float, elements=floats(allow_nan=False, - allow_infinity=False), - shape=array_shapes(min_dims=2, max_dims=2))) + allow_infinity=False, + min_value=-1e3, + max_value=1e3), + shape=array_shapes(min_dims=2, max_dims=2, min_side=2))) def test_projection_values_equal_slice(X): """Test the logic of the ``Projection`` transformer.""" columns = np.random.choice( - X.shape[1], 1 + np.random.randint(X.shape[1])) + X.shape[1], 1 + np.random.randint(X.shape[1] - 1)) Xt = Projection(columns=columns).fit_transform(X) assert_almost_equal(Xt, X[:, columns]) -@given(X=arrays( - dtype=np.float, - elements=floats(allow_nan=False, - allow_infinity=False, - min_value=1, - max_value=1e3), - shape=array_shapes(min_dims=2, max_dims=2, min_side=2), - unique=True -)) +@given(X=arrays(dtype=np.float, + elements=floats(allow_nan=False, + allow_infinity=False, + min_value=1, + max_value=1e3), + shape=array_shapes(min_dims=2, max_dims=2, min_side=2), + unique=True)) def test_gaussian_density_values(X): """Check that ``fit_transform`` and ``fit + score_samples`` of ``KernelDensity`` are the same.""" @@ -107,15 +111,13 @@ def test_gaussian_density_values(X): assert_almost_equal(Xt_actual, Xt_desired) -@given(X=arrays( - dtype=np.float, - elements=floats(allow_nan=False, - allow_infinity=False, - min_value=1, - max_value=1e3), - shape=array_shapes(min_dims=2, max_dims=2, min_side=2), - unique=True -)) +@given(X=arrays(dtype=np.float, + elements=floats(allow_nan=False, + allow_infinity=False, + min_value=1, + max_value=1e3), + shape=array_shapes(min_dims=2, max_dims=2, min_side=2), + unique=True)) def test_list_feature_union_transform(X): """Check that a ``ListFeatureUnion`` of two projections gives the same result as stacking the projections.""" @@ -131,20 +133,17 @@ def test_list_feature_union_transform(X): assert_almost_equal(x_12, x_1_2) -@given(X=arrays( - dtype=np.float, - elements=floats(allow_nan=False, - allow_infinity=False, - min_value=1, - max_value=1e3), - shape=array_shapes(min_dims=2, max_dims=2, min_side=2), - unique=True -)) +@given(X=arrays(dtype=np.float, + elements=floats(allow_nan=False, + allow_infinity=False, + min_value=1, + max_value=1e3), + shape=array_shapes(min_dims=2, max_dims=2, min_side=2), + unique=True)) def test_list_feature_union_drops(X): """Check the the drop of ``ListFeatureUnion`` keeps the correct number of samples""" - drop_0_1 = ListFeatureUnion([('drop' + str(k), 'drop') - for k in range(2)]) + drop_0_1 = ListFeatureUnion([('drop' + str(k), 'drop') for k in range(2)]) x_01_a = drop_0_1.fit_transform(X) x_01_b = drop_0_1.transform(X) assert x_01_a.shape == (X.shape[0], 0) diff --git a/gtda/mapper/tests/test_nerve.py b/gtda/mapper/tests/test_nerve.py index 912ed09fa..9616b1520 100644 --- a/gtda/mapper/tests/test_nerve.py +++ b/gtda/mapper/tests/test_nerve.py @@ -1,16 +1,23 @@ +"""Testing for Nerve (Mapper graph construction).""" +# License: GNU AGPLv3 + import numpy as np -from hypothesis import given +import pytest +from hypothesis import given, settings from hypothesis.extra.numpy import arrays, array_shapes from hypothesis.strategies import floats -from gtda.mapper.pipeline import make_mapper_pipeline +from sklearn.cluster import DBSCAN +from sklearn.datasets import make_circles + +from gtda.mapper import Projection, OneDimensionalCover, make_mapper_pipeline +@settings(deadline=5000) @given(X=arrays(dtype=np.float, 
unique=True, elements=floats(allow_nan=False, allow_infinity=False, - min_value=-1e10, - max_value=1e10 - ), + min_value=-1e6, + max_value=1e6), shape=array_shapes(min_dims=2, max_dims=2, min_side=11))) def test_node_intersection(X): # TODO: Replace pipe and graph by Nerve transformer @@ -22,10 +29,118 @@ def test_node_intersection(X): # Check if the elements of nodes defining an edge are disjoint or not: # If True, they are disjoint, i.e. the created edge is incorrect. # If all are False, all edges are correct. - disjoint_nodes = [set(graph['node_metadata']['node_elements'][node_1]) - .isdisjoint(graph['node_metadata']['node_elements'] - [node_2]) + disjoint_nodes = [set(graph.vs['node_elements'][node_1]) + .isdisjoint(graph.vs['node_elements'][node_2]) for node_1, node_2 in graph.get_edgelist()] # Check if there is a disjoint node pair given by an edge. assert not any(disjoint_nodes) + + +@settings(deadline=5000) +@given(X=arrays(dtype=np.float, unique=True, + elements=floats(allow_nan=False, + allow_infinity=False, + min_value=-1e6, + max_value=1e6), + shape=array_shapes(min_dims=2, max_dims=2, min_side=11))) +def test_edge_elements(X): + # TODO: Replace pipe and graph by Nerve transformer + # TODO: Improve the Hypothesis strategy to avoid needing to hardcode the + # min_side to be greater than n_intervals (10 by default). + pipe = make_mapper_pipeline() + pipe_edge_elems = make_mapper_pipeline(store_edge_elements=True) + + graph = pipe.fit_transform(X) + graph_edge_elems = pipe_edge_elems.fit_transform(X) + + # Check that when store_edge_elements=False (default) there is no + # "edge_elements" attribute. + with pytest.raises(KeyError): + _ = graph.es["edge_elements"] + + # Check that graph and graph_ee agree otherwise + # Vertices + assert graph.vs.indices == graph_edge_elems.vs.indices + for attr_name in ["pullback_set_label", "partial_cluster_label"]: + assert graph.vs[attr_name] == graph_edge_elems.vs[attr_name] + node_elements = graph.vs["node_elements"] + node_elements_ee = graph_edge_elems.vs["node_elements"] + assert all([np.array_equal(node, node_ee) + for node, node_ee in zip(node_elements, node_elements_ee)]) + assert graph.vs.indices == graph_edge_elems.vs.indices + # Edges + assert graph.es.indices == graph_edge_elems.es.indices + assert graph.es["weight"] == graph_edge_elems.es["weight"] + assert all([edge.tuple == edge_ee.tuple + for edge, edge_ee in zip(graph.es, graph_edge_elems.es)]) + + # Check that the arrays edge_elements contain precisely those indices which + # are in the element sets associated to both the first and second vertex, + # and that the edge weight equals the size of edge_elements. + flag = True + for edge in graph_edge_elems.es: + v1, v2 = edge.vertex_tuple + flag *= np.array_equal( + edge["edge_elements"], + np.intersect1d(v1["node_elements"], v2["node_elements"]) + ) + flag *= len(edge["edge_elements"]) == edge["weight"] + assert flag + + +@settings(deadline=5000) +@pytest.mark.parametrize("min_intersection", [1, 2, 3, 10]) +@given(X=arrays(dtype=np.float, unique=True, + elements=floats(allow_nan=False, + allow_infinity=False, + min_value=-1e6, + max_value=1e6), + shape=array_shapes(min_dims=2, max_dims=2, min_side=11))) +def test_min_intersection(X, min_intersection): + # TODO: Replace pipe and graph by Nerve transformer + # TODO: Improve the Hypothesis strategy to avoid needing to hardcode the + # min_side to be greater than n_intervals (10 by default). 
+ pipe = make_mapper_pipeline(min_intersection=min_intersection) + graph = pipe.fit_transform(X) + + # Check that there are no edges with weight less than min_intersection + assert all([x >= min_intersection for x in graph.es["weight"]]) + + +def test_contract_nodes(): + """Test that, on a pathological dataset, we generate a graph without edges + when `contract_nodes` is set to False and with edges when it is set to + True.""" + X = make_circles(n_samples=2000)[0] + + filter_func = Projection() + cover = OneDimensionalCover(n_intervals=5, overlap_frac=0.4) + p = filter_func.fit_transform(X) + m = cover.fit_transform(p) + + gap = 0.1 + idx_to_remove = [] + for i in range(m.shape[1] - 1): + inters = np.logical_and(m[:, i], m[:, i + 1]) + inters_idx = np.flatnonzero(inters) + p_inters = p[inters_idx] + min_p, max_p = np.min(p_inters), np.max(p_inters) + idx_to_remove += list( + np.flatnonzero((min_p <= p) & (p <= min_p + gap))) + idx_to_remove += list( + np.flatnonzero((max_p - gap <= p) & (p <= max_p))) + + X_f = X[[x for x in range(len(X)) if x not in idx_to_remove]] + + clusterer = DBSCAN(eps=0.05) + pipe = make_mapper_pipeline(filter_func=filter_func, + cover=cover, + clusterer=clusterer, + contract_nodes=True) + graph = pipe.fit_transform(X_f) + assert not len(graph.es) + + pipe.set_params(contract_nodes=False) + graph = pipe.fit_transform(X_f) + assert len(graph.es) diff --git a/gtda/mapper/tests/test_visualization.py b/gtda/mapper/tests/test_visualization.py index 0032e4067..a157951ec 100644 --- a/gtda/mapper/tests/test_visualization.py +++ b/gtda/mapper/tests/test_visualization.py @@ -1,13 +1,15 @@ -import plotly.io as pio -import numpy as np -import warnings +"""Testing for Mapper plotting functions.""" +# License: GNU AGPLv3 from unittest import TestCase -from gtda.mapper import make_mapper_pipeline -from gtda.mapper import (plot_interactive_mapper_graph, - plot_static_mapper_graph) -from gtda.mapper import FirstSimpleGap +import numpy as np +import pandas as pd +import plotly.io as pio +import pytest + +from gtda.mapper import FirstSimpleGap, CubicalCover, make_mapper_pipeline, \ + plot_static_mapper_graph, plot_interactive_mapper_graph class TestCaseNoTemplate(TestCase): @@ -18,28 +20,142 @@ def tearDown(self): pio.templates.default = "plotly" -X = np.array([[-19.33965799, -284.58638371], - [-290.25710696, 184.31095197], - [250.38108853, 134.5112574], - [-259.46357187, -172.12937543], - [115.72180479, -69.67624071], - [120.12187185, 248.39783826], - [234.08476944, 115.54743986], - [246.68634685, 119.170029], - [-154.27214561, -272.07656956], - [225.37435664, 186.3253872], - [54.17543392, 76.4066916], - [175.28163213, -193.46279193], - [228.63910018, -121.16687597], - [-101.58902866, 48.86471748], - [-185.23421146, 244.14414753], - [-275.05799067, -204.99265911], - [-170.12180583, 176.10258455], - [-155.54055842, -214.420498], - [184.6940872, 2.08810678], - [-184.42012962, 28.8978038]]) -colors = np.array([8., 8., 3., 8., 0., 8., 8., 8., 5., - 8., 8., 8., 8., 4., 2., 8., 1., 8., 2., 8.]) +N = 50 +d = 3 +X_arr = np.random.randn(N, d) +X_df = pd.DataFrame(X_arr, columns=["a", "b", "c"]) +colors = np.random.randint(0, 10, N) + +viridis_colorscale = ((0.0, '#440154'), + (0.1111111111111111, '#482878'), + (0.2222222222222222, '#3e4989'), + (0.3333333333333333, '#31688e'), + (0.4444444444444444, '#26828e'), + (0.5555555555555556, '#1f9e89'), + (0.6666666666666666, '#35b779'), + (0.7777777777777778, '#6ece58'), + (0.8888888888888888, '#b5de2b'), + (1.0, '#fde725')) + +hsl_colorscale = 
['hsl(19.0, 96.0%, 67.0%)', + 'hsl(60.0, 100.0%, 87.0%)', + 'hsl(203.0, 51.0%, 71.0%)'] + + +@pytest.mark.parametrize("X", [X_arr, X_df]) +@pytest.mark.parametrize("layout_dim", [2, 3]) +def test_valid_layout_dim(X, layout_dim): + pipe = make_mapper_pipeline() + fig = plot_static_mapper_graph(pipe, X, layout_dim=layout_dim) + edge_trace = fig.data[0] + assert hasattr(edge_trace, "x") and hasattr(edge_trace, "y") + is_z_present = hasattr(edge_trace, "z") + assert is_z_present if layout_dim == 3 else not is_z_present + + +@pytest.mark.parametrize("X", [X_arr, X_df]) +@pytest.mark.parametrize("layout_dim", [1, 4]) +def test_invalid_layout_dim(X, layout_dim): + with pytest.raises(ValueError): + pipe = make_mapper_pipeline() + _ = plot_static_mapper_graph(pipe, X, layout_dim=layout_dim) + + +@pytest.mark.parametrize("X", [X_arr, X_df]) +def test_invalid_layout_algorithm(X): + with pytest.raises(KeyError): + pipe = make_mapper_pipeline() + _ = plot_static_mapper_graph(pipe, X, layout="foobar") + + +@pytest.mark.parametrize("X", [X_arr, X_df]) +@pytest.mark.parametrize("layout_dim", [2, 3]) +def test_valid_hoverlabel_bgcolor(X, layout_dim): + pipe = make_mapper_pipeline() + fig = plot_static_mapper_graph( + pipe, X, layout_dim=layout_dim, + plotly_params={"node_trace": {"hoverlabel_bgcolor": "white"}} + ) + assert fig.data[1]["hoverlabel"]["bgcolor"] == "white" + + +@pytest.mark.parametrize("X", [X_arr, X_df]) +def test_unsuitable_colorscale_for_hoverlabel_3d(X): + pipe = make_mapper_pipeline() + with pytest.warns(RuntimeWarning): + _ = plot_static_mapper_graph( + pipe, X, layout_dim=3, + plotly_params={"node_trace": {"marker_colorscale": hsl_colorscale}} + ) + + +@pytest.mark.parametrize("X", [X_arr, X_df]) +def test_valid_colorscale(X): + pipe = make_mapper_pipeline() + + fig_2d = plot_static_mapper_graph( + pipe, X, layout_dim=2, + plotly_params={"node_trace": {"marker_colorscale": "blues"}} + ) + fig_3d = plot_static_mapper_graph( + pipe, X, layout_dim=3, + plotly_params={"node_trace": {"marker_colorscale": "blues"}} + ) + + # Test that the custom colorscale is correctly applied both in 2d and in 3d + marker_colorscale = fig_2d.data[1]["marker"]["colorscale"] + marker_colorscale_3d = fig_3d.data[1]["marker"]["colorscale"] + assert marker_colorscale == marker_colorscale_3d + + # Test that the default colorscale is "viridis" and that the custom one is + # different + fig_default = plot_static_mapper_graph(pipe, X) + marker_colorscale_default = \ + fig_default.data[1]["marker"]["colorscale"] + assert marker_colorscale_default == viridis_colorscale + assert marker_colorscale != marker_colorscale_default + + +@pytest.mark.parametrize("X", [X_arr, X_df]) +@pytest.mark.parametrize("color_variable", [None, colors]) +@pytest.mark.parametrize("node_color_statistic", [None, np.max]) +def test_colors_same_2d_3d(X, color_variable, node_color_statistic): + pipe = make_mapper_pipeline() + fig_2d = plot_static_mapper_graph( + pipe, X, layout_dim=2, color_variable=color_variable, + node_color_statistic=node_color_statistic + ) + fig_3d = plot_static_mapper_graph( + pipe, X, layout_dim=3, color_variable=color_variable, + node_color_statistic=node_color_statistic + ) + assert fig_2d.data[1].marker.color == fig_3d.data[1].marker.color + + +@pytest.mark.parametrize("X, columns", [(X_arr, range(X_arr.shape[1])), + (X_df, X_df.columns)]) +@pytest.mark.parametrize("layout_dim", [2, 3]) +def test_color_by_column_dropdown(X, columns, layout_dim): + pipe = make_mapper_pipeline() + fig = plot_static_mapper_graph( + 
pipe, X, layout_dim=layout_dim, color_by_columns_dropdown=True + ) + fig_buttons = fig.layout.updatemenus[0].buttons + + assert list(fig.data[1].marker.color) == \ + list(fig_buttons[0].args[0]["marker.color"][1]) + + for i, col in enumerate(columns): + fig_col = plot_static_mapper_graph( + pipe, X, layout_dim=layout_dim, color_variable=col + ) + assert list(fig_col.data[1].marker.color) == \ + list(fig_buttons[i + 1].args[0]["marker.color"][1]) + + +def _get_size_from_hovertext(s): + size_str = s.split("
")[3].split(": ")[1] + return int(size_str) class TestStaticPlot(TestCaseNoTemplate): @@ -48,52 +164,106 @@ def test_is_data_present(self): """Verify that what we see in the graph corresponds to the number of samples in the graph.""" pipe = make_mapper_pipeline() - warnings.simplefilter("ignore") - fig = plot_static_mapper_graph(pipe, X, + fig = plot_static_mapper_graph(pipe, X_arr, color_variable=colors, clone_pipeline=False) - node_trace_x = fig.get_state()['_data'][1]["x"] - node_trace_y = fig.get_state()['_data'][1]["y"] + node_trace_x = fig.data[1].x + node_trace_y = fig.data[1].y - assert node_trace_x["shape"][0] == node_trace_y["shape"][0] + assert node_trace_x.shape[0] == node_trace_y.shape[0] - num_nodes = node_trace_x["shape"][0] - assert len(X) >= num_nodes + num_nodes = node_trace_x.shape[0] + assert len(X_arr) >= num_nodes - fig_colors = fig.get_state()['_data'][1]['marker']['color'] + fig_colors = fig.data[1].marker.color assert len(fig_colors) == num_nodes - -class TestInteractivePlot(TestCaseNoTemplate): - - def _get_widget_by_trait(self, fig, key, val=None): - for k, v in fig.widgets.items(): - try: - b = getattr(v, key) == val if val is not None \ - else getattr(v, key) - if b: - return fig.widgets[k] - except (AttributeError, TypeError): - pass - - def _get_size_from_hovertext(self, s): - size_str = s.split("
")[1].split(": ")[1] - return int(size_str) - def test_cluster_sizes(self): """Verify that the total number of calculated clusters is equal to the number of displayed clusters.""" pipe = make_mapper_pipeline(clusterer=FirstSimpleGap()) - warnings.simplefilter("ignore") - fig = plot_interactive_mapper_graph(pipe, X) - w_scatter = self._get_widget_by_trait(fig, 'data') + fig = plot_static_mapper_graph(pipe, X_arr) + node_trace = fig.data[1] - node_sizes_vis = [self._get_size_from_hovertext(s_) - for s_ in w_scatter.get_state() - ['_data'][1]['hovertext']] + node_sizes_vis = [_get_size_from_hovertext(ht) for ht in + node_trace.hovertext] - g = pipe.fit_transform(X) - node_size_real = [len(node) - for node in g['node_metadata']['node_elements']] + g = pipe.fit_transform(X_arr) + node_size_real = [len(node) for node in g.vs['node_elements']] assert sum(node_sizes_vis) == sum(node_size_real) + + +def _get_widgets_by_trait(fig, key, val=None): + """Returns a list of widgets containing attribute `key` which currently + evaluates to the value `val`.""" + widgets = [] + for k, v in fig.widgets.items(): + try: + b = getattr(v, key) == val if val is not None else getattr(v, key) + if b: + widgets.append(fig.widgets[k]) + except (AttributeError, TypeError): + continue + return widgets + + +@pytest.mark.parametrize("X", [X_arr, X_df]) +@pytest.mark.parametrize("clone_pipeline", [False, True]) +@pytest.mark.parametrize("layout_dim", [2, 3]) +@pytest.mark.parametrize("color_by_columns_dropdown", [True, False]) +def test_pipeline_cloned(X, clone_pipeline, layout_dim, + color_by_columns_dropdown): + """Verify that the pipeline is changed on interaction if and only if + `clone_pipeline` is False (with `layout_dim` set to 2 or 3).""" + # TODO: Monitor development of the ipytest project to convert these into + # true notebook tests integrated with pytest + params = { + "cover": { + "initial": {"n_intervals": 10, "kind": "uniform", + "overlap_frac": 0.1}, + "new": {"n_intervals": 15, "kind": "balanced", "overlap_frac": 0.2} + }, + "clusterer": { + "initial": {"affinity": "euclidean"}, + "new": {"affinity": "manhattan"} + }, + "contract_nodes": {"initial": True, "new": False}, + "min_intersection": {"initial": 4, "new": 1}, + } + + pipe = make_mapper_pipeline( + cover=CubicalCover(**params["cover"]["initial"]), + clusterer=FirstSimpleGap(**params["clusterer"]["initial"]), + contract_nodes=params["contract_nodes"]["initial"], + min_intersection=params["min_intersection"]["initial"] + ) + fig = plot_interactive_mapper_graph( + pipe, X, clone_pipeline=clone_pipeline, layout_dim=layout_dim, + color_by_columns_dropdown=color_by_columns_dropdown + ) + + # Get relevant widgets and change their states, then check final values + for step, values in params.items(): + if step in ["cover", "clusterer"]: + for param_name, initial_param_value in values["initial"].items(): + new_param_value = values["new"][param_name] + widgets = _get_widgets_by_trait(fig, "description", param_name) + for w in widgets: + w.set_state({'value': new_param_value}) + final_param_value_actual = \ + pipe.get_mapper_params()[f"{step}__{param_name}"] + final_param_value_expected = \ + initial_param_value if clone_pipeline else new_param_value + assert final_param_value_actual == final_param_value_expected + else: + initial_param_value = values["initial"] + new_param_value = values["new"] + widgets = _get_widgets_by_trait(fig, "description", step) + for w in widgets: + w.set_state({'value': new_param_value}) + final_param_value_actual = \ + 
pipe.get_mapper_params()[f"{step}"] + final_param_value_expected = \ + initial_param_value if clone_pipeline else new_param_value + assert final_param_value_actual == final_param_value_expected diff --git a/gtda/mapper/utils/_cluster.py b/gtda/mapper/utils/_cluster.py index 003875b8e..5964c13be 100644 --- a/gtda/mapper/utils/_cluster.py +++ b/gtda/mapper/utils/_cluster.py @@ -1,5 +1,5 @@ from functools import partial -from math import ceil +from math import floor import numpy as np @@ -15,7 +15,7 @@ def _num_clusters_histogram(distances, freq_threshold, n_bins_start, max_frac): zero_bins = False i = 0 - if max_frac is None: + if max_frac == 1.: while not zero_bins: hist, edges = np.histogram(distances, bins=n_bins_start + i) zero_bins_indices = threshold_func(hist) @@ -26,7 +26,7 @@ def _num_clusters_histogram(distances, freq_threshold, n_bins_start, max_frac): gap_idx = (distances <= left_bin_edge_first_gap).sum() num_clust = distances.size + 1 - gap_idx else: - max_num_clust = ceil(max_frac * distances.size) + max_num_clust = max_frac * (distances.size + 1) over_max_num = True while over_max_num: while (not zero_bins) and over_max_num: @@ -36,14 +36,15 @@ def _num_clusters_histogram(distances, freq_threshold, n_bins_start, max_frac): i += 1 first_gap = zero_bins_indices[0] left_bin_edge_first_gap = edges[first_gap] - gap_idx = (distances <= left_bin_edge_first_gap).sum() + gap_idx = np.sum(distances <= left_bin_edge_first_gap) num_clust = distances.size + 1 - gap_idx if num_clust > max_num_clust: num_clust = max_num_clust break else: over_max_num = False - return num_clust + + return floor(num_clust) def _zero_bins(hist): @@ -55,16 +56,16 @@ def _bins_below_threshold(freq_threshold, hist): def _num_clusters_simple(distances, min_gap_size, max_frac): - # Differences between subsequent elements (padding by the first - # distance) + # Differences between subsequent elements (padding by the first distance) diff = np.ediff1d(distances, to_begin=distances[0]) gap_indices = np.flatnonzero(diff >= min_gap_size) if gap_indices.size: num_clust = distances.size + 1 - gap_indices[0] if max_frac is None: return num_clust - max_num_clust = ceil(max_frac * distances.size) - num_clust = num_clust if num_clust <= max_num_clust else max_num_clust - return num_clust + max_num_clust = max_frac * (distances.size + 1) + if num_clust > max_num_clust: + num_clust = max_num_clust + return floor(num_clust) # No big enough gaps -> one cluster return 1 diff --git a/gtda/mapper/utils/_logging.py b/gtda/mapper/utils/_logging.py index 28c3326e8..afdb2c4c1 100644 --- a/gtda/mapper/utils/_logging.py +++ b/gtda/mapper/utils/_logging.py @@ -13,7 +13,7 @@ def __init__(self, *args, **kwargs): 'width': '100%', 'height': '160px', 'border': '1px solid black', - 'overflow_y': 'auto' + 'overflow': 'auto' } self.out = widgets.Output(layout=layout) @@ -24,7 +24,7 @@ def emit(self, record): 'name': 'stdout', 'output_type': 'stream', 'text': formatted_record+'\n' - } + } self.out.outputs = (new_output,) + self.out.outputs def show_logs(self): diff --git a/gtda/mapper/utils/_visualization.py b/gtda/mapper/utils/_visualization.py index 38eae138d..796652915 100644 --- a/gtda/mapper/utils/_visualization.py +++ b/gtda/mapper/utils/_visualization.py @@ -1,7 +1,7 @@ """Graph layout functions and plotly layout functions.""" # License: GNU AGPLv3 -import operator +from operator import iconcat from copy import deepcopy from functools import reduce, partial @@ -23,16 +23,16 @@ "colorbar": { "thickness": 15, "title": "", "xanchor": "left", 
"titleside": "right" + } } } -} PLOT_OPTIONS_EDGE_TRACE_DEFAULTS = { "name": "edge_trace", "mode": "lines", "line": {"width": 1, "color": "#888"}, "hoverinfo": "none" -} + } PLOT_OPTIONS_LAYOUT_COMMON_DEFAULTS = { "showlegend": False, @@ -41,17 +41,17 @@ "margin": {"b": 20, "l": 5, "r": 5, "t": 40}, "autosize": False, "annotations": [] -} + } PLOT_OPTIONS_LAYOUT_AXES_DEFAULTS_2D = { "title": "", "showgrid": False, "zeroline": False, "showticklabels": False, "ticks": "", "showline": False -} + } PLOT_OPTIONS_LAYOUT_AXES_DEFAULTS_3D = { "title": "", "showbackground": False, "showline": False, "zeroline": False, "showgrid": False, "showticklabels": False, -} + } PLOT_OPTIONS_LAYOUT_DEFAULTS = { "common": PLOT_OPTIONS_LAYOUT_COMMON_DEFAULTS, @@ -59,15 +59,15 @@ "template": "simple_white", "xaxis": PLOT_OPTIONS_LAYOUT_AXES_DEFAULTS_2D, "yaxis": PLOT_OPTIONS_LAYOUT_AXES_DEFAULTS_2D - }, + }, 3: { "scene": { "xaxis": PLOT_OPTIONS_LAYOUT_AXES_DEFAULTS_3D, "yaxis": PLOT_OPTIONS_LAYOUT_AXES_DEFAULTS_3D, "zaxis": PLOT_OPTIONS_LAYOUT_AXES_DEFAULTS_3D + } } } -} def _set_node_sizeref(node_sizes, node_scale=12): @@ -90,14 +90,17 @@ def _get_node_size(node_elements): def _get_node_text( - node_ids, num_node_elements, node_summary_statistics -): + node_ids, pullback_set_labels, partial_cluster_labels, + num_node_elements, node_summary_statistics + ): return [ - f"Node ID: {node_id}
<br>Node size: {num_elements}" - f"<br>
Summary statistic: {node_summary_statistic}" - for node_id, num_elements, node_summary_statistic in zip( - node_ids, num_node_elements, node_summary_statistics - ) + f"Node ID: {node_id}<br>
Pullback set label: {pullback_set_label}<br>
" + f"Partial cluster label: {partial_cluster_label}<br>
Node size: " + f"{num_elements}<br>
Summary statistic: {node_summary_statistic}" + for (node_id, pullback_set_label, partial_cluster_label, num_elements, + node_summary_statistic) + in zip(node_ids, pullback_set_labels, partial_cluster_labels, + num_node_elements, node_summary_statistics) ] @@ -109,7 +112,7 @@ def _get_column_color_buttons( data, is_data_dataframe, node_elements, node_colors_color_variable, summary_statistic, hovertext_color_variable, colorscale_for_hoverlabel, n_sig_figs -): + ): # TODO: Consider opting for just-in-time computation instead of computing # all node summary values ahead of time. Solution should preserve scroll # zoom functionality of 2D static visualisation. @@ -129,11 +132,11 @@ def replace_summary_statistic(current_hovertext, new_statistic): "args": [{ "marker.color": [None, node_colors_color_variable], "hovertext": [None, hovertext_color_variable] - }], + }], "label": "color_variable", "method": "restyle" - } - ] + } + ] for column in columns_to_color: if is_data_dataframe: @@ -143,20 +146,20 @@ def replace_summary_statistic(current_hovertext, new_statistic): node_colors = _get_node_summary( column_values, node_elements, summary_statistic - ) + ) hovertext = list(map( replace_summary_statistic, hovertext_color_variable, node_colors - )) + )) new_button = { "args": [{ "marker.color": [None, node_colors], "hovertext": [None, hovertext] - }], + }], "label": f"Column {column}", "method": "restyle" - } + } if colorscale_for_hoverlabel is not None: node_colors = np.asarray(node_colors) @@ -166,7 +169,7 @@ def replace_summary_statistic(current_hovertext, new_statistic): None, _get_colors_for_vals(node_colors, min_col, max_col, colorscale_for_hoverlabel) - ] + ] column_color_buttons.append(new_button) @@ -198,7 +201,7 @@ def _infer_color_variable_kind(color_variable, data): def _get_node_summary_statistics( data, is_data_dataframe, node_elements, summary_statistic, color_variable -): + ): """Calculate values of node summary statistics.""" color_variable_kind = _infer_color_variable_kind(color_variable, data) @@ -227,16 +230,16 @@ def _get_node_summary_statistics( def _calculate_graph_data( pipeline, data, is_data_dataframe, layout, layout_dim, color_variable, node_color_statistic, n_sig_figs, node_scale -): + ): graph = pipeline.fit_transform(data) - node_elements = graph["node_metadata"]["node_elements"] + node_elements = graph.vs["node_elements"] # Determine whether node_color_statistic is an array of node colors is_node_color_statistic_ndarray = hasattr(node_color_statistic, "dtype") if not (is_node_color_statistic_ndarray or callable(node_color_statistic)): raise ValueError( "`node_color_statistic` must be a callable or ndarray." 
- ) + ) # Compute the raw values of node summary statistics if is_node_color_statistic_ndarray: @@ -245,13 +248,13 @@ def _calculate_graph_data( node_colors_color_variable = _get_node_summary_statistics( data, is_data_dataframe, node_elements, node_color_statistic, color_variable - ) + ) # Load defaults for node and edge traces plot_options = { "node_trace": deepcopy(PLOT_OPTIONS_NODE_TRACE_DEFAULTS), "edge_trace": deepcopy(PLOT_OPTIONS_EDGE_TRACE_DEFAULTS) - } + } # Update size and color of nodes node_sizes = _get_node_size(node_elements) @@ -259,78 +262,72 @@ def _calculate_graph_data( "size": node_sizes, "sizeref": _set_node_sizeref(node_sizes, node_scale=node_scale), "color": node_colors_color_variable - }) + }) # Generate hovertext - node_ids = graph["node_metadata"]["node_id"] - num_node_elements = map(len, graph["node_metadata"]["node_elements"]) + node_ids = graph.vs.indices + pullback_set_ids = graph.vs["pullback_set_label"] + partial_cluster_labels = graph.vs["partial_cluster_label"] + num_node_elements = map(len, graph.vs["node_elements"]) node_colors_round = map( partial(_round_to_n_sig_figs, n=n_sig_figs), node_colors_color_variable - ) + ) plot_options["node_trace"]["hovertext"] = _get_node_text( - node_ids, num_node_elements, node_colors_round - ) + node_ids, pullback_set_ids, partial_cluster_labels, + num_node_elements, node_colors_round + ) # Compute graph layout - is_layout_ndarray = hasattr(layout, "dtype") - if is_layout_ndarray: - if layout.shape[1] not in [2, 3]: - raise ValueError( - f"If an ndarray, `layout` must be 2D with 2 or 3 columns. " - f"Array with {layout.shape[1]} columns passed." - ) - node_pos = layout - else: - if layout_dim not in [2, 3]: - raise ValueError( - f"`layout_dim` must be either 2 or 3. {layout_dim} entered." + if layout_dim not in [2, 3]: + raise ValueError( + f"`layout_dim` must be either 2 or 3. {layout_dim} entered." ) - node_pos = np.asarray(graph.layout(layout, dim=layout_dim).coords) + node_pos = np.asarray(graph.layout(layout, dim=layout_dim).coords) # Store x and y coordinates of edge endpoints edge_x = list( reduce( - operator.iconcat, map( + iconcat, map( lambda e: [node_pos[e.source, 0], node_pos[e.target, 0], None], graph.es - ), [] + ), [] + ) ) - ) edge_y = list( reduce( - operator.iconcat, map( + iconcat, map( lambda e: [node_pos[e.source, 1], node_pos[e.target, 1], None], graph.es - ), [] + ), [] + ) ) - ) if layout_dim == 2: node_trace = go.Scatter( x=node_pos[:, 0], y=node_pos[:, 1], **plot_options["node_trace"] - ) + ) edge_trace = go.Scatter( x=edge_x, y=edge_y, **plot_options["edge_trace"] - ) + ) else: node_trace = go.Scatter3d( x=node_pos[:, 0], y=node_pos[:, 1], z=node_pos[:, 2], **plot_options["node_trace"] - ) + ) edge_z = list( reduce( - operator.iconcat, map( + iconcat, map( lambda e: [node_pos[e.source][2], node_pos[e.target][2], None], graph.es - ), [] + ), [] + ) ) - ) edge_trace = go.Scatter3d( x=edge_x, y=edge_y, z=edge_z, **plot_options["edge_trace"]) @@ -389,7 +386,7 @@ def _get_colors_for_vals(vals, vmin, vmax, colorscale, return_hex=True): vals_rgb = ( left_endpts[:, 1:] + diff_ratios * vals_scaled[:, np.newaxis] + 0.5 - ).astype(np.uint8) + ).astype(np.uint8) if return_hex: return list(map(_rbg_to_hex, vals_rgb)) diff --git a/gtda/mapper/utils/decorators.py b/gtda/mapper/utils/decorators.py index 5c985c197..9e2a89f0b 100644 --- a/gtda/mapper/utils/decorators.py +++ b/gtda/mapper/utils/decorators.py @@ -9,7 +9,7 @@ def method_to_transform(cls, method_name): existing method. 
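# A small illustrative sketch of the idiom used above to build ``edge_x``,
# ``edge_y`` (and ``edge_z``): plotly breaks a line trace wherever a
# coordinate is ``None``, so interleaving ``None`` after each
# (source, target) pair lets all Mapper edges be drawn as a single trace
# while remaining visually disconnected. Toy coordinates only.
import plotly.graph_objects as go

edge_x = [0, 1, None, 2, 3, None]  # two edges: (0, 0)-(1, 1) and (2, 0)-(3, 1)
edge_y = [0, 1, None, 0, 1, None]
fig_demo = go.Figure(go.Scatter(x=edge_x, y=edge_y, mode="lines"))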
An example of use is for classes possessing a :meth:`score` method such as - kernel density estimators and anomaly/novelty detection estimators, to + kernel density estimators and anomaly/novelty detection estimators, allow for these estimators are to be used as steps in a pipeline. Note that 1D array outputs are reshaped into 2D column vectors before @@ -40,29 +40,33 @@ def method_to_transform(cls, method_name): >>> from gtda.mapper import method_to_transform >>> X = np.random.random((100, 2)) >>> kde = KernelDensity() - >>> kde_extended = method_to_transform( - ... KernelDensity, 'score_samples')() + + Extend ``KernelDensity`` to give it a ``transform`` method as an alias + of ``score_samples`` (up to output shape). The new class is instantiated + with the same parameters as the original one. + + >>> ExtendedKDE = method_to_transform(KernelDensity, 'score_samples') + >>> extended_kde = ExtendedKDE() >>> Xt = kde.fit(X).score_samples(X) >>> print(Xt.shape) (100,) - >>> Xt_extended = kde_extended.fit_transform(X) + >>> Xt_extended = extended_kde.fit_transform(X) >>> print(Xt_extended.shape) (100, 1) >>> np.array_equal(Xt, Xt_extended.flatten()) True """ - def wrapper(wrapped): - class ExtendedEstimator(wrapped, TransformerMixin): - def transform(self, X, y=None): - has_method = hasattr(self, method_name) - if has_method: - Xt = getattr(self, method_name)(X) - # reshape 1D estimators to have shape (n_samples, 1) - if Xt.ndim == 1: - Xt = Xt[:, None] - return Xt - ExtendedEstimator.__name__ = 'Extended' + wrapped.__name__ - return ExtendedEstimator - wrapped_cls = wrapper(cls) - return wrapped_cls + class ExtendedEstimator(cls, TransformerMixin): + def transform(self, X, y=None): + has_method = hasattr(self, method_name) + if has_method: + Xt = getattr(self, method_name)(X) + # reshape 1D estimators to have shape (n_samples, 1) + if Xt.ndim == 1: + Xt = Xt[:, None] + return Xt + + ExtendedEstimator.__name__ = 'Extended' + cls.__name__ + + return ExtendedEstimator diff --git a/gtda/mapper/utils/pipeline.py b/gtda/mapper/utils/pipeline.py index d33b4e0b8..cf4e6e5d6 100644 --- a/gtda/mapper/utils/pipeline.py +++ b/gtda/mapper/utils/pipeline.py @@ -13,9 +13,10 @@ def _make_func_apply_along_axis_1(func): def _reshape_after_apply(func, arr): - if func(arr).ndim == 1: - return func(arr).reshape(-1, 1) - return func(arr) + res = func(arr) + if res.ndim == 1: + res = res[:, None] + return res def transformer_from_callable_on_rows(func, validate=True): @@ -28,8 +29,8 @@ def transformer_from_callable_on_rows(func, validate=True): Parameters ---------- - func : callable - A callable object. + func : callable or None + A callable object, or ``None`` which returns the identity transformer. validate : bool, optional, default: ``True`` Whether the output transformer should implement input validation. 
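# A minimal sketch (illustration only) of the behaviour documented above:
# wrapping a row-wise callable produces a ``FunctionTransformer`` which
# applies the callable along axis 1 and reshapes 1D outputs into column
# vectors.
import numpy as np
from gtda.mapper.utils.pipeline import transformer_from_callable_on_rows

row_sum = transformer_from_callable_on_rows(np.sum)
Xt_demo = row_sum.fit_transform(np.arange(12).reshape(4, 3))
assert Xt_demo.shape == (4, 1)  # one summary value per row, as a column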
@@ -58,7 +59,10 @@ def transformer_from_callable_on_rows(func, validate=True): else: func_along_axis = partial(_reshape_after_apply, _make_func_apply_along_axis_1(func)) - return FunctionTransformer(func=func_along_axis, validate=validate) + else: + func_along_axis = None + + return FunctionTransformer(func=func_along_axis, validate=validate) def identity(validate=False): diff --git a/gtda/mapper/visualization.py b/gtda/mapper/visualization.py index e64641ce2..e00e519d0 100644 --- a/gtda/mapper/visualization.py +++ b/gtda/mapper/visualization.py @@ -8,7 +8,7 @@ import numpy as np import plotly.graph_objects as go -from ipywidgets import Layout, widgets +from ipywidgets import widgets, Layout, HTML from sklearn.base import clone from .utils._logging import OutputWidgetHandler @@ -17,7 +17,7 @@ _get_column_color_buttons, _get_colors_for_vals, PLOT_OPTIONS_LAYOUT_DEFAULTS -) + ) def plot_static_mapper_graph( @@ -25,13 +25,32 @@ def plot_static_mapper_graph( color_variable=None, node_color_statistic=None, color_by_columns_dropdown=False, clone_pipeline=True, n_sig_figs=3, node_scale=12, plotly_params=None -): - """Plotting function for static Mapper graphs. - - Nodes are colored according to `color_variable` and `node_color_statistic`. - By default, the hovertext on each node displays a globally unique ID for - the node, the number of data points associated with the node, and the - summary statistic which determines its color. + ): + """Plot Mapper graphs without interactivity on pipeline parameters. + + The output graph is a rendition of the :class:`igraph.Graph` object + computed by calling the :meth:`fit_transform` method of the + :class:`~gtda.mapper.pipeline.MapperPipeline` instance `pipeline` on the + input `data`. The graph's nodes correspond to subsets of elements (rows) in + `data`; these subsets are clusters in larger portions of `data` called + "pullback (cover) sets", which are computed by means of the `pipeline`'s + "filter function" and "cover" and correspond to the differently-colored + portions in `this diagram <../../../../_images/mapper_pipeline.svg>`_. + Two clusters from different pullback cover sets can overlap; if they do, an + edge between the corresponding nodes in the graph may be drawn. + + Nodes are colored according to `color_variable` and `node_color_statistic` + and are sized according to the number of elements they represent. The + hovertext on each node displays, in this order: + + - a globally unique ID for the node, which can be used to retrieve + node information from the :class:`igraph.Graph` object, see + :class:`~gtda.mapper.nerve.Nerve`; + - the label of the pullback (cover) set which the node's elements + form a cluster in; + - a label identifying the node as a cluster within that pullback set; + - the number of elements of `data` associated with the node; + - the value of the summary statistic which determines the node's color. Parameters ---------- @@ -44,7 +63,7 @@ def plot_static_mapper_graph( layout : None, str or callable, optional, default: ``"kamada-kawai"`` Layout algorithm for the graph. Can be any accepted value for the ``layout`` parameter in the :meth:`layout` method of - :class:`igraph.Graph`. [1]_ + :class:`igraph.Graph` [1]_. layout_dim : int, default: ``2`` The number of dimensions for the layout. Can be 2 or 3. @@ -85,7 +104,7 @@ def plot_static_mapper_graph( n_sig_figs : int or None, optional, default: ``3`` If not ``None``, number of significant figures to which to round node - node summary statistics. 
If ``None``, no rounding is performed. + summary statistics. If ``None``, no rounding is performed. node_scale : int or float, optional, default: ``12`` Sets the scale factor used to determine the rendered size of the @@ -112,6 +131,7 @@ def plot_static_mapper_graph( Setting a colorscale different from the default one: >>> import numpy as np + >>> np.random.seed(1) >>> from gtda.mapper import make_mapper_pipeline, plot_static_mapper_graph >>> pipeline = make_mapper_pipeline() >>> data = np.random.random((100, 3)) @@ -119,10 +139,16 @@ def plot_static_mapper_graph( >>> fig = plot_static_mapper_graph(pipeline, data, ... plotly_params=plotly_params) + Inspect the composition of a node with "Node ID" displayed as 0 in the + hovertext: + + >>> graph = pipeline.fit_transform(data) + >>> graph.vs[0]["node_elements"] + array([70]) + See also -------- - :func:`~gtda.mapper.visualization.plot_interactive_mapper_graph`, - :func:`~gtda.mapper.pipeline.make_mapper_pipeline` + plot_interactive_mapper_graph, gtda.mapper.make_mapper_pipeline References ---------- @@ -144,13 +170,13 @@ def plot_static_mapper_graph( _calculate_graph_data( _pipeline, data, is_data_dataframe, layout, layout_dim, color_variable, _node_color_statistic, n_sig_figs, node_scale - ) + ) # Define layout options layout_options = go.Layout( **PLOT_OPTIONS_LAYOUT_DEFAULTS["common"], **PLOT_OPTIONS_LAYOUT_DEFAULTS[layout_dim] - ) + ) fig = go.FigureWidget(data=[edge_trace, node_trace], layout=layout_options) @@ -169,17 +195,17 @@ def plot_static_mapper_graph( fig.update_traces( hoverlabel_bgcolor=_plotly_params["node_trace"].pop( "hoverlabel_bgcolor" - ), + ), selector={"name": "node_trace"} - ) + ) compute_hoverlabel_bgcolor = False if "marker_colorscale" in _plotly_params["node_trace"]: fig.update_traces( marker_colorscale=_plotly_params["node_trace"].pop( "marker_colorscale" - ), + ), selector={"name": "node_trace"} - ) + ) if compute_hoverlabel_bgcolor: colorscale_for_hoverlabel = fig.data[1].marker.colorscale @@ -190,22 +216,24 @@ def plot_static_mapper_graph( hoverlabel_bgcolor = _get_colors_for_vals( node_colors_color_variable, min_col, max_col, colorscale_for_hoverlabel - ) + ) except Exception as e: if e.args[0] == "This colorscale is not supported.": warn("Data-dependent background hoverlabel colors cannot " "be generated with this choice of colorscale. Please " - "use a standard hex- or RGB-formatted colorscale.") + "use a standard hex- or RGB-formatted colorscale.", + RuntimeWarning) else: warn("Something went wrong in generating data-dependent " "background hoverlabel colors. 
All background " - "hoverlabel colors will be set to white.") + "hoverlabel colors will be set to white.", + RuntimeWarning) hoverlabel_bgcolor = "white" colorscale_for_hoverlabel = None fig.update_traces( hoverlabel_bgcolor=hoverlabel_bgcolor, selector={"name": "node_trace"} - ) + ) # Compute node colors according to data columns only if necessary if color_by_columns_dropdown: @@ -214,7 +242,7 @@ def plot_static_mapper_graph( data, is_data_dataframe, node_elements, node_colors_color_variable, _node_color_statistic, hovertext_color_variable, colorscale_for_hoverlabel, n_sig_figs - ) + ) # Avoid recomputing hoverlabel bgcolor for top button column_color_buttons[0]["args"][0]["hoverlabel.bgcolor"] = \ [None, fig.data[1].hoverlabel.bgcolor] @@ -224,30 +252,27 @@ def plot_static_mapper_graph( button_height = 1.1 fig.update_layout( updatemenus=[ - go.layout.Updatemenu( - buttons=column_color_buttons, - direction="down", - pad={"r": 10, "t": 10}, - showactive=True, - x=0.11, - xanchor="left", - y=button_height, - yanchor="top" - ), - ]) + go.layout.Updatemenu(buttons=column_color_buttons, + direction="down", + pad={"r": 10, "t": 10}, + showactive=True, + x=0.11, + xanchor="left", + y=button_height, + yanchor="top") + ] + ) if color_by_columns_dropdown: fig.add_annotation( - go.layout.Annotation( - text="Color by:", - x=0, - xref="paper", - y=button_height - 0.045, - yref="paper", - align="left", - showarrow=False + go.layout.Annotation(text="Color by:", + x=0, + xref="paper", + y=button_height - 0.045, + yref="paper", + align="left", + showarrow=False) ) - ) # Update traces and layout according to user input if _plotly_params: @@ -255,7 +280,7 @@ def plot_static_mapper_graph( fig.update_traces( _plotly_params.pop(key, None), selector={"name": key} - ) + ) fig.update_layout(_plotly_params.pop("layout", None)) return fig @@ -266,15 +291,12 @@ def plot_interactive_mapper_graph( color_variable=None, node_color_statistic=None, clone_pipeline=True, color_by_columns_dropdown=False, n_sig_figs=3, node_scale=12, plotly_params=None -): - """Plotting function for interactive Mapper graphs. + ): + """Plot Mapper graphs with interactivity on pipeline parameters. - Provides functionality to interactively update parameters from the cover - and clustering steps defined in `pipeline`. Nodes are colored according to - `color_variable` and `node_color_statistic`. By default, the hovertext on - each node displays a globally unique ID for the node, the number of data - points associated with the node, and the summary statistic which determines - its color. + Extends :func:`~gtda.mapper.visualization.plot_static_mapper_graph` by + providing functionality to interactively update parameters from the cover, + clustering and graph construction steps defined in `pipeline`. Parameters ---------- @@ -287,7 +309,7 @@ def plot_interactive_mapper_graph( layout : None, str or callable, optional, default: ``"kamada-kawai"`` Layout algorithm for the graph. Can be any accepted value for the ``layout`` parameter in the :meth:`layout` method of - :class:`igraph.Graph`. [1]_ + :class:`igraph.Graph` [1]_. layout_dim : int, default: ``2`` The number of dimensions for the layout. Can be 2 or 3. @@ -325,7 +347,7 @@ def plot_interactive_mapper_graph( n_sig_figs : int or None, optional, default: ``3`` If not ``None``, number of significant figures to which to round node - node summary statistics. If ``None``, no rounding is performed. + summary statistics. If ``None``, no rounding is performed. 
node_scale : int or float, optional, default: ``12`` Sets the scale factor used to determine the rendered size of the @@ -349,8 +371,7 @@ def plot_interactive_mapper_graph( See also -------- - :func:`~gtda.mapper.visualization.plot_static_mapper_graph`, - :func:`~gtda.mapper.pipeline.make_mapper_pipeline` + plot_static_mapper_graph, gtda.mapper.pipeline.make_mapper_pipeline References ---------- @@ -365,32 +386,47 @@ def plot_interactive_mapper_graph( _node_color_statistic = node_color_statistic or np.mean - def get_widgets_per_param(param, value): - if isinstance(value, float): - return (param, widgets.FloatText( - value=value, - step=0.05, - description=param.split("__")[1], - continuous_update=False, - disabled=False - )) - elif isinstance(value, int): - return (param, widgets.IntText( - value=value, - step=1, - description=param.split("__")[1], - continuous_update=False, - disabled=False - )) - elif isinstance(value, str): - return (param, widgets.Text( - value=value, - description=param.split("__")[1], - continuous_update=False, - disabled=False - )) - else: - return None + def get_widgets_per_param(params): + for key, value in params.items(): + style = {'description_width': 'initial'} + description = key.split("__")[1] if "__" in key else key + if isinstance(value, float): + yield (key, widgets.FloatText( + value=value, + step=0.05, + description=description, + continuous_update=False, + disabled=False, + layout=Layout(width="90%"), + style=style + )) + elif isinstance(value, bool): + yield (key, widgets.ToggleButton( + value=value, + description=description, + disabled=False, + layout=Layout(width="90%"), + style=style + )) + elif isinstance(value, int): + yield (key, widgets.IntText( + value=value, + step=1, + description=description, + continuous_update=False, + disabled=False, + layout=Layout(width="90%"), + style=style + )) + elif isinstance(value, str): + yield (key, widgets.Text( + value=value, + description=description, + continuous_update=False, + disabled=False, + layout=Layout(width="90%"), + style=style + )) def on_parameter_change(change): handler.clear_logs() @@ -399,37 +435,37 @@ def on_parameter_change(change): if isinstance(value, (int, float, str)): _pipeline.set_params( **{param: cover_params_widgets[param].value} - ) + ) for param, value in cluster_params.items(): if isinstance(value, (int, float, str)): _pipeline.set_params( **{param: cluster_params_widgets[param].value} - ) + ) + for param, value in nerve_params.items(): + if isinstance(value, (int, bool)): + _pipeline.set_params( + **{param: nerve_params_widgets[param].value} + ) logger.info("Updating figure...") with fig.batch_update(): - ( - edge_trace, node_trace, node_elements, - node_colors_color_variable - ) = _calculate_graph_data( + (edge_trace, node_trace, node_elements, + node_colors_color_variable) = _calculate_graph_data( _pipeline, data, is_data_dataframe, layout, layout_dim, color_variable, _node_color_statistic, n_sig_figs, node_scale - ) - if colorscale_for_hoverlabel is not None: - node_colors_color_variable = np.asarray( - node_colors_color_variable ) + if colorscale_for_hoverlabel is not None: + node_colors_color_variable = \ + np.asarray(node_colors_color_variable) min_col = np.min(node_colors_color_variable) max_col = np.max(node_colors_color_variable) hoverlabel_bgcolor = _get_colors_for_vals( node_colors_color_variable, min_col, max_col, colorscale_for_hoverlabel - ) - fig.update_traces( - hoverlabel_bgcolor=hoverlabel_bgcolor, - selector={"name": "node_trace"} - ) + ) + 
fig.update_traces(hoverlabel_bgcolor=hoverlabel_bgcolor, + selector={"name": "node_trace"}) fig.update_traces( x=node_trace.x, @@ -440,13 +476,13 @@ def on_parameter_change(change): hovertext=node_trace.hovertext, **({"z": node_trace.z} if layout_dim == 3 else dict()), selector={"name": "node_trace"} - ) + ) fig.update_traces( x=edge_trace.x, y=edge_trace.y, **({"z": edge_trace.z} if layout_dim == 3 else dict()), selector={"name": "edge_trace"} - ) + ) # Update color by column buttons if color_by_columns_dropdown: @@ -456,7 +492,7 @@ def on_parameter_change(change): node_colors_color_variable, _node_color_statistic, hovertext_color_variable, colorscale_for_hoverlabel, n_sig_figs - ) + ) # Avoid recomputing hoverlabel bgcolor for top button if colorscale_for_hoverlabel is not None: column_color_buttons[0]["args"][0][ @@ -476,8 +512,8 @@ def on_parameter_change(change): xanchor="left", y=button_height, yanchor="top" - ), - ]) + ) + ]) valid.value = True except Exception: @@ -509,49 +545,32 @@ def click_box(change): logger.addHandler(handler) logger.setLevel(logging.INFO) - # Initialise cover and cluster dictionaries of parameters and widgets - cover_params = dict( - filter( - lambda x: x[0].startswith("cover"), - _pipeline.get_mapper_params().items() - ) - ) - cover_params_widgets = dict( - filter( - None, map( - lambda x: get_widgets_per_param(*x), - cover_params.items() - ) - ) - ) - cluster_params = dict( - filter( - lambda x: x[0].startswith("clusterer"), - _pipeline.get_mapper_params().items() - ) - ) - cluster_params_widgets = dict( - filter( - None, map( - lambda x: get_widgets_per_param(*x), - cluster_params.items() - ) - ) - ) + # Initialise cover, cluster and nerve dictionaries of parameters and + # widgets + mapper_params_items = _pipeline.get_mapper_params().items() + cover_params = {key: value for key, value in mapper_params_items + if key.startswith("cover__")} + cover_params_widgets = dict(get_widgets_per_param(cover_params)) + cluster_params = {key: value for key, value in mapper_params_items + if key.startswith("clusterer__")} + cluster_params_widgets = dict(get_widgets_per_param(cluster_params)) + nerve_params = {key: value for key, value in mapper_params_items + if key in ["min_intersection", "contract_nodes"]} + nerve_params_widgets = dict(get_widgets_per_param(nerve_params)) # Initialise widgets for validating input parameters of pipeline valid = widgets.Valid( value=True, description="Valid parameters", style={"description_width": "100px"}, - ) + ) # Initialise widget for showing the logs logs_box = widgets.Checkbox( description="Show logs: ", value=False, indent=False - ) + ) # Initialise figure with initial pipeline and config fig = plot_static_mapper_graph( @@ -561,7 +580,7 @@ def click_box(change): color_by_columns_dropdown=color_by_columns_dropdown, clone_pipeline=False, n_sig_figs=n_sig_figs, node_scale=node_scale, plotly_params=plotly_params - ) + ) # Store variables for later updates is_data_dataframe = hasattr(data, "columns") @@ -569,7 +588,7 @@ def click_box(change): colorscale_for_hoverlabel = None if layout_dim == 3: # In plot_static_mapper_graph, hoverlabel bgcolors are set to white if - # something goes wrong computing them according to the colorscale. + # something goes wrong in computing them according to the colorscale. 
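# A minimal usage sketch (toy data, illustration only): the function returns
# an ``ipywidgets.VBox`` holding the parameter widgets, the figure, the
# validity indicator and the logs checkbox; with ``clone_pipeline=False``,
# values entered in the widgets are written back to ``pipe_toy`` via
# ``set_params``.
import numpy as np
from gtda.mapper import make_mapper_pipeline, plot_interactive_mapper_graph

X_toy = np.random.random((100, 2))
pipe_toy = make_mapper_pipeline()
box = plot_interactive_mapper_graph(pipe_toy, X_toy, clone_pipeline=False)
box  # display in a notebook cell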
is_bgcolor_not_white = fig.data[1].hoverlabel.bgcolor != "white" user_hoverlabel_bgcolor = False if plotly_params: @@ -581,22 +600,32 @@ def click_box(change): observe_widgets(cover_params, cover_params_widgets) observe_widgets(cluster_params, cluster_params_widgets) + observe_widgets(nerve_params, nerve_params_widgets) logs_box.observe(click_box, names="value") # Define containers for input widgets - container_cover = widgets.HBox( - children=list(cover_params_widgets.values()) - ) + cover_title = HTML(value="Cover parameters") + container_cover = widgets.VBox( + children=[cover_title] + list(cover_params_widgets.values()) + ) + container_cover.layout.align_items = 'center' - container_cluster_layout = Layout(display="flex", flex_flow="row wrap") + cluster_title = HTML(value="Clusterer parameters") + container_cluster = widgets.VBox( + children=[cluster_title] + list(cluster_params_widgets.values()), + ) + container_cluster.layout.align_items = 'center' - container_cluster = widgets.HBox( - children=list(cluster_params_widgets.values()), - layout=container_cluster_layout - ) + nerve_title = HTML(value="Nerve parameters") + container_nerve = widgets.VBox( + children=[nerve_title] + list(nerve_params_widgets.values()), + ) + container_nerve.layout.align_items = 'center' - box = widgets.VBox( - [container_cover, container_cluster, fig, valid, logs_box, out] - ) + container_parameters = widgets.HBox( + children=[container_cover, container_cluster, container_nerve] + ) + + box = widgets.VBox([container_parameters, fig, valid, logs_box, out]) return box diff --git a/gtda/metaestimators/__init__.py b/gtda/metaestimators/__init__.py new file mode 100644 index 000000000..1a422cc1d --- /dev/null +++ b/gtda/metaestimators/__init__.py @@ -0,0 +1,8 @@ +"""The module :mod:`gtda.metaestimators` implements meta-estimators, i.e. +estimators which take other estimators as parameters.""" + +from .collection_transformer import CollectionTransformer + +__all__ = [ + 'CollectionTransformer' + ] diff --git a/gtda/metaestimators/collection_transformer.py b/gtda/metaestimators/collection_transformer.py new file mode 100644 index 000000000..b4c5b37d4 --- /dev/null +++ b/gtda/metaestimators/collection_transformer.py @@ -0,0 +1,190 @@ +"""CollectionTransformer meta-estimator.""" +# License: GNU AGPLv3 + +from functools import reduce +from operator import and_ +from warnings import warn + +import numpy as np +from joblib import Parallel, delayed +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.base import clone +from sklearn.utils.metaestimators import if_delegate_has_method + +from gtda.utils import check_collection + + +class CollectionTransformer(BaseEstimator, TransformerMixin): + """Meta-transformer for applying a fit-transformer to each input in a + collection. + + If `transformer` possesses a ``fit_transform`` method, + ``CollectionTransformer(transformer)`` also possesses a + :meth:`fit_transform` method which, on each entry in its input ``X``, + fit-transforms a clone of `transformer`. A collection (list or ndarray) of + outputs is returned. + + Note: to have compatibility with scikit-learn and giotto-tda pipelines, a + :meth:`transform` method is also present but it is simply an alias for + :meth:`fit_transform`. + + Parameters + ---------- + transformer : object + The fit-transformer instance from which the transformer acting on + collections is built. Should implement ``fit_transform``. 
+ + n_jobs : int or None, optional, default: ``None`` + The number of jobs to use in a joblib-parallel application of + `transformer`'s ``fit_transform`` to each input. ``None`` means 1 + unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using + all processors. + + parallel_backend_prefer : ``"processes"`` | ``"threads"`` | ``None``, \ + optional, default: ``None`` + Soft hint for the default joblib backend to use in a joblib-parallel + application of `transformer`'s ``fit_transform`` to each input. See + [1]_. + + parallel_backend_require : ``"sharedmem"`` or None, optional, default: \ + ``None`` + Hard constraint to select the backend. If set to ``'sharedmem'``, the + selected backend will be single-host and thread-based even if the user + asked for a non-thread based backend with parallel_backend. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.decomposition import PCA + >>> from gtda.metaestimators import CollectionTransformer + >>> rng = np.random.default_rng() + + Create a collection of 1000 2D inputs for PCA, as a single 3D ndarray (we + could also create a list of 2D inputs instead). + + >>> X = rng.random((1000, 100, 50)) + + In the case of PCA, joblib parallelism can be very beneficial! + + >>> multi_pca = CollectionTransformer(PCA(n_components=3), n_jobs=-1) + >>> Xt = multi_pca.fit_transform(X) + + Since all PCA outputs have the same shape, ``Xt`` is an ndarray. + >>> print(Xt.shape) + (1000, 100, 3) + + See also + -------- + gtda.mapper.utils.pipeline.transformer_from_callable_on_rows, \ + gtda.mapper.utils.decorators.method_to_transform + + References + ---------- + .. [1] "Thread-based parallelism vs process-based parallelism", in + `joblib documentation + `_. + + """ + + def __init__(self, transformer, n_jobs=None, parallel_backend_prefer=None, + parallel_backend_require=None): + self.transformer = transformer + self.n_jobs = n_jobs + self.parallel_backend_prefer = parallel_backend_prefer + self.parallel_backend_require = parallel_backend_require + + def _validate_transformer(self): + if not hasattr(self.transformer, "fit_transform"): + raise TypeError("`transformer` must possess a fit_transform " + "method.") + if not isinstance(self.transformer, BaseEstimator): + warn("`transformer` is not an instance of " + "sklearn.base.BaseEstimator. This will lead to limited " + "functionality in a scikit-learn context.", UserWarning) + + def fit(self, X, y=None): + """Do nothing and return the estimator unchanged. + + This method is here to implement the usual scikit-learn API and hence + work in pipelines. + + Parameters + ---------- + X : list of length n_samples, or ndarray of shape (n_samples, ...) + Collection of inputs to be fit-transformed by `transformer`. + + y : None + There is no need for a target in a transformer, yet the pipeline + API requires this parameter. + + Returns + ------- + self : object + + """ + check_collection(X, accept_sparse=True, accept_large_sparse=True, + force_all_finite=False) + self._validate_transformer() + + self._is_fitted = True + return self + + @if_delegate_has_method(delegate="transformer") + def fit_transform(self, X, y=None): + """Fit-transform a clone of `transformer` to each element in the + collection `X`. + + Parameters + ---------- + X : list of length n_samples, or ndarray of shape (n_samples, ...) + Collection of inputs to be fit-transformed by `transformer`. + + y : None + There is no need for a target in a transformer, yet the pipeline + API requires this parameter. 
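# A hedged sketch (illustration only) of chaining the meta-estimator in a
# scikit-learn pipeline: because ``transform`` aliases ``fit_transform``, the
# wrapped transformer can preprocess a whole collection of point clouds
# before a giotto-tda step, here assumed to be
# ``gtda.homology.VietorisRipsPersistence``.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from gtda.homology import VietorisRipsPersistence
from gtda.metaestimators import CollectionTransformer

X_clouds = np.random.random((20, 50, 10))  # 20 point clouds in 10 dimensions
pipe_clouds = Pipeline([
    ("reduce", CollectionTransformer(PCA(n_components=3), n_jobs=-1)),
    ("persistence", VietorisRipsPersistence())
])
diagrams = pipe_clouds.fit_transform(X_clouds)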
+ + Returns + ------- + Xt : list of length n_samples, or ndarray of shape (n_samples, ...) + Collection of outputs. It is a list unless all outputs have the + same shape, in which case it is converted to an ndarray. + + """ + Xt = check_collection(X, accept_sparse=True, accept_large_sparse=True, + force_all_finite=False) + self._validate_transformer() + + Xt = Parallel(n_jobs=self.n_jobs, prefer=self.parallel_backend_prefer, + require=self.parallel_backend_require)( + delayed(clone(self.transformer).fit_transform)(x) for x in Xt + ) + + x0_shape = Xt[0].shape + if reduce(and_, (x.shape == x0_shape for x in Xt), True): + Xt = np.asarray(Xt) + + return Xt + + def transform(self, X, y=None): + """Alias for :meth:`fit_transform`. + + Allows for this class to be used as an intermediate step in a + scikit-learn pipeline. + + Parameters + ---------- + X : list of length n_samples, or ndarray of shape (n_samples, ...) + Collection of inputs to be fit-transformed by `transformer`. + + y : None + There is no need for a target in a transformer, yet the pipeline + API requires this parameter. + + Returns + ------- + Xt : list of length n_samples, or ndarray of shape (n_samples, ...) + Collection of outputs. It is a list unless all outputs have the + same shape, in which case it is converted to an ndarray. + + """ + return self.fit_transform(X, y) diff --git a/gtda/metaestimators/tests/__init__.py b/gtda/metaestimators/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/gtda/metaestimators/tests/test_collection_transformer.py b/gtda/metaestimators/tests/test_collection_transformer.py new file mode 100644 index 000000000..a6de70728 --- /dev/null +++ b/gtda/metaestimators/tests/test_collection_transformer.py @@ -0,0 +1,71 @@ +"""Tests for CollectionTransformer.""" +# License: GNU AGPLv3 + +import numpy as np +import pytest +from numpy.testing import assert_almost_equal +from sklearn.decomposition import PCA + +from gtda.metaestimators import CollectionTransformer + +rng = np.random.default_rng() + +X_arr = rng.random((200, 100, 50)) +X_list = list(X_arr) + + +def test_collection_transformer_input_with_nan(): + multi_pca = CollectionTransformer(PCA()) + X = X_arr.copy() + X[0, 0, 0] = np.nan + + with pytest.raises(ValueError): + multi_pca.fit(X) + + +def test_collection_transformer_invalid_transformer(): + multi_pca = CollectionTransformer(np.mean) + + with pytest.raises(TypeError): + multi_pca.fit(X_arr) + + +def test_collection_transformer_is_fitted(): + multi_pca = CollectionTransformer(PCA()) + multi_pca.fit(X_arr) + + assert multi_pca._is_fitted + + +def test_collection_transformer_no_baseestimator_warn(): + class TestTransformer: + def __init__(self): + pass + + def fit_transform(self): + pass + + test_transformer = TestTransformer() + with pytest.warns(UserWarning): + CollectionTransformer(test_transformer).fit(X_arr) + + +@pytest.mark.parametrize("X", [X_arr, X_list]) +@pytest.mark.parametrize("n_jobs", [None, 2, -1]) +def test_collection_transformer_fit_transform(X, n_jobs): + n_components = 3 + pca = PCA(n_components=n_components) + multi_pca = CollectionTransformer(pca, n_jobs=n_jobs) + Xt = multi_pca.fit_transform(X) + assert Xt.shape == (len(X), len(X[0]), n_components) + + first_few_outputs_actual = Xt[:10] + first_few_outputs_exp = np.asarray([pca.fit_transform(x) for x in X[:10]]) + assert_almost_equal(first_few_outputs_actual, first_few_outputs_exp) + + +def test_collection_transformer_transform(): + """Test that transform is an alias of fit-transform.""" + pca 
= PCA() + assert_almost_equal(CollectionTransformer(pca).fit_transform(X_arr), + CollectionTransformer(pca).transform(X_arr)) diff --git a/gtda/pipeline.py b/gtda/pipeline.py index 3f311b076..fc3846f25 100644 --- a/gtda/pipeline.py +++ b/gtda/pipeline.py @@ -69,7 +69,7 @@ class Pipeline(pipeline.Pipeline): >>> X = np.random.rand(600, 1) >>> n_train, n_test = 400, 200 >>> - >>> labeller = ts.Labeller(width=5, percentiles=[80], + >>> labeller = ts.Labeller(size=6, percentiles=[80], >>> n_steps_future=1) >>> X_train = X[:n_train] >>> y_train = X_train @@ -78,8 +78,8 @@ class Pipeline(pipeline.Pipeline): >>> print(X_train.shape, y_train.shape) (395, 1) (395,) >>> steps = [ - >>> ('embedding', ts.TakensEmbedding()), - >>> ('window', ts.SlidingWindow(width=5, stride=1)), + >>> ('embedding', ts.SingleTakensEmbedding()), + >>> ('window', ts.SlidingWindow(size=6, stride=1)), >>> ('diagram', hl.VietorisRipsPersistence()), >>> ('rescaler', diag.Scaler()), >>> ('filter', diag.Filtering(epsilon=0.1)), diff --git a/gtda/plotting/__init__.py b/gtda/plotting/__init__.py index b812d3178..421f2e05e 100644 --- a/gtda/plotting/__init__.py +++ b/gtda/plotting/__init__.py @@ -12,4 +12,4 @@ 'plot_heatmap', 'plot_betti_curves', 'plot_betti_surfaces' -] + ] diff --git a/gtda/plotting/diagram_representations.py b/gtda/plotting/diagram_representations.py index fe25570b0..50409f75b 100644 --- a/gtda/plotting/diagram_representations.py +++ b/gtda/plotting/diagram_representations.py @@ -5,7 +5,8 @@ import plotly.graph_objs as gobj -def plot_betti_curves(betti_numbers, samplings, homology_dimensions=None): +def plot_betti_curves(betti_numbers, samplings, homology_dimensions=None, + plotly_params=None): """Plot Betti curves by homology dimension. Parameters @@ -23,6 +24,18 @@ def plot_betti_curves(betti_numbers, samplings, homology_dimensions=None): Which homology dimensions to include in the plot. If ``None``, all available homology dimensions will be used. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"traces"`` and ``"layout"``, and the corresponding values should be + dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Figure representing the Betti curves. 
+ """ if homology_dimensions is None: _homology_dimensions = list(range(betti_numbers.shape[0])) @@ -40,7 +53,7 @@ def plot_betti_curves(betti_numbers, samplings, homology_dimensions=None): "zeroline": True, "showexponent": "all", "exponentformat": "e" - }, + }, "yaxis1": { "title": "Betti number", "side": "left", @@ -51,27 +64,33 @@ def plot_betti_curves(betti_numbers, samplings, homology_dimensions=None): "zeroline": True, "showexponent": "all", "exponentformat": "e" - }, + }, "plot_bgcolor": "white" - } + } + fig = gobj.Figure(layout=layout) - fig.update_xaxes(zeroline=True, linewidth=1, linecolor='black', + fig.update_xaxes(zeroline=True, linewidth=1, linecolor="black", mirror=False) - fig.update_yaxes(zeroline=True, linewidth=1, linecolor='black', + fig.update_yaxes(zeroline=True, linewidth=1, linecolor="black", mirror=False) for dim in _homology_dimensions: fig.add_trace(gobj.Scatter(x=samplings[dim], y=betti_numbers[dim], - mode='lines', showlegend=True, - hoverinfo='none', - name=f'H{int(dim)}')) + mode="lines", showlegend=True, + hoverinfo="none", + name=f"H{int(dim)}")) + + # Update traces and layout according to user input + if plotly_params: + fig.update_traces(plotly_params.get("traces", None)) + fig.update_layout(plotly_params.get("layout", None)) - fig.show() + return fig def plot_betti_surfaces(betti_curves, samplings=None, - homology_dimensions=None): + homology_dimensions=None, plotly_params=None): """Plot Betti surfaces (Betti numbers against "time" and filtration parameter) by homology dimension. @@ -97,6 +116,22 @@ def plot_betti_surfaces(betti_curves, samplings=None, on the x-axis against the corresponding values in `betti_curves` on the y-axis. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"traces"`` and ``"layout"``, and the corresponding values should be + dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + figs/fig : tuple of :class:`plotly.graph_objects.Figure`/\ + :class:`plotly.graph_objects.Figure` object + If ``n_samples > 1``, a tuple of figures representing the Betti + surfaces, with one figure per dimension in `homology_dimensions`. + Otherwise, a single figure representing the Betti curve of the + single sample present. 
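# A minimal sketch (synthetic arrays, illustration only) of the shapes these
# two functions expect: one Betti curve per homology dimension for
# ``plot_betti_curves``, and an extra leading samples axis for
# ``plot_betti_surfaces``. Both now return figures instead of showing them.
import numpy as np
from gtda.plotting import plot_betti_curves, plot_betti_surfaces

samplings = np.tile(np.linspace(0, 1, 100), (2, 1))   # (n_dims, n_bins)
betti_numbers = np.random.randint(0, 5, (2, 100))      # (n_dims, n_bins)
fig_curves = plot_betti_curves(betti_numbers, samplings)

betti_curves = np.random.randint(0, 5, (10, 2, 100))   # (n_samples, n_dims, n_bins)
figs_surfaces = plot_betti_surfaces(betti_curves, samplings=samplings)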
+ """ if homology_dimensions is None: _homology_dimensions = list(range(betti_curves.shape[1])) @@ -109,31 +144,44 @@ def plot_betti_surfaces(betti_curves, samplings=None, "type": "linear", "showexponent": "all", "exponentformat": "e" - }, + }, "yaxis": { "title": "Time", "type": "linear", "showexponent": "all", "exponentformat": "e" - }, + }, "zaxis": { "title": "Betti number", "type": "linear", "showexponent": "all", "exponentformat": "e" + } } - } + if betti_curves.shape[0] == 1: - plot_betti_curves(betti_curves[0], samplings, homology_dimensions) + return plot_betti_curves( + betti_curves[0], samplings, + homology_dimensions=homology_dimensions, + plotly_params=plotly_params + ) else: + figs = [] for dim in _homology_dimensions: fig = gobj.Figure() fig.update_layout(scene=scene, - title="Betti surface for homology " - "dimension {}".format(int(dim))) + title=f"Betti surface for homology " + f"dimension {int(dim)}") fig.add_trace(gobj.Surface(x=samplings[dim], y=np.arange(betti_curves.shape[0]), z=betti_curves[:, dim], - connectgaps=True, hoverinfo='none')) + connectgaps=True, hoverinfo="none")) + + # Update traces and layout according to user input + if plotly_params: + fig.update_traces(plotly_params.get("traces", None)) + fig.update_layout(plotly_params.get("layout", None)) + + figs.append(fig) - fig.show() + return tuple(figs) diff --git a/gtda/plotting/images.py b/gtda/plotting/images.py index be8d7e25c..b8304c163 100644 --- a/gtda/plotting/images.py +++ b/gtda/plotting/images.py @@ -4,8 +4,8 @@ import plotly.graph_objects as gobj -def plot_heatmap(data, x=None, y=None, colorscale='greys', origin='upper', - title=None): +def plot_heatmap(data, x=None, y=None, colorscale="greys", origin="upper", + title=None, plotly_params=None): """Plot a 2D single-channel image, as a heat map from 2D array data. Parameters @@ -19,29 +19,44 @@ def plot_heatmap(data, x=None, y=None, colorscale='greys', origin='upper', y : ndarray of shape (n_pixels_y,) or None, optional, default: ``None`` Vertical coordinates of the pixels in `data`. - colorscale : str, optional, default: ``'greys'`` + colorscale : str, optional, default: ``"greys"`` Color scale to be used in the heat map. Can be anything allowed by :class:`plotly.graph_objects.Heatmap`. - origin : ``'upper'`` | ``'lower'``, optional, default: ``'upper'`` + origin : ``"upper"`` | ``"lower"``, optional, default: ``"upper"`` Position of the [0, 0] pixel of `data`, in the upper left or lower - left corner. The convention ``'upper'`` is typically used for + left corner. The convention ``"upper"`` is typically used for matrices and images. title : str or None, optional, default: ``None`` Title of the resulting figure. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"trace"`` and ``"layout"``, and the corresponding values should be + dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Figure representing the 2D single-channel image. 
+ """ - autorange = True if origin == 'lower' else 'reversed' - layout = dict( - xaxis=dict(scaleanchor='y', constrain='domain'), - yaxis=dict(autorange=autorange, constrain='domain'), - plot_bgcolor='white', - title=title - ) + autorange = True if origin == "lower" else "reversed" + layout = { + "xaxis": {"scaleanchor": "y", "constrain": "domain"}, + "yaxis": {"autorange": autorange, "constrain": "domain"}, + "plot_bgcolor": "white", + "title": title + } fig = gobj.Figure(layout=layout) - fig.add_trace(gobj.Heatmap( - z=data, x=x, y=y, colorscale=colorscale - )) + fig.add_trace(gobj.Heatmap(z=data * 1, x=x, y=y, colorscale=colorscale)) + + # Update trace and layout according to user input + if plotly_params: + fig.update_traces(plotly_params.get("trace", None)) + fig.update_layout(plotly_params.get("layout", None)) - fig.show() + return fig diff --git a/gtda/plotting/persistence_diagrams.py b/gtda/plotting/persistence_diagrams.py index e4e5e3eb3..a01fd4afd 100644 --- a/gtda/plotting/persistence_diagrams.py +++ b/gtda/plotting/persistence_diagrams.py @@ -5,7 +5,7 @@ import plotly.graph_objs as gobj -def plot_diagram(diagram, homology_dimensions=None, **input_layout): +def plot_diagram(diagram, homology_dimensions=None, plotly_params=None): """Plot a single persistence diagram. Parameters @@ -19,68 +19,122 @@ def plot_diagram(diagram, homology_dimensions=None, **input_layout): Homology dimensions which will appear on the plot. If ``None``, all homology dimensions which appear in `diagram` will be plotted. - """ - from ..diagrams._utils import _subdiagrams # To avoid circular imports + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"traces"`` and ``"layout"``, and the corresponding values should be + dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Figure representing the persistence diagram. 
+ """ # TODO: increase the marker size if homology_dimensions is None: homology_dimensions = np.unique(diagram[:, 2]) - max_filt_param = np.where(np.isinf(diagram), -np.inf, diagram).max() + diagram = diagram[diagram[:, 0] != diagram[:, 1]] + diagram_no_dims = diagram[:, :2] + posinfinite_mask = np.isposinf(diagram_no_dims) + neginfinite_mask = np.isneginf(diagram_no_dims) + max_val = np.max(np.where(posinfinite_mask, -np.inf, diagram_no_dims)) + min_val = np.min(np.where(neginfinite_mask, np.inf, diagram_no_dims)) + parameter_range = max_val - min_val + extra_space_factor = 0.02 + has_posinfinite_death = np.any(posinfinite_mask[:, 1]) + if has_posinfinite_death: + posinfinity_val = max_val + 0.1 * parameter_range + extra_space_factor += 0.1 + extra_space = extra_space_factor * parameter_range + min_val_display = min_val - extra_space + max_val_display = max_val + extra_space - layout = dict( - width=500, - height=500, - xaxis1=dict( - title='Birth', - side='bottom', - type='linear', - range=[0, 1.1 * max_filt_param], - ticks='outside', - anchor='y1', - showline=True, - zeroline=True, - showexponent='all', - exponentformat='e' - ), - yaxis1=dict( - title='Death', - side='left', - type='linear', - range=[0, 1.1 * max_filt_param], - ticks='outside', - anchor='x1', - showline=True, - zeroline=True, - showexponent='all', - exponentformat='e' - ), - plot_bgcolor='white' - ) + fig = gobj.Figure() + fig.add_trace(gobj.Scatter( + x=[min_val_display, max_val_display], + y=[min_val_display, max_val_display], + mode="lines", + line={"dash": "dash", "width": 1, "color": "black"}, + showlegend=False, + hoverinfo="none" + )) - layout.update(input_layout) + for dim in homology_dimensions: + name = f"H{int(dim)}" if dim != np.inf else "Any homology dimension" + subdiagram = diagram[diagram[:, 2] == dim] + unique, inverse, counts = np.unique( + subdiagram, axis=0, return_inverse=True, return_counts=True + ) + hovertext = [ + f"{tuple(unique[unique_row_index][:2])}" + + ( + f", multiplicity: {counts[unique_row_index]}" + if counts[unique_row_index] > 1 else "" + ) + for unique_row_index in inverse + ] + y = subdiagram[:, 1] + if has_posinfinite_death: + y[np.isposinf(y)] = posinfinity_val + fig.add_trace(gobj.Scatter( + x=subdiagram[:, 0], y=y, mode="markers", + hoverinfo="text", hovertext=hovertext, name=name + )) - fig = gobj.Figure(layout=layout) - fig.update_xaxes(zeroline=True, linewidth=1, linecolor='black', - mirror=False) - fig.update_yaxes(zeroline=True, linewidth=1, linecolor='black', - mirror=False) + fig.update_layout( + width=500, + height=500, + xaxis1={ + "title": "Birth", + "side": "bottom", + "type": "linear", + "range": [min_val_display, max_val_display], + "autorange": False, + "ticks": "outside", + "showline": True, + "zeroline": True, + "linewidth": 1, + "linecolor": "black", + "mirror": False, + "showexponent": "all", + "exponentformat": "e" + }, + yaxis1={ + "title": "Death", + "side": "left", + "type": "linear", + "range": [min_val_display, max_val_display], + "autorange": False, "scaleanchor": "x", "scaleratio": 1, + "ticks": "outside", + "showline": True, + "zeroline": True, + "linewidth": 1, + "linecolor": "black", + "mirror": False, + "showexponent": "all", + "exponentformat": "e" + }, + plot_bgcolor="white" + ) - fig.add_trace(gobj.Scatter(x=np.array([-100 * max_filt_param, - 100 * max_filt_param]), - y=np.array([-100 * max_filt_param, - 100 * max_filt_param]), - mode='lines', - line=dict(dash='dash', width=1, color='black'), - showlegend=False, hoverinfo='none')) + # Add a 
horizontal dashed line for points with infinite death + if has_posinfinite_death: + fig.add_trace(gobj.Scatter( + x=[min_val_display, max_val_display], + y=[posinfinity_val, posinfinity_val], + mode="lines", + line={"dash": "dash", "width": 0.5, "color": "black"}, + showlegend=True, + name=u"\u221E", + hoverinfo="none" + )) - for dim in homology_dimensions: - name = f'H{int(dim)}' if dim != np.inf else 'Any homology dimension' - subdiagram = _subdiagrams(np.asarray([diagram]), [dim], - remove_dim=True)[0] - diff = (subdiagram[:, 1] != subdiagram[:, 0]) - subdiagram = subdiagram[diff] - fig.add_trace(gobj.Scatter(x=subdiagram[:, 0], y=subdiagram[:, 1], - mode='markers', name=name)) + # Update traces and layout according to user input + if plotly_params: + fig.update_traces(plotly_params.get("traces", None)) + fig.update_layout(plotly_params.get("layout", None)) - fig.show() + return fig diff --git a/gtda/plotting/point_clouds.py b/gtda/plotting/point_clouds.py index fef2010a2..bb7eb262e 100644 --- a/gtda/plotting/point_clouds.py +++ b/gtda/plotting/point_clouds.py @@ -4,11 +4,13 @@ import numpy as np import plotly.graph_objs as gobj +from ..utils.validation import validate_params -def plot_point_cloud(point_cloud, dimension=None): + +def plot_point_cloud(point_cloud, dimension=None, plotly_params=None): """Plot the first 2 or 3 coordinates of a point cloud. - This function will not work on 1D arrays. + Note: this function does not work on 1D arrays. Parameters ---------- @@ -20,8 +22,22 @@ def plot_point_cloud(point_cloud, dimension=None): Sets the dimension of the resulting plot. If ``None``, the dimension will be chosen between 2 and 3 depending on the shape of `point_cloud`. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"trace"`` and ``"layout"``, and the corresponding values should be + dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Figure representing a point cloud in 2D or 3D. 
+ """ # TODO: increase the marker size + validate_params({"dimension": dimension}, + {"dimension": {"type": (int, type(None)), "in": [2, 3]}}) if dimension is None: dimension = np.min((3, point_cloud.shape[1])) @@ -30,10 +46,10 @@ def plot_point_cloud(point_cloud, dimension=None): raise ValueError("Not enough dimensions available in the input point " "cloud.") - if dimension == 2: + elif dimension == 2: layout = { - "width": 800, - "height": 800, + "width": 600, + "height": 600, "xaxis1": { "title": "0th", "side": "bottom", @@ -44,7 +60,7 @@ def plot_point_cloud(point_cloud, dimension=None): "zeroline": True, "showexponent": "all", "exponentformat": "e" - }, + }, "yaxis1": { "title": "1st", "side": "left", @@ -55,25 +71,26 @@ def plot_point_cloud(point_cloud, dimension=None): "zeroline": True, "showexponent": "all", "exponentformat": "e" - }, + }, "plot_bgcolor": "white" - } + } fig = gobj.Figure(layout=layout) - fig.update_xaxes(zeroline=True, linewidth=1, linecolor='black', + fig.update_xaxes(zeroline=True, linewidth=1, linecolor="black", mirror=False) - fig.update_yaxes(zeroline=True, linewidth=1, linecolor='black', + fig.update_yaxes(zeroline=True, linewidth=1, linecolor="black", mirror=False) - fig.add_trace(gobj.Scatter(x=point_cloud[:, 0], - y=point_cloud[:, 1], - mode='markers', - marker=dict(size=4, - color=list(range( - point_cloud.shape[0])), - colorscale='Viridis', - opacity=0.8))) - fig.show() + fig.add_trace(gobj.Scatter( + x=point_cloud[:, 0], + y=point_cloud[:, 1], + mode="markers", + marker={"size": 4, + "color": list(range(point_cloud.shape[0])), + "colorscale": "Viridis", + "opacity": 0.8} + )) + elif dimension == 3: scene = { "xaxis": { @@ -81,34 +98,38 @@ def plot_point_cloud(point_cloud, dimension=None): "type": "linear", "showexponent": "all", "exponentformat": "e" - }, + }, "yaxis": { "title": "1st", "type": "linear", "showexponent": "all", "exponentformat": "e" - }, + }, "zaxis": { "title": "2nd", "type": "linear", "showexponent": "all", "exponentformat": "e" + } } - } fig = gobj.Figure() fig.update_layout(scene=scene) - fig.add_trace(gobj.Scatter3d(x=point_cloud[:, 0], - y=point_cloud[:, 1], - z=point_cloud[:, 2], - mode='markers', - marker=dict(size=4, - color=list(range( - point_cloud.shape[0])), - colorscale='Viridis', - opacity=0.8))) - - fig.show() - else: - raise ValueError("The value of the dimension is different from 2 or 3") + fig.add_trace(gobj.Scatter3d( + x=point_cloud[:, 0], + y=point_cloud[:, 1], + z=point_cloud[:, 2], + mode="markers", + marker={"size": 4, + "color": list(range(point_cloud.shape[0])), + "colorscale": "Viridis", + "opacity": 0.8} + )) + + # Update trace and layout according to user input + if plotly_params: + fig.update_traces(plotly_params.get("trace", None)) + fig.update_layout(plotly_params.get("layout", None)) + + return fig diff --git a/gtda/plotting/tests/__init__.py b/gtda/plotting/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/gtda/plotting/tests/test_diagram_representations.py b/gtda/plotting/tests/test_diagram_representations.py new file mode 100644 index 000000000..85b38fad7 --- /dev/null +++ b/gtda/plotting/tests/test_diagram_representations.py @@ -0,0 +1,78 @@ +"""Testing for plot_betti_curves and plot_betti_surfaces.""" +# License: GNU AGPLv3 + +import numpy as np +import pytest + +from gtda.plotting import plot_betti_curves, plot_betti_surfaces + +n_samples = 10 +n_homology_dimensions = 3 +n_bins = 20 + +X = np.random.randint(0, 20, n_samples * n_homology_dimensions * n_bins).\ + 
reshape(n_samples, n_homology_dimensions, n_bins) +samplings = np.vstack([ + np.linspace(0, 10, num=n_bins), + np.linspace(5, 15, num=n_bins), + np.linspace(10, 20, num=n_bins) + ]) +plotly_params_curves = {"layout": {"xaxis1": {"title": "New title"}}} +plotly_params_surfaces = { + "layout": {"scene": {"xaxis": {"title": "New title"}}} + } + + +@pytest.mark.parametrize("homology_dimensions", [None, [0], [0, 1], [0, 1, 2]]) +def test_plot_betti_curves(homology_dimensions): + fig = plot_betti_curves(X[0], samplings=samplings, + homology_dimensions=homology_dimensions, + plotly_params=plotly_params_curves) + + if homology_dimensions is None: + _homology_dimensions = list(range(X.shape[1])) + else: + _homology_dimensions = homology_dimensions + traces_xy = all([ + np.array_equal(fig.data[i].x, samplings[i]) + and np.array_equal(fig.data[i].y, X[0][i]) + for i in _homology_dimensions + ]) + assert traces_xy + + assert fig.layout.xaxis1.title.text == "New title" + + +@pytest.mark.parametrize("homology_dimensions", [None, [0], [0, 1], [0, 1, 2]]) +def test_plot_betti_surfaces(homology_dimensions): + fig = plot_betti_surfaces(X, samplings=samplings, + homology_dimensions=homology_dimensions, + plotly_params=plotly_params_surfaces) + + if homology_dimensions is None: + _homology_dimensions = list(range(X.shape[1])) + else: + _homology_dimensions = homology_dimensions + traces_xyz = all([ + np.array_equal(fig[i].data[0].x, samplings[i]) + and np.array_equal(fig[i].data[0].y, np.arange(X.shape[0])) + and np.array_equal(fig[i].data[0].z, X[:, i]) + for i in _homology_dimensions + ]) + assert traces_xyz + + assert [fig[i].layout.scene.xaxis.title.text == "New title" + for i in _homology_dimensions] + + +def test_plot_betti_surfaces_reduces_to_curves(): + fig = plot_betti_surfaces(X[[0]], samplings=samplings, + plotly_params=plotly_params_curves) + + _homology_dimensions = range(X.shape[1]) + traces_xy = all([ + np.array_equal(fig.data[i].x, samplings[i]) + and np.array_equal(fig.data[i].y, X[0][i]) + for i in _homology_dimensions + ]) + assert traces_xy diff --git a/gtda/point_clouds/__init__.py b/gtda/point_clouds/__init__.py index 9a6814e29..7fe81f718 100644 --- a/gtda/point_clouds/__init__.py +++ b/gtda/point_clouds/__init__.py @@ -8,4 +8,4 @@ __all__ = [ 'ConsistentRescaling', 'ConsecutiveRescaling', -] + ] diff --git a/gtda/point_clouds/rescaling.py b/gtda/point_clouds/rescaling.py index 7b6284f70..225327146 100644 --- a/gtda/point_clouds/rescaling.py +++ b/gtda/point_clouds/rescaling.py @@ -58,9 +58,9 @@ class ConsistentRescaling(BaseEstimator, TransformerMixin, PlotterMixin): to the "consistent rescaling" procedure. n_jobs : int or None, optional, default: ``None`` - The number of jobs to use for the computation. ``None`` means 1 - unless in a :obj:`joblib.parallel_backend` context. ``-1`` means - using all processors. + The number of jobs to use for the computation. ``None`` means 1 unless + in a :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. Attributes ---------- @@ -86,7 +86,7 @@ class ConsistentRescaling(BaseEstimator, TransformerMixin, PlotterMixin): ---------- .. [1] T. Berry and T. Sauer, "Consistent manifold representation for topological data analysis"; *Foundations of data analysis* **1**, - pp. 1--38, 2019; doi: `10.3934/fods.2019001 + pp. 1--38, 2019; `DOI: 10.3934/fods.2019001 `_. 
""" @@ -94,9 +94,9 @@ class ConsistentRescaling(BaseEstimator, TransformerMixin, PlotterMixin): _hyperparameters = { 'metric': {'type': (str, FunctionType)}, 'metric_params': {'type': (dict, type(None))}, - 'neighbor_rank': { - 'type': int, 'in': Interval(1, np.inf, closed='left')} - } + 'neighbor_rank': {'type': int, + 'in': Interval(1, np.inf, closed='left')} + } def __init__(self, metric='euclidean', metric_params=None, neighbor_rank=1, n_jobs=None): @@ -192,7 +192,7 @@ def transform(self, X, y=None): return Xt @staticmethod - def plot(Xt, sample=0, colorscale='blues'): + def plot(Xt, sample=0, colorscale='blues', plotly_params=None): """Plot a sample from a collection of distance matrices. Parameters @@ -208,8 +208,24 @@ def plot(Xt, sample=0, colorscale='blues'): Color scale to be used in the heat map. Can be anything allowed by :class:`plotly.graph_objects.Heatmap`. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"trace"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. + """ - return plot_heatmap(Xt[sample], colorscale=colorscale) + return plot_heatmap( + Xt[sample], colorscale=colorscale, + title=f"{sample}-th distance matrix after consistent rescaling", + plotly_params=plotly_params + ) @adapt_fit_transform_docs @@ -250,9 +266,9 @@ class ConsecutiveRescaling(BaseEstimator, TransformerMixin, PlotterMixin): points. n_jobs : int or None, optional, default: ``None`` - The number of jobs to use for the computation. ``None`` means 1 - unless in a :obj:`joblib.parallel_backend` context. ``-1`` means - using all processors. + The number of jobs to use for the computation. ``None`` means 1 unless + in a :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. Attributes ---------- @@ -279,9 +295,8 @@ class ConsecutiveRescaling(BaseEstimator, TransformerMixin, PlotterMixin): _hyperparameters = { 'metric': {'type': (str, FunctionType)}, 'metric_params': {'type': (dict, type(None))}, - 'factor': { - 'type': Real, 'in': Interval(0, np.inf, closed='both')} - } + 'factor': {'type': Real, 'in': Interval(0, np.inf, closed='both')} + } def __init__(self, metric='euclidean', metric_params=None, factor=0., n_jobs=None): @@ -371,7 +386,7 @@ def transform(self, X, y=None): return Xt @staticmethod - def plot(Xt, sample=0, colorscale='blues'): + def plot(Xt, sample=0, colorscale='blues', plotly_params=None): """Plot a sample from a collection of distance matrices. Parameters @@ -387,5 +402,21 @@ def plot(Xt, sample=0, colorscale='blues'): Color scale to be used in the heat map. Can be anything allowed by :class:`plotly.graph_objects.Heatmap`. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"trace"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. 
+ """ - return plot_heatmap(Xt[sample], colorscale=colorscale) + return plot_heatmap( + Xt[sample], colorscale=colorscale, + title=f"{sample}-th distance matrix after consecutive rescaling", + plotly_params=plotly_params + ) diff --git a/gtda/tests/test_common.py b/gtda/tests/test_common.py index 057e55b88..5bca2474d 100644 --- a/gtda/tests/test_common.py +++ b/gtda/tests/test_common.py @@ -9,23 +9,21 @@ # mark checks to skip SKIP_TESTS = { - 'Binarizer': [], - 'Inverter': [], -} + 'Binarizer': [], + 'Inverter': [], + } # mark tests as a known failure # TODO: these should be addressed later. # Note with scikit-learn 0.23 these can be moved to estimator tags XFAIL_TESTS = { - 'Binarizer': ["check_transformer_data_not_an_array", - "check_transformer_general", - "check_transformer_general(readonly_memmap=True)", - ], - 'Inverter': ["check_transformer_data_not_an_array", - "check_transformer_general", - "check_transformer_general(readonly_memmap=True)", - ], -} + 'Binarizer': ["check_transformer_data_not_an_array", + "check_transformer_general", + "check_transformer_general(readonly_memmap=True)", ], + 'Inverter': ["check_transformer_data_not_an_array", + "check_transformer_general", + "check_transformer_general(readonly_memmap=True)", ], + } # adapted from sklearn.utils.estimator_check v0.22 @@ -70,9 +68,8 @@ def _get_estimator_name(estimator): return estimator.__class__.__name__ -@parametrize_with_checks( - [Binarizer, Inverter] -) +@pytest.mark.filterwarnings("ignore:Input of `fit` contains") +@parametrize_with_checks([Binarizer(), Inverter()]) def test_sklearn_api(check, estimator, request): estimator_name = _get_estimator_name(estimator) check_name = _get_callable_name(check) diff --git a/gtda/tests/test_pipeline.py b/gtda/tests/test_pipeline.py index 2697c27e6..ce9e40aad 100644 --- a/gtda/tests/test_pipeline.py +++ b/gtda/tests/test_pipeline.py @@ -18,7 +18,7 @@ def split_train_test(data): n_train = int(0.7 * data.shape[0]) n_test = data.shape[0] - n_train - labeller = ts.Labeller(width=5, percentiles=[80], + labeller = ts.Labeller(size=6, percentiles=[80], n_steps_future=1) X_train = data[:n_train] y_train = X_train @@ -33,12 +33,12 @@ def split_train_test(data): def get_steps(): steps = [ - ('embedding', ts.TakensEmbedding()), - ('window', ts.SlidingWindow(width=5, stride=1)), + ('embedding', ts.SingleTakensEmbedding()), + ('window', ts.SlidingWindow(size=6, stride=1)), ('diagram', hl.VietorisRipsPersistence()), ('rescaler', diag.Scaler()), ('filter', diag.Filtering(epsilon=0.1)), - ('entropy', diag.PersistenceEntropy()), + ('entropy', diag.PersistenceEntropy(nan_fill_value=0.)), ('scaling', skprep.MinMaxScaler(copy=True)) ] return steps @@ -50,7 +50,7 @@ def get_param_grid(): diagram_param = {} classification_param = {} - window_param['width'] = [2, 3] + window_param['size'] = [3, 4] diagram_param['homology_dimensions'] = [[0, 1]] classification_param['n_estimators'] = [10, 100] diff --git a/gtda/time_series/__init__.py b/gtda/time_series/__init__.py index d12613f8f..091d6c50d 100644 --- a/gtda/time_series/__init__.py +++ b/gtda/time_series/__init__.py @@ -1,9 +1,9 @@ -"""The module :mod:`gtda.time_series` implements transformers to -preprocess time series or embed them in a higher dimensional space for -persistent homology. 
-""" +"""The module :mod:`gtda.time_series` implements transformers to preprocess +time series or embed them in a higher dimensional space for persistent +homology.""" -from .embedding import SlidingWindow, TakensEmbedding +from .embedding import SlidingWindow, takens_embedding_optimal_parameters, \ + SingleTakensEmbedding, TakensEmbedding from .features import PermutationEntropy from .preprocessing import Resampler, Stationarizer from .multivariate import PearsonDissimilarity @@ -13,8 +13,10 @@ 'Resampler', 'Stationarizer', 'PermutationEntropy', + 'takens_embedding_optimal_parameters', + 'SingleTakensEmbedding', 'TakensEmbedding', 'SlidingWindow', 'Labeller', 'PearsonDissimilarity' -] + ] diff --git a/gtda/time_series/_utils.py b/gtda/time_series/_utils.py new file mode 100644 index 000000000..2a5be9d7d --- /dev/null +++ b/gtda/time_series/_utils.py @@ -0,0 +1,92 @@ +"""Utility functions for time series processing.""" +# License: GNU AGPLv3 + +from functools import partial + +import numpy as np +from sklearn.metrics import mutual_info_score +from sklearn.neighbors import NearestNeighbors + + +def _time_delay_embedding(X, time_delay=1, dimension=2, stride=1, + flatten=False, ensure_last_value=True): + if hasattr(X, 'shape') and hasattr(X, 'ndim'): # ndarray input + n_timestamps = X.shape[-1] + n_points, offset = \ + divmod(n_timestamps - time_delay * (dimension - 1) - 1, stride) + n_points += 1 + if n_points <= 0: + raise ValueError( + f"Not enough time stamps ({n_timestamps}) to produce at least " + f"one {dimension}-dimensional vector under the current choice " + f"of time delay ({time_delay})." + ) + indices = np.tile(np.arange(0, time_delay * dimension, time_delay), + (n_points, 1)) + indices += np.arange(n_points)[:, None] * stride + if ensure_last_value: + indices += offset + + X_embedded = X[..., indices] + if flatten and (X.ndim > 2): + transpose_axes = (0, *range(1, X.ndim)[::-1], X.ndim) + X_embedded = np.transpose(X_embedded, axes=transpose_axes).\ + reshape(len(X), -1, dimension * np.prod(X.shape[1:-1])) + else: # list of ndarray input + func = partial(_time_delay_embedding, time_delay=time_delay, + dimension=dimension, stride=stride, flatten=flatten, + ensure_last_value=ensure_last_value) + X_embedded = [] + for x in X: + x_embedded = func(x[None, ...])[0] + X_embedded.append(x_embedded) + + return X_embedded + + +def _mutual_information(X, time_delay, n_bins): + """Calculate the mutual information given the time delay.""" + contingency = np.histogram2d(X[:-time_delay], X[time_delay:], + bins=n_bins)[0] + mutual_information = mutual_info_score(None, None, + contingency=contingency) + return mutual_information + + +def _false_nearest_neighbors(X, time_delay, dimension, stride=1): + """Calculate the number of false nearest neighbours in a certain + embedding dimension, based on heuristics.""" + X_embedded = _time_delay_embedding(X, time_delay=time_delay, + dimension=dimension, stride=stride) + + neighbor = \ + NearestNeighbors(n_neighbors=2, algorithm='auto').fit(X_embedded) + distances, indices = neighbor.kneighbors(X_embedded) + distance = distances[:, 1] + X_first_nbhrs = X[indices[:, 1]] + + epsilon = 2. 
* np.std(X) + tolerance = 10 + + neg_dim_delay = - dimension * time_delay + distance_slice = distance[:neg_dim_delay] + X_rolled = np.roll(X, neg_dim_delay) + X_rolled_slice = slice(len(X) - len(X_embedded), neg_dim_delay) + X_first_nbhrs_rolled = np.roll(X_first_nbhrs, neg_dim_delay) + + neighbor_abs_diff = np.abs( + X_rolled[X_rolled_slice] - X_first_nbhrs_rolled[:neg_dim_delay] + ) + + false_neighbor_ratio = np.divide( + neighbor_abs_diff, distance_slice, + out=np.zeros_like(neighbor_abs_diff, dtype=float), + where=(distance_slice != 0) + ) + false_neighbor_criteria = false_neighbor_ratio > tolerance + + limited_dataset_criteria = distance_slice < epsilon + + n_false_neighbors = \ + np.sum(false_neighbor_criteria * limited_dataset_criteria) + return n_false_neighbors diff --git a/gtda/time_series/embedding.py b/gtda/time_series/embedding.py index a909c7e71..9e4fc0591 100644 --- a/gtda/time_series/embedding.py +++ b/gtda/time_series/embedding.py @@ -3,32 +3,130 @@ import numpy as np from joblib import Parallel, delayed -from sklearn.base import BaseEstimator -from sklearn.metrics import mutual_info_score -from sklearn.neighbors import NearestNeighbors +from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.validation import check_is_fitted, check_array, column_or_1d -from ..base import TransformerResamplerMixin +from ._utils import _time_delay_embedding, _mutual_information, \ + _false_nearest_neighbors +from ..base import TransformerResamplerMixin, PlotterMixin +from ..plotting import plot_point_cloud from ..utils._docs import adapt_fit_transform_docs from ..utils.intervals import Interval -from ..utils.validation import validate_params -from..plotting import plot_point_cloud +from ..utils.validation import validate_params, check_collection + +_TAKENS_EMBEDDING_HYPERPARAMETERS = { + 'time_delay': {'type': int, 'in': Interval(1, np.inf, closed='left')}, + 'dimension': {'type': int, 'in': Interval(1, np.inf, closed='left')}, + 'stride': {'type': int, 'in': Interval(1, np.inf, closed='left')} + } + + +def takens_embedding_optimal_parameters(X, max_time_delay, max_dimension, + stride=1, n_jobs=None, validate=True): + """Compute the "optimal" parameters for a Takens (time-delay) embedding + [1]_ of a univariate time series. + + First, an optimal time delay is found by minimising the time-delayed mutual + information among values no greater than `max_time_delay`. Then, a + heuristic based on an algorithm in [2]_ is used to select an embedding + dimension which, when increased, does not reveal a large proportion of + "false nearest neighbors". + + Parameters + ---------- + X : ndarray of shape (n_samples,) or (n_samples, 1) + Input data representing a single univariate time series. + + max_time_delay : int, required + Maximum time delay between two consecutive values for constructing one + embedded point. + + max_dimension : int, required + Maximum embedding dimension that will be considered in the + optimization. + + stride : int, optional, default: ``1`` + Stride duration between two consecutive embedded points. It defaults to + 1 as this is the usual value in the statement of Takens's embedding + theorem. + + n_jobs : int or None, optional, default: ``None`` + The number of jobs to use for the computation. ``None`` means 1 unless + in a :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. + + validate : bool, optional, default: ``True`` + Whether the input and hyperparameters should be validated. 
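# --- Illustrative sketch (not part of the diff): typical use of the new helper on
# a made-up noisy signal, feeding the estimated parameters to a fixed-parameter
# SingleTakensEmbedding introduced later in this diff.
import numpy as np
from gtda.time_series import takens_embedding_optimal_parameters, SingleTakensEmbedding

signal = np.sin(np.arange(1000) / 20.) + 0.1 * np.random.random(1000)
time_delay, dimension = takens_embedding_optimal_parameters(
    signal, max_time_delay=10, max_dimension=5, stride=1, n_jobs=-1
    )
embedder = SingleTakensEmbedding(parameters_type="fixed",
                                 time_delay=time_delay, dimension=dimension)
point_cloud = embedder.fit_transform(signal)       # (n_points, dimension)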
+ + Returns + ------- + time_delay : int + The "optimal" time delay less than or equal to `max_dimension`, as + determined by minimizing the time-delayed mutual information. + + dimension : int + The "optimal" embedding dimension less than or equal to + `max_dimension`, as determined by a false nearest neighbors heuristic + once `time_delay` is computed. + + See also + -------- + SingleTakensEmbedding, TakensEmbedding, SlidingWindow + + References + ---------- + .. [1] F. Takens, "Detecting strange attractors in turbulence". In: Rand + D., Young LS. (eds) *Dynamical Systems and Turbulence, Warwick + 1980*. Lecture Notes in Mathematics, vol. 898. Springer, 1981; + `DOI: 10.1007/BFb0091924 `_. + + .. [2] M. B. Kennel, R. Brown, and H. D. I. Abarbanel, "Determining + embedding dimension for phase-space reconstruction using a + geometrical construction"; *Phys. Rev. A* **45**, pp. 3403--3411, + 1992; `DOI: 10.1103/PhysRevA.45.3403 + `_. + + """ + if validate: + _hyperparameters = _TAKENS_EMBEDDING_HYPERPARAMETERS.copy() + validate_params({'validate': validate}, {'validate': {'type': bool}}) + validate_params({'time_delay': max_time_delay, + 'dimension': max_dimension, 'stride': stride}, + _hyperparameters) + X = column_or_1d(X) + + mutual_information_list = Parallel(n_jobs=n_jobs)( + delayed(_mutual_information)(X, time_delay, n_bins=100) + for time_delay in range(1, max_time_delay + 1)) + time_delay = \ + mutual_information_list.index(min(mutual_information_list)) + 1 + + n_false_nbhrs_list = Parallel(n_jobs=n_jobs)( + delayed(_false_nearest_neighbors)(X, time_delay, dim, stride=stride) + for dim in range(1, max_dimension + 3)) + variation_list = [np.abs(n_false_nbhrs_list[dim - 1] + - 2 * n_false_nbhrs_list[dim] + + n_false_nbhrs_list[dim + 1]) + / (n_false_nbhrs_list[dim] + 1) / dim + for dim in range(2, max_dimension + 1)] + dimension = variation_list.index(min(variation_list)) + 2 + + return time_delay, dimension @adapt_fit_transform_docs class SlidingWindow(BaseEstimator, TransformerResamplerMixin): """Sliding windows onto the data. - Useful in time series analysis to convert a sequence of objects (scalar - or array-like) into a sequence of windows on the original sequence. Each + Useful in time series analysis to convert a sequence of objects (scalar or + array-like) into a sequence of windows on the original sequence. Each window stacks together consecutive objects, and consecutive windows are separated by a constant stride. Parameters ---------- - width : int, optional, default: ``10`` - Width of each sliding window. Each window contains ``width + 1`` - objects from the original time series. + size : int, optional, default: ``10`` + Size of each sliding window. stride : int, optional, default: ``1`` Stride between consecutive windows. 
@@ -41,7 +139,7 @@ class SlidingWindow(BaseEstimator, TransformerResamplerMixin): >>> # time series of scalars >>> X = np.arange(20).reshape(-1, 2) >>> y = np.arange(10) - >>> windows = SlidingWindow(width=2, stride=3) + >>> windows = SlidingWindow(size=3, stride=3) >>> # Fit and transform X >>> X_windows = windows.fit_transform(X) >>> print(X_windows) @@ -61,36 +159,43 @@ class SlidingWindow(BaseEstimator, TransformerResamplerMixin): See also -------- - TakensEmbedding + SingleTakensEmbedding, TakensEmbedding Notes ----- - The current implementation favours the last entry over the first one, - in the sense that the last entry of the last window always equals the last + The current implementation favours the last entry over the first one, in + the sense that the last entry of the last window always equals the last entry in the original time series. Hence, a number of initial entries - (depending on the remainder of the division between :math:`n_\\mathrm{ - samples} - \\mathrm{width} - 1` and the stride) may be lost. + (depending on the remainder of the division between ``n_samples - size`` + and ``stride``) may be lost. """ _hyperparameters = { - 'width': {'type': int, 'in': Interval(1, np.inf, closed='left')}, + 'size': {'type': int, 'in': Interval(1, np.inf, closed='left')}, 'stride': {'type': int, 'in': Interval(1, np.inf, closed='left')} - } + } - def __init__(self, width=10, stride=1): - self.width = width + def __init__(self, size=10, stride=1): + self.size = size self.stride = stride - def _slice_windows(self, X): + def _window_indices(self, X): n_samples = X.shape[0] - n_windows = (n_samples - self.width - 1) // self.stride + 1 - - window_slices = [(n_samples - i * self.stride - self.width - 1, - n_samples - i * self.stride) - for i in reversed(range(n_windows))] - - return window_slices + n_windows, offset = divmod(n_samples - self.size, self.stride) + n_windows += 1 + if n_windows <= 0: + raise ValueError( + f"Number of samples ({n_samples}) cannot be less than window " + f"size ({self.size})." + ) + indices = np.tile(np.arange(self.size), (n_windows, 1)) + indices += np.arange(n_windows)[:, None] * self.stride + offset + return indices + + def slice_windows(self, X): + indices = self._window_indices(X) + return indices[:, [0, -1]] + np.array([0, 1]) def fit(self, X, y=None): """Do nothing and return the estimator unchanged. @@ -130,18 +235,17 @@ def transform(self, X, y=None): Returns ------- - Xt : ndarray of shape (n_windows, n_samples_window, ...) + Xt : ndarray of shape (n_windows, size, ...) Windows of consecutive entries of the original time series. - ``n_windows = (n_samples - width - 1) // stride + 1``, and - ``n_samples_window = width + 1``. + ``n_windows = (n_samples - size) // stride + 1``. """ check_is_fitted(self, '_is_fitted') Xt = check_array(X, ensure_2d=False, allow_nd=True) - window_slices = self._slice_windows(Xt) + window_indices = self._window_indices(Xt) - Xt = np.stack([Xt[begin:end] for begin, end in window_slices]) + Xt = Xt[window_indices] return Xt def resample(self, y, X=None): @@ -155,84 +259,59 @@ def resample(self, y, X=None): Target. X : None - There is no need for input data, yet the pipeline API requires - this parameter. + There is no need for input data, yet the pipeline API requires this + parameter. Returns ------- yr : ndarray of shape (n_samples_new,) - The resampled target. ``n_samples_new = (n_samples - time_delay * - (dimension - 1) - 1) // stride + 1``. + The resampled target. ``n_samples_new = (n_samples - size) + // stride + 1``. 
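# --- Worked sketch (not part of the diff): window indices produced by the new
# `size`/`stride` parametrisation above. The old `width=5` (six objects per
# window) corresponds to `size=6`; the last window always ends at the final
# sample, so some initial samples may be dropped.
import numpy as np
from gtda.time_series import SlidingWindow

X = np.arange(11)
sw = SlidingWindow(size=4, stride=3)
print(sw.fit_transform(X))
# [[ 1  2  3  4]
#  [ 4  5  6  7]
#  [ 7  8  9 10]]       <- sample 0 is dropped, sample 10 is kept
print(sw.slice_windows(X))
# [[ 1  5]
#  [ 4  8]
#  [ 7 11]]             <- [start, stop) slices into the original series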
""" check_is_fitted(self, '_is_fitted') yr = column_or_1d(y) - yr = np.flip(yr) - yr = np.flip(yr[:-self.width:self.stride]) + yr = yr[:self.size - 2:-self.stride][::-1] return yr - @staticmethod - def plot(Xt, sample=0): - """Plot a sample from a collection of sliding windows, as a point - cloud in 2D or 3D. If points in the window have more than three - dimensions, only the first three are plotted. - - Important: when using on the result `Xt` of calling :meth:`transform` - on ``X``, ensure that each sample in ``X`` is a point in - ``n_dimensions``-dimensional space with ``n_dimensions > 1``. - - Parameters - ---------- - Xt : ndarray of shape (n_samples, n_points, n_dimensions) - Collection of sliding windows, each containing ``n_points`` - points in ``n_dimensions``-dimensional space, such as returned by - :meth:`transform`. - - sample : int, optional, default: ``0`` - Index of the sample in `Xt` to be plotted. - - """ - return plot_point_cloud(Xt[sample]) - @adapt_fit_transform_docs -class TakensEmbedding(BaseEstimator, TransformerResamplerMixin): - """Representation of a univariate time series as a time series of - point clouds. - - Based on a time-delay embedding technique named after F. Takens [1]_. - Given a discrete time series :math:`(X_0, X_1, \\ldots)` and a sequence - of evenly sampled times :math:`t_0, t_1, \\ldots`, one extracts a set - of :math:`d`-dimensional vectors of the form :math:`(X_{t_i}, X_{t_i + - \\tau}, \\ldots , X_{t_i + (d-1)\\tau})` for :math:`i = 0, 1, \\ldots`. - This set is called the :ref:`Takens embedding ` - of the time series and can be interpreted as a point cloud. +class SingleTakensEmbedding(BaseEstimator, TransformerResamplerMixin): + """Representation of a single univariate time series as a point cloud. + + Based on a time-delay embedding technique named after F. Takens [1]_ [2]_. + Given a discrete time series :math:`(X_0, X_1, \\ldots)` and a sequence of + evenly sampled times :math:`t_0, t_1, \\ldots`, one extracts a set of + :math:`d`-dimensional vectors of the form :math:`(X_{t_i}, X_{t_i + \\tau}, + \\ldots , X_{t_i + (d-1)\\tau})` for :math:`i = 0, 1, \\ldots`. This set is + called the :ref:`Takens embedding ` of the time series + and can be interpreted as a point cloud. The difference between :math:`t_{i+1}` and :math:`t_i` is called the - stride, :math:`\\tau` is called the time delay, and :math:`d` is called - the (embedding) dimension. + stride, :math:`\\tau` is called the time delay, and :math:`d` is called the + (embedding) dimension. + + If :math:`d` and :math:`\\tau` are not explicitly set, suitable values are + searched for during :meth:`fit` [3]_ [4]_. - If :math:`d` and :math:`\\tau` are not explicitly set, suitable values - are searched for during :meth:`fit`. [2]_ [3]_ + To compute time-delay embeddings of several time series simultaneously, use + :class:`TakensEmbedding` instead. Parameters ---------- parameters_type : ``'search'`` | ``'fixed'``, optional, default: \ ``'search'`` - If set to ``'fixed'``, the values of `time_delay` and `dimension` - are used directly in :meth:`transform`. If set to ``'search'``, - those values are only used as upper bounds in a search as follows: - first, an optimal time delay is found by minimising the time delayed - mutual information; then, a heuristic based on an algorithm in [2]_ is - used to select an embedding dimension which, when increased, does not - reveal a large proportion of "false nearest neighbors". 
+ If set to ``'fixed'``, the values of `time_delay` and `dimension` are + used directly in :meth:`transform`. If set to ``'search'``, + :func:`takens_embedding_optimal_parameter` is run in :meth:`fit` to + estimate optimal values for these quantities and store them as + :attr:`time_delay_` and :attr:`dimension_`. time_delay : int, optional, default: ``1`` - Time delay between two consecutive values for constructing one - embedded point. If `parameters_type` is ``'search'``, - it corresponds to the maximal embedding time delay that will be - considered. + Time delay between two consecutive values for constructing one embedded + point. If `parameters_type` is ``'search'``, it corresponds to the + maximum time delay that will be considered. dimension : int, optional, default: ``5`` Dimension of the embedding space. If `parameters_type` is ``'search'``, @@ -240,8 +319,8 @@ class TakensEmbedding(BaseEstimator, TransformerResamplerMixin): considered. stride : int, optional, default: ``1`` - Stride duration between two consecutive embedded points. It defaults - to 1 as this is the usual value in the statement of Takens's embedding + Stride duration between two consecutive embedded points. It defaults to + 1 as this is the usual value in the statement of Takens's embedding theorem. n_jobs : int or None, optional, default: ``None`` @@ -252,84 +331,82 @@ class TakensEmbedding(BaseEstimator, TransformerResamplerMixin): Attributes ---------- time_delay_ : int - Actual embedding time delay used to embed. If - `parameters_type` is ``'search'``, it is the calculated optimal - embedding time delay and is less than or equal to `time_delay`. - Otherwise it is equal to `time_delay`. + Actual time delay used to embed. If + `parameters_type` is ``'search'``, it is the calculated optimal time + delay and is less than or equal to `time_delay`. Otherwise it is equal + to `time_delay`. dimension_ : int Actual embedding dimension used to embed. If `parameters_type` is - ``'search'``, it is the calculated optimal embedding dimension and - is less than or equal to `dimension`. Otherwise it is equal to + ``'search'``, it is the calculated optimal embedding dimension and is + less than or equal to `dimension`. Otherwise it is equal to `dimension`. Examples -------- >>> import numpy as np - >>> from gtda.time_series import TakensEmbedding + >>> from gtda.time_series import SingleTakensEmbedding >>> # Create a noisy signal + >>> rng = np.random.default_rng() >>> n_samples = 10000 - >>> signal_noise = np.asarray([np.sin(x / 50) + 0.5 * np.random.random() - ... for x in range(n_samples)]) + >>> signal = np.asarray([np.sin(x / 50) + 0.5 * rng.random() + ... for x in range(n_samples)]) >>> # Set up the transformer - >>> embedder = TakensEmbedding(parameters_type='search', dimension=5, - ... time_delay=5, n_jobs=-1) + >>> STE = SingleTakensEmbedding(parameters_type='search', dimension=5, + ... time_delay=5, n_jobs=-1) >>> # Fit and transform - >>> embedded_noise = embedder.fit_transform(signal_noise) - >>> print('Optimal embedding time delay based on mutual information:', - ... embedder.time_delay_) - Optimal embedding time delay based on mutual information: 5 + >>> signal_embedded = STE.fit_transform(signal) + >>> print('Optimal time delay based on mutual information:', + ... STE.time_delay_) + Optimal time delay based on mutual information: 5 >>> print('Optimal embedding dimension based on false nearest neighbors:', - ... embedder.dimension_) + ... 
STE.dimension_) Optimal embedding dimension based on false nearest neighbors: 2 - >>> print(embedded_noise.shape) + >>> print(signal_embedded.shape) (9995, 2) See also -------- - SlidingWindow, gtda.homology.VietorisRipsPersistence + TakensEmbedding, SlidingWindow, takens_embedding_optimal_parameters Notes ----- - The current implementation favours the last value over the first one, - in the sense that the last coordinate of the last vector in a Takens - embedded time series always equals the last value in the original time - series. Hence, a number of initial values (depending on the remainder of - the division between :math:`n_\\mathrm{samples} - d(\\tau - 1) - 1` and - the stride) may be lost. + The current implementation favours the last value over the first one, in + the sense that the last coordinate of the last vector in a Takens embedded + time series always equals the last value in the original time series. + Hence, a number of initial values (depending on the remainder of the + division between ``n_samples - dimension * (time_delay - 1) - 1`` and the + stride) may be lost. References ---------- .. [1] F. Takens, "Detecting strange attractors in turbulence". In: Rand D., Young LS. (eds) *Dynamical Systems and Turbulence, Warwick 1980*. Lecture Notes in Mathematics, vol. 898. Springer, 1981; - doi: `10.1007/BFb0091924 `_. + `DOI: 10.1007/BFb0091924 `_. - .. [2] M. B. Kennel, R. Brown, and H. D. I. Abarbanel, "Determining + .. [2] J. A. Perea and J. Harer, "Sliding Windows and Persistence: An \ + Application of Topological Methods to Signal Analysis"; \ + *Foundations of Computational Mathematics*, **15**, \ + pp. 799--838; `DOI: 10.1007/s10208-014-9206-z + `_. + + .. [3] M. B. Kennel, R. Brown, and H. D. I. Abarbanel, "Determining embedding dimension for phase-space reconstruction using a geometrical construction"; *Phys. Rev. A* **45**, pp. 3403--3411, - 1992; doi: `10.1103/PhysRevA.45.3403 + 1992; `DOI: 10.1103/PhysRevA.45.3403 `_. - .. [3] N. Sanderson, "Topological Data Analysis of Time Series using + .. [4] N. Sanderson, "Topological Data Analysis of Time Series using Witness Complexes"; PhD thesis, University of Colorado at Boulder, 2018; `https://scholar.colorado.edu/math_gradetds/67 `_. - [4] J. A. Perea and J. Harer, "Sliding Windows and Persistence: An \ - Application of Topological Methods to Signal Analysis"; \ - *Foundations of Computational Mathematics*, **15**, \ - pp. 799--838; `doi:10.1007/s10208-014-9206-z \ - `_. 
- """ - _hyperparameters = { - 'parameters_type': {'type': str, 'in': ['fixed', 'search']}, - 'time_delay': {'type': int, 'in': Interval(1, np.inf, closed='left')}, - 'dimension': {'type': int, 'in': Interval(1, np.inf, closed='left')}, - 'stride': {'type': int, 'in': Interval(1, np.inf, closed='left')} - } + _hyperparameters = _TAKENS_EMBEDDING_HYPERPARAMETERS.copy() + _hyperparameters['parameters_type'] = \ + {'type': str, 'in': ['fixed', 'search']} def __init__(self, parameters_type='search', time_delay=1, dimension=5, stride=1, n_jobs=None): @@ -339,64 +416,6 @@ def __init__(self, parameters_type='search', time_delay=1, dimension=5, self.stride = stride self.n_jobs = n_jobs - @staticmethod - def _embed(X, time_delay, dimension, stride): - n_points = (X.shape[0] - time_delay * (dimension - 1) - 1)\ - // stride + 1 - - X = np.flip(X) - points_ = [X[j * stride:j * stride + time_delay * dimension:time_delay] - .flatten() for j in range(n_points)] - X_embedded = np.stack(points_) - - return np.flip(X_embedded).reshape(n_points, dimension) - - @staticmethod - def _mutual_information(X, time_delay, n_bins): - """Calculate the mutual information given the time delay.""" - contingency = np.histogram2d(X.reshape((-1,))[:-time_delay], - X.reshape((-1,))[time_delay:], - bins=n_bins)[0] - mutual_information = mutual_info_score(None, None, - contingency=contingency) - return mutual_information - - @staticmethod - def _false_nearest_neighbors(X, time_delay, dimension, stride=1): - """Calculate the number of false nearest neighbours in a certain - embedding dimension, based on heuristics.""" - X_embedded = TakensEmbedding._embed(X, time_delay, dimension, stride) - - neighbor = NearestNeighbors(n_neighbors=2, algorithm='auto').fit( - X_embedded) - distances, indices = neighbor.kneighbors(X_embedded) - distance = distances[:, 1] - X_first_nbhrs = X[indices[:, 1]] - - epsilon = 2. * np.std(X) - tolerance = 10 - - neg_dim_delay = - dimension * time_delay - distance_slice = distance[:neg_dim_delay] - X_rolled = np.roll(X, neg_dim_delay) - X_rolled_slice = slice(X.shape[0] - X_embedded.shape[0], neg_dim_delay) - X_first_nbhrs_rolled = np.roll(X_first_nbhrs, neg_dim_delay) - - neighbor_abs_diff = np.abs( - X_rolled[X_rolled_slice] - X_first_nbhrs_rolled[:neg_dim_delay]).\ - flatten() - - false_neighbor_ratio = np.divide( - neighbor_abs_diff, distance_slice, - out=np.zeros_like(neighbor_abs_diff), where=(distance_slice != 0)) - false_neighbor_criteria = false_neighbor_ratio > tolerance - - limited_dataset_criteria = distance_slice < epsilon - - n_false_neighbors = np.sum( - false_neighbor_criteria * limited_dataset_criteria) - return n_false_neighbors - def fit(self, X, y=None): """If necessary, compute the optimal time delay and embedding dimension. Then, return the estimator. @@ -410,39 +429,24 @@ def fit(self, X, y=None): Input data. y : None - There is no need for a target in a transformer, yet the pipeline - API requires this parameter. + There is no need for a target, yet the pipeline API requires this + parameter. 
Returns ------- self : object """ - X = check_array(X, ensure_2d=False) + X = column_or_1d(X) validate_params( self.get_params(), self._hyperparameters, exclude=['n_jobs']) - if X.ndim == 1: - X = X[:, None] - if self.parameters_type == 'search': - mutual_information_list = Parallel(n_jobs=self.n_jobs)( - delayed(self._mutual_information)(X, time_delay, n_bins=100) - for time_delay in range(1, self.time_delay + 1)) - self.time_delay_ = mutual_information_list.index( - min(mutual_information_list)) + 1 - - n_false_nbhrs_list = Parallel(n_jobs=self.n_jobs)( - delayed(self._false_nearest_neighbors)( - X, self.time_delay_, dim, stride=self.stride) - for dim in range(1, self.dimension + 3)) - variation_list = [np.abs(n_false_nbhrs_list[dim - 1] - - 2 * n_false_nbhrs_list[dim] + - n_false_nbhrs_list[dim + 1]) - / (n_false_nbhrs_list[dim] + 1) / dim - for dim in range(2, self.dimension + 1)] - self.dimension_ = variation_list.index(min(variation_list)) + 2 - + self.time_delay_, self.dimension_ = \ + takens_embedding_optimal_parameters( + X, self.time_delay, self.dimension, stride=self.stride, + n_jobs=self.n_jobs, validate=False + ) else: self.time_delay_ = self.time_delay self.dimension_ = self.dimension @@ -469,11 +473,12 @@ def transform(self, X, y=None): """ check_is_fitted(self) - Xt = check_array(X, ensure_2d=False) + Xt = column_or_1d(X).copy() - if Xt.ndim == 1: - Xt = Xt[:, None] - Xt = self._embed(Xt, self.time_delay_, self.dimension_, self.stride) + Xt = _time_delay_embedding( + Xt, time_delay=self.time_delay_, dimension=self.dimension_, + stride=self.stride + ) return Xt @@ -488,8 +493,8 @@ def resample(self, y, X=None): Target. X : None - There is no need for input data, yet the pipeline API requires - this parameter. + There is no need for input data, yet the pipeline API requires this + parameter. Returns ------- @@ -501,7 +506,249 @@ def resample(self, y, X=None): check_is_fitted(self) yr = column_or_1d(y) - yr = np.flip(yr) - final_index = -self.time_delay_ * (self.dimension_ - 1) - yr = np.flip(yr[:final_index:self.stride]) + final_index = self.time_delay_ * (self.dimension_ - 1) + yr = yr[:final_index - 1:-self.stride][::-1] return yr + + +@adapt_fit_transform_docs +class TakensEmbedding(BaseEstimator, TransformerMixin, PlotterMixin): + """Point clouds from collections of time series via independent Takens + embeddings. + + This transformer takes collections of (possibly multivariate) time series + as input, applies the Takens embedding algorithm described in + :class:`SingleTakensEmbedding` to each independently, and returns a + corresponding collection of point clouds in Euclidean space (or possibly + higher-dimensional structures, see `flatten`). + + Parameters + ---------- + time_delay : int, optional, default: ``1`` + Time delay between two consecutive values for constructing one embedded + point. + + dimension : int, optional, default: ``2`` + Dimension of the embedding space (per variable, in the multivariate + case). + + stride : int, optional, default: ``1`` + Stride duration between two consecutive embedded points. + + flatten : bool, optional, default: ``True`` + Only relevant when the input of :meth:`transform` represents a + collection of multivariate or tensor-valued time series. If ``True``, + ensures that the output is a 3D ndarray or list of 2D arrays. If + ``False``, each entry of the input collection leads to an array of + dimension one higher than the entry's dimension. See Examples. 
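# --- Worked sketch (not part of the diff): the index arithmetic behind
# `_time_delay_embedding` (added in gtda/time_series/_utils.py earlier in this
# diff), which both SingleTakensEmbedding and TakensEmbedding rely on. It also
# shows where the ``n_points`` formula quoted in the docstrings comes from.
import numpy as np

n_timestamps, time_delay, dimension, stride = 10, 2, 3, 1
n_points, offset = divmod(n_timestamps - time_delay * (dimension - 1) - 1, stride)
n_points += 1                                      # (10 - 2*2 - 1) // 1 + 1 = 6 vectors
indices = np.tile(np.arange(0, time_delay * dimension, time_delay), (n_points, 1))
indices += np.arange(n_points)[:, None] * stride
indices += offset                                  # ensure_last_value=True anchors at the end
print(indices[0], indices[-1])                     # [0 2 4] [5 7 9] -> ends at the last stamp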
+ + ensure_last_value : bool, optional, default: ``True`` + Whether the value(s) representing the last measurement(s) must be + be present in the output as the last coordinate(s) of the last + embedding vector(s). If ``False``, the first measurement(s) is (are) + present as the 0-th coordinate(s) of the 0-th vector(s) instead. + + Examples + -------- + >>> import numpy as np + >>> from gtda.time_series import TakensEmbedding + + Two univariate time series of duration 4: + + >>> X = np.arange(8).reshape(2, 4) + >>> print(X) + [[0 1 2 3] + [4 5 6 7]] + >>> TE = TakensEmbedding(time_delay=1, dimension=2) + >>> print(TE.fit_transform(X)) + [[[0 1] + [1 2] + [2 3]] + [[5 6] + [6 7] + [7 8]]] + + Two multivariate time series of duration 4, with 2 variables: + + >>> x = np.arange(8).reshape(2, 1, 4) + >>> X = np.concatenate([x, -x], axis=1) + >>> print(X) + [[[ 0 1 2 3] + [ 0 -1 -2 -3]] + [[ 4 5 6 7] + [-4 -5 -6 -7]]] + + Pass `flatten` as ``True`` (default): + + >>> TE = TakensEmbedding(time_delay=1, dimension=2, flatten=True) + >>> print(TE.fit_transform(X)) + [[[ 0 1 0 -1] + [ 1 2 -1 -2] + [ 2 3 -2 -3]] + [[ 4 5 -4 -5] + [ 5 6 -5 -6] + [ 6 7 -6 -7]]] + + Pass `flatten` as ``False``: + + >>> TE = TakensEmbedding(time_delay=1, dimension=2, flatten=False) + >>> print(TE.fit_transform(X)) + [[[[ 0 1] + [ 1 2] + [ 2 3]] + [[ 0 -1] + [-1 -2] + [-2 -3]]] + [[[ 4 5] + [ 5 6] + [ 6 7]] + [[-4 -5] + [-5 -6] + [-6 -7]]]] + + See also + -------- + SingleTakensEmbedding, SlidingWindow, takens_embedding_optimal_parameters + + Notes + ----- + To compute the Takens embedding of a single univariate time series in the + form of a 1D array or column vector, use :class:`SingleTakensEmbedding` + instead. + + Unlike :class:`SingleTakensEmbedding`, this transformer does not include + heuristics to optimize the choice of time delay and embedding dimension. + The function :func:`takens_embedding_optimal_parameters` is specifically + dedicated to this task, but only on a single univariate time series. + + If dealing with a forecasting problem on a single time series, this + transformer can be used after an instance of :class:`SlidingWindow` and + before an instance of a homology transformer, to produce topological + features from sliding windows over the time series. + + """ + + _hyperparameters = _TAKENS_EMBEDDING_HYPERPARAMETERS.copy() + _hyperparameters.update({'flatten': {'type': bool}, + 'ensure_last_value': {'type': bool}}) + + def __init__(self, time_delay=1, dimension=2, stride=1, flatten=True, + ensure_last_value=True): + self.time_delay = time_delay + self.dimension = dimension + self.stride = stride + self.flatten = flatten + self.ensure_last_value = ensure_last_value + + def fit(self, X, y=None): + """Do nothing and return the estimator unchanged. + + This method is here to implement the usual scikit-learn API and hence + work in pipelines. + + Parameters + ---------- + X : ndarray or list of length n_samples + Input collection of time series. A 2D array or list of 1D arrays is + interpreted as a collection of univariate time series. A 3D array + or list of 2D arrays is interpreted as a collection of multivariate + time series, each with shape ``(n_variables, n_timestamps)``. More + generally, :math`N`-dimensional arrays or lists of + (:math`N-1`)-dimensional arrays (:math:`N \\geq 3`) are interpreted + as collections of tensor-valued time series, each with time indexed + by the last axis. + + y : None + There is no need for a target, yet the pipeline API requires this + parameter. 
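# --- Worked sketch (not part of the diff): effect of `ensure_last_value`, which
# the doctest examples above do not cover. The input is one made-up univariate
# series of five values; with stride=2 one time stamp must be dropped at one end.
import numpy as np
from gtda.time_series import TakensEmbedding

X = np.arange(5).reshape(1, 5)                     # [[0 1 2 3 4]]
print(TakensEmbedding(time_delay=1, dimension=2, stride=2).fit_transform(X))
# [[[1 2]
#   [3 4]]]    <- the last value, 4, is kept (default)
print(TakensEmbedding(time_delay=1, dimension=2, stride=2,
                      ensure_last_value=False).fit_transform(X))
# [[[0 1]
#   [2 3]]]    <- the first value, 0, is kept instead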
+ + Returns + ------- + self : object + + """ + check_collection(X, copy=False) + validate_params(self.get_params(), self._hyperparameters) + self._is_fitted = True + + return self + + def transform(self, X, y=None): + """Compute the Takens embedding of each entry in `X`. + + Parameters + ---------- + X : ndarray or list of length n_samples + Input collection of time series. A 2D array or list of 1D arrays is + interpreted as a collection of univariate time series. A 3D array + or list of 2D arrays is interpreted as a collection of multivariate + time series, each with shape ``(n_variables, n_timestamps)``. More + generally, :math`N`-dimensional arrays or lists of + (:math`N-1`)-dimensional arrays (:math:`N \\geq 3`) are interpreted + as collections of tensor-valued time series, each with time indexed + by the last axis. + + y : None + Ignored. + + Returns + ------- + Xt : ndarray or list of length n_samples + The result of performing a Takens embedding of each entry in `X` + with the given parameters. If `X` is a 2D array or a list of 1D + arrays, `Xt` is a 3D array or a list of 2D arrays (respectively), + each entry of which has shape ``(n_points, dimension)`` where + ``n_points = (n_timestamps - time_delay * (dimension - 1) - 1) // \ + stride + 1``. If `X` is an :math`N`-dimensional array or a list of + (:math`N-1`)-dimensional arrays (:math:`N \\geq 3`), the output + shapes depend on the `flatten` parameter: + + - if `flatten` is ``True``, `Xt` is still a 3D array or a + list of 2D arrays (respectively), each entry of which has + shape ``(n_points, dimension * n_variables)`` where + ``n_points`` is as above and ``n_variables`` is the product + of the sizes of all axes in said entry except the last. + - if `flatten` is ``False``, `Xt` is an + (:math`N+1`)-dimensional array or list of + :math`N`-dimensional arrays. + + """ + check_is_fitted(self, '_is_fitted') + Xt = check_collection(X, copy=True) + + Xt = _time_delay_embedding( + Xt, time_delay=self.time_delay, dimension=self.dimension, + stride=self.stride, flatten=self.flatten, + ensure_last_value=self.ensure_last_value + ) + + return Xt + + @staticmethod + def plot(Xt, sample=0, plotly_params=None): + """Plot a sample from a collection of Takens embeddings of time series, + as a point cloud in 2D or 3D. If points in the window have more than + three dimensions, only the first three are plotted. + + Parameters + ---------- + Xt : ndarray or list of length n_samples + Collection of point clouds, such as returned by :meth:`transform`. + + sample : int, optional, default: ``0`` + Index of the sample in `Xt` to be plotted. + + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"trace"`` and ``"layout"``, and the corresponding values should + be dictionaries containing keyword arguments as would be fed to the + :meth:`update_traces` and :meth:`update_layout` methods of + :class:`plotly.graph_objects.Figure`. + + Returns + ------- + fig : :class:`plotly.graph_objects.Figure` object + Plotly figure. 
+ + """ + return plot_point_cloud(Xt[sample], plotly_params=plotly_params) diff --git a/gtda/time_series/features.py b/gtda/time_series/features.py index 4274f0346..816b1b7f0 100644 --- a/gtda/time_series/features.py +++ b/gtda/time_series/features.py @@ -3,6 +3,7 @@ import numpy as np from joblib import Parallel, delayed, effective_n_jobs +from scipy.stats import entropy from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils import gen_even_slices from sklearn.utils.validation import check_is_fitted, check_array @@ -16,8 +17,8 @@ class PermutationEntropy(BaseEstimator, TransformerMixin): Given a two-dimensional array `A`, another array `A'` of the same size is computed by arg-sorting each row in `A`. The permutation entropy [1]_ of - `A` is the Shannon entropy of the probability distribution given by - the relative frequencies of each arg-sorting permutation in `A'`. + `A` is the (base 2) Shannon entropy of the probability distribution given + by the relative frequencies of each arg-sorting permutation in `A'`. Parameters ---------- @@ -28,13 +29,14 @@ class PermutationEntropy(BaseEstimator, TransformerMixin): See also -------- - TakensEmbedding, gtda.diagrams.PersistenceEntropy + SlidingWindow, TakensEmbedding, \ + SingleTakensEmbedding, gtda.diagrams.PersistenceEntropy References ---------- .. [1] C. Bandt and B. Pompe, "Permutation Entropy: A Natural Complexity Measure for Time Series"; *Phys. Rev. Lett.*, **88**.17, 2002; - `doi: 10.1103/physrevlett.88.174102 + `DOI: 10.1103/physrevlett.88.174102 `_. """ @@ -42,15 +44,17 @@ class PermutationEntropy(BaseEstimator, TransformerMixin): def __init__(self, n_jobs=None): self.n_jobs = n_jobs - def _entropy(self, X): - Xo = np.unique(X, axis=0, return_counts=True)[1].reshape(-1, 1) - Xo = Xo / np.sum(Xo, axis=0).reshape(-1, 1) - return -np.sum(np.nan_to_num(Xo * np.log2(Xo)), axis=0).reshape(-1, 1) + @staticmethod + def _entropy_2d(x): + unique_row_counts = np.unique(x, axis=0, return_counts=True)[1] + return entropy(unique_row_counts, base=2) def _permutation_entropy(self, X): - Xo = np.argsort(X, axis=2) - Xo = np.stack([self._entropy(x) for x in Xo]) - return Xo.reshape(-1, 1) + X_permutations = np.argsort(X, axis=2) + X_permutation_entropy = np.asarray( + [self._entropy_2d(x) for x in X_permutations] + )[:, None] + return X_permutation_entropy def fit(self, X, y=None): """Do nothing and return the estimator unchanged. diff --git a/gtda/time_series/multivariate.py b/gtda/time_series/multivariate.py index 932933cc8..f4e727de6 100644 --- a/gtda/time_series/multivariate.py +++ b/gtda/time_series/multivariate.py @@ -13,15 +13,15 @@ class PearsonDissimilarity(BaseEstimator, TransformerMixin): """Pearson dissimilarities from collections of multivariate time series. - The sample Pearson correlation coefficients between pairs of - components of an :math:`N`-variate time series form an :math:`N - \\times N` matrix :math:`R` with entries + The sample Pearson correlation coefficients between pairs of components of + an :math:`N`-variate time series form an :math:`N \\times N` matrix + :math:`R` with entries .. math:: R_{ij} = \\frac{ C_{ij} }{ \\sqrt{ C_{ii} C_{jj} } }, where :math:`C` is the covariance matrix. Setting :math:`D_{ij} = - (1 - R_{ij})/2` or :math:`D_{ij} = 1 - |R_{ij}|` we obtain a - dissimilarity matrix with entries between 0 and 1. + (1 - R_{ij})/2` or :math:`D_{ij} = 1 - |R_{ij}|` we obtain a dissimilarity + matrix with entries between 0 and 1. 
This transformer computes one dissimilarity matrix per multivariate time series in a collection. Examples of such collections are the outputs of @@ -30,14 +30,14 @@ class PearsonDissimilarity(BaseEstimator, TransformerMixin): Parameters ---------- absolute_value : bool, default: ``False`` - Whether absolute values of the Pearson correlation coefficients - should be taken. Doing so makes pairs of strongly anti-correlated - variables as similar as pairs of strongly correlated ones. + Whether absolute values of the Pearson correlation coefficients should + be taken. Doing so makes pairs of strongly anti-correlated variables as + similar as pairs of strongly correlated ones. n_jobs : int or None, optional, default: ``None`` - The number of jobs to use for the computation. ``None`` means 1 - unless in a :obj:`joblib.parallel_backend` context. ``-1`` means - using all processors. + The number of jobs to use for the computation. ``None`` means 1 unless + in a :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. See also -------- @@ -45,8 +45,7 @@ class PearsonDissimilarity(BaseEstimator, TransformerMixin): """ - _hyperparameters = { - 'absolute_value': {'type': bool}} + _hyperparameters = {'absolute_value': {'type': bool}} def __init__(self, absolute_value=False, n_jobs=None): self.absolute_value = absolute_value diff --git a/gtda/time_series/preprocessing.py b/gtda/time_series/preprocessing.py index c5479a166..9851a89a6 100644 --- a/gtda/time_series/preprocessing.py +++ b/gtda/time_series/preprocessing.py @@ -39,7 +39,8 @@ class Resampler(BaseEstimator, TransformerResamplerMixin): """ _hyperparameters = { - 'period': {'type': int, 'in': Interval(1, np.inf, closed='left')}} + 'period': {'type': int, 'in': Interval(1, np.inf, closed='left')} + } def __init__(self, period=2): self.period = period @@ -78,8 +79,8 @@ def transform(self, X, y=None): Input data. y : None - There is no need for a target, yet the pipeline API - requires this parameter. + There is no need for a target, yet the pipeline API requires this + parameter. Returns ------- @@ -105,8 +106,8 @@ def resample(self, y, X=None): Target. X : None - There is no need for input data, - yet the pipeline API requires this parameter. + There is no need for input data, yet the pipeline API requires this + parameter. Returns ------- @@ -160,7 +161,8 @@ class Stationarizer(BaseEstimator, TransformerResamplerMixin): """ _hyperparameters = { - 'operation': {'type': str, 'in': ['return', 'log-return']}} + 'operation': {'type': str, 'in': ['return', 'log-return']} + } def __init__(self, operation='return'): self.operation = operation @@ -199,8 +201,8 @@ def transform(self, X, y=None): Input data. y : None - There is no need for a target, yet the pipeline API - requires this parameter. + There is no need for a target, yet the pipeline API requires this + parameter. Returns ------- @@ -228,8 +230,8 @@ def resample(self, y, X=None): Target. X : None - There is no need for input data, - yet the pipeline API requires this parameter. + There is no need for input data, yet the pipeline API requires this + parameter. Returns ------- diff --git a/gtda/time_series/target.py b/gtda/time_series/target.py index 37687c3f0..6491c7d6d 100644 --- a/gtda/time_series/target.py +++ b/gtda/time_series/target.py @@ -17,17 +17,16 @@ @adapt_fit_transform_docs class Labeller(BaseEstimator, TransformerResamplerMixin): - """Target creation from sliding windows over a time series. + """Target creation from sliding windows over a univariate time series. 
Useful to define a time series forecasting task in which labels are - obtained from future values of the input time series, via the - application of a function to time windows. + obtained from future values of the input time series, via the application + of a function to time windows. Parameters ---------- - width : int, optional, default: ``10`` - Width of each sliding window. Each window contains ``width + 1`` - objects from the original time series. + size : int, optional, default: ``10`` + Size of each sliding window. func : callable, optional, default: ``numpy.std`` Function to be applied to each window. @@ -37,8 +36,8 @@ class Labeller(BaseEstimator, TransformerResamplerMixin): percentiles : list of real numbers between 0 and 100 inclusive, or \ None, optional, default: ``None`` - If ``None``, creates a target for a regression task. Otherwise, - creates a target for an n-class classification task where + If ``None``, creates a target for a regression task. Otherwise, creates + a target for an n-class classification task where ``n = len(percentiles) + 1``. n_steps_future : int, optional, default: ``1`` @@ -55,39 +54,32 @@ class Labeller(BaseEstimator, TransformerResamplerMixin): >>> import numpy as np >>> from gtda.time_series import Labeller >>> # Create a time series - >>> X = np.arange(10).reshape(-1, 1) - >>> labeller = Labeller(width=2) + >>> X = np.arange(10) + >>> labeller = Labeller(size=3, func=np.min) >>> # Fit and transform X >>> X, y = labeller.fit_transform_resample(X, X) >>> print(X) - [[1] - [2] - [3] - [4] - [5] - [6] - [7] - [8]] + [1 2 3 4 5 6 7 8] >>> print(y) - [0.81649658 0.81649658 0.81649658 0.81649658 0.81649658 0.81649658 - 0.81649658 0.81649658] + [0 1 2 3 4 5 6 7] """ _hyperparameters = { - 'width': {'type': int, 'in': Interval(1, np.inf, closed='left')}, + 'size': {'type': int, 'in': Interval(1, np.inf, closed='left')}, 'func': {'type': FunctionType}, 'func_params': {'type': (dict, type(None))}, 'percentiles': { 'type': (list, type(None)), - 'of': {'type': Real, 'in': Interval(0, 100, closed='both')}}, - 'n_steps_future': { - 'type': int, 'in': Interval(1, np.inf, closed='left')} - } + 'of': {'type': Real, 'in': Interval(0, 100, closed='both')} + }, + 'n_steps_future': {'type': int, + 'in': Interval(1, np.inf, closed='left')} + } - def __init__(self, width=10, func=np.std, + def __init__(self, size=10, func=np.std, func_params=None, percentiles=None, n_steps_future=1): - self.width = width + self.size = size self.func = func self.func_params = func_params self.percentiles = percentiles @@ -99,11 +91,11 @@ def fit(self, X, y=None): Parameters ---------- X : ndarray of shape (n_samples,) or (n_samples, 1) - Time series to build a target for. + Univariate time series to build a target for. y : None - There is no need for a target in a transformer, yet the pipeline - API requires this parameter. + There is no need for a target, yet the pipeline API requires this + parameter. 
Returns ------- @@ -113,14 +105,13 @@ def fit(self, X, y=None): X = column_or_1d(X) validate_params(self.get_params(), self._hyperparameters) - self._sliding_window = SlidingWindow(width=self.width, stride=1).fit(X) + self._sliding_window = SlidingWindow(size=self.size, stride=1).fit(X) _X = self._sliding_window.transform(X) if self.func_params is None: self._effective_func_params = {} else: self._effective_func_params = self.func_params - _X = self.func( - _X, axis=1, **self._effective_func_params).reshape(-1, 1) + _X = self.func(_X, axis=1, **self._effective_func_params)[:, None] if self.percentiles is None: self.thresholds_ = None @@ -136,15 +127,15 @@ def transform(self, X, y=None): Parameters ---------- X : ndarray of shape (n_samples,) or (n_samples, 1) - Time series to build a target for. + Univariate time series to build a target for. y : None - There is no need for a target, yet the pipeline API - requires this parameter. + There is no need for a target, yet the pipeline API requires this + parameter. Returns ------- - Xt : ndarray of shape (n_samples_new, 1) + Xt : ndarray of shape (n_samples_new,) The cut input time series. """ @@ -153,9 +144,9 @@ def transform(self, X, y=None): Xt = Xt[:-self.n_steps_future] - if self.n_steps_future < self.width: - Xt = Xt[self.width - self.n_steps_future:] - return Xt.reshape(-1, 1) + if self.n_steps_future < self.size - 1: + Xt = Xt[self.size - 1 - self.n_steps_future:] + return Xt def resample(self, y, X=None): """Resample `y`. @@ -179,8 +170,7 @@ def resample(self, y, X=None): y = column_or_1d(y) yr = self._sliding_window.transform(y) - yr = self.func( - yr, axis=1, **self._effective_func_params).reshape(-1, 1) + yr = self.func(yr, axis=1, **self._effective_func_params)[:, None] if self.thresholds_ is not None: yr = np.abs(yr) @@ -192,7 +182,7 @@ def resample(self, y, X=None): [1 * (yr >= self.thresholds_[-1])], axis=1) yr = np.nonzero(yr)[1].reshape(yr.shape[0], 1) - if self.n_steps_future >= self.width: - yr = yr[self.n_steps_future - self.width + 1:] + if self.n_steps_future > self.size - 1: + yr = yr[self.n_steps_future - self.size + 1:] return yr.reshape(-1) diff --git a/gtda/time_series/tests/test_embedding.py b/gtda/time_series/tests/test_embedding.py index 7b9adf0ed..5f51e80cd 100644 --- a/gtda/time_series/tests/test_embedding.py +++ b/gtda/time_series/tests/test_embedding.py @@ -7,7 +7,8 @@ from numpy.testing import assert_almost_equal from sklearn.exceptions import NotFittedError -from gtda.time_series import SlidingWindow, TakensEmbedding +from gtda.time_series import SlidingWindow, \ + takens_embedding_optimal_parameters, SingleTakensEmbedding, TakensEmbedding pio.renderers.default = 'plotly_mimetype' @@ -54,15 +55,23 @@ [2.93799998, 2.98935825, 2.79848711, 2.41211849, 1.92484888]]) +def test_takens_embedding_optimal_parameters_validate(): + time_delay = -1 + dimension = 2 + with pytest.raises(ValueError): + takens_embedding_optimal_parameters(signal, time_delay, dimension, + validate=True) + + def test_embedder_params(): parameters_type = 'not_defined' - embedder = TakensEmbedding(parameters_type=parameters_type) + embedder = SingleTakensEmbedding(parameters_type=parameters_type) with pytest.raises(ValueError): embedder.fit(signal) def test_embedder_not_fitted(): - embedder = TakensEmbedding() + embedder = SingleTakensEmbedding() with pytest.raises(NotFittedError): embedder.transform(signal) @@ -71,39 +80,159 @@ def test_embedder_not_fitted(): [('search', signal_embedded_search), ('fixed', signal_embedded_fixed)]) def 
test_embedder_transform(parameters_type, expected): - embedder = TakensEmbedding(parameters_type=parameters_type) + embedder = SingleTakensEmbedding(parameters_type=parameters_type) assert_almost_equal(embedder.fit_transform(signal), expected) def test_embedder_resample(): - embedder = TakensEmbedding(parameters_type='fixed', time_delay=3, - dimension=2, stride=3) + embedder = SingleTakensEmbedding(parameters_type='fixed', time_delay=3, + dimension=2, stride=3) embedder.fit(signal) y_resampled = embedder.resample(y) assert_almost_equal(y_resampled, y[np.arange(4, 20, 3)]) -def test_window_params(): - windows = SlidingWindow(width=0) +@pytest.mark.parametrize("size", [0, -1]) +def test_window_params(size): + windows = SlidingWindow(size=size) with pytest.raises(ValueError): windows.fit(signal) def test_window_transform(): - windows = SlidingWindow(width=3, stride=2) + windows = SlidingWindow(size=4, stride=2) X_windows = windows.fit_transform(signal_embedded_search) assert (X_windows.shape == (8, 4, 2)) def test_window_resample(): - windows = SlidingWindow(width=3, stride=2) + windows = SlidingWindow(size=4, stride=2) windows.fit(y) y_resampled = windows.resample(y) assert_almost_equal(y_resampled, y[np.arange(3, 20, 2)]) -def test_window_plot(): - windows = SlidingWindow(width=3, stride=2) - X_windows = windows.fit_transform(signal_embedded_search) - windows.plot(X_windows, sample=0) +def test_window_slice_windows(): + windows = SlidingWindow(size=4, stride=2) + X = signal_embedded_search + X_windows = windows.fit_transform(X) + slice_idx = windows.slice_windows(X) + assert_almost_equal( + np.stack([X[begin:end] for begin, end in slice_idx]), X_windows + ) + + +@pytest.mark.parametrize('time_delay', list(range(1, 5))) +@pytest.mark.parametrize('dimension', list(range(1, 5))) +@pytest.mark.parametrize('stride', list(range(1, 5))) +def test_takens_embedding_consistent_with_single_takens_embedding( + time_delay, dimension, stride + ): + """Test that TakensEmbedding and SingleTakensEmbedding give identical + outputs (up to shape) on arrays representing single univariate time + series.""" + n_points = (len(signal) - time_delay * (dimension - 1) - 1) // stride + 1 + single_embedder = SingleTakensEmbedding(parameters_type='fixed', + time_delay=time_delay, + dimension=dimension, stride=stride) + embedder = TakensEmbedding(time_delay=time_delay, dimension=dimension, + stride=stride) + if n_points <= 0: + with pytest.raises(ValueError): + single_embedder.fit_transform(signal) + with pytest.raises(ValueError): + embedder.fit_transform(signal[None, :]) + else: + single_embedder_res = single_embedder.fit_transform(signal) + embedder_res = embedder.fit_transform(signal[None, :])[0] + assert np.array_equal(single_embedder_res, embedder_res) + + +@pytest.mark.parametrize("params", + [{"time_delay": 0}, {"time_delay": -1}, + {"dimension": 0}, {"dimension": -1}, + {"stride": 0}, {"stride": -1}, + {"flatten": "foo"}, {"ensure_last_value": "bar"}]) +def test_takens_embedding_validation(params): + if "flatten" not in params and "ensure_last_value" not in params: + exception_type = ValueError + else: + exception_type = TypeError + with pytest.raises(exception_type): + TE = TakensEmbedding(**params) + TE.fit(signal[None, :]) + + +def test_takens_embedding_2D(): + """Test the return values of TakensEmbedding on 2D input or list of 1D + input, with default parameters.""" + signals = np.arange(10).reshape(2, 5) + TE = TakensEmbedding() + signals_emb = TE.fit_transform(signals) + signals_emb_list = 
TE.fit_transform(list(signals)) + signals_emb_exp = np.array([[[0, 1], + [1, 2], + [2, 3], + [3, 4]], + [[5, 6], + [6, 7], + [7, 8], + [8, 9]]]) + assert np.array_equal(signals_emb, signals_emb_exp) + assert np.array_equal(np.asarray(signals_emb_list), signals_emb_exp) + + +def test_takens_embedding_3D_default(): + """Test the return values of TakensEmbedding on 3D input or list of 2D + input, with default parameters.""" + signals = np.arange(20).reshape(2, 2, 5) + TE = TakensEmbedding() + signals_emb = TE.fit_transform(signals) + signals_emb_list = TE.fit_transform(list(signals)) + signals_emb_exp = np.array([[[0, 1, 5, 6], + [1, 2, 6, 7], + [2, 3, 7, 8], + [3, 4, 8, 9]], + [[10, 11, 15, 16], + [11, 12, 16, 17], + [12, 13, 17, 18], + [13, 14, 18, 19]]]) + assert np.array_equal(signals_emb, signals_emb_exp) + assert np.array_equal(np.asarray(signals_emb_list), signals_emb_exp) + + +def test_takens_embedding_3D_no_flatten(): + """Test the return values of TakensEmbedding on 3D input or list of 2D + input, with `flatten` set to ``False``.""" + signals = np.arange(20).reshape(2, 2, 5) + TE = TakensEmbedding(flatten=False) + signals_emb = TE.fit_transform(signals) + signals_emb_list = TE.fit_transform(list(signals)) + signals_emb_exp = np.array([[[[0, 1], + [1, 2], + [2, 3], + [3, 4]], + [[5, 6], + [6, 7], + [7, 8], + [8, 9]]], + [[[10, 11], + [11, 12], + [12, 13], + [13, 14]], + [[15, 16], + [16, 17], + [17, 18], + [18, 19]]]]) + assert np.array_equal(signals_emb, signals_emb_exp) + assert np.array_equal(np.asarray(signals_emb_list), signals_emb_exp) + + +def test_takens_embedding_plot(): + trace_params = {"mode": "lines+markers"} + layout_params = {"title": "New title"} + TE = TakensEmbedding() + plotly_params = {"trace": trace_params, "layout": layout_params} + TE.fit_transform_plot([np.arange(20)], sample=0, + plotly_params=plotly_params) diff --git a/gtda/time_series/tests/test_features.py b/gtda/time_series/tests/test_features.py index 84aa5b6b1..90a7087b9 100644 --- a/gtda/time_series/tests/test_features.py +++ b/gtda/time_series/tests/test_features.py @@ -3,7 +3,7 @@ import numpy as np from numpy.testing import assert_almost_equal -from gtda.time_series.features import PermutationEntropy +from gtda.time_series import PermutationEntropy from itertools import product X = np.ones((10, 200, 3)) # 10 samples, of 200 points embedded in a 3d space diff --git a/gtda/time_series/tests/test_preprocessing.py b/gtda/time_series/tests/test_preprocessing.py index 1650bb717..156b96956 100644 --- a/gtda/time_series/tests/test_preprocessing.py +++ b/gtda/time_series/tests/test_preprocessing.py @@ -111,13 +111,14 @@ def test_stationarizer_errors(): stationarizer.fit(signal) +@pytest.mark.parametrize('X', [signal, signal.ravel()]) @pytest.mark.parametrize("operation, expected", [('return', signal_stationarized_return), ('log-return', signal_stationarized_log_return)]) -def test_stationarizer_transform(operation, expected): +def test_stationarizer_transform(X, operation, expected): stationarizer = Stationarizer(operation=operation) - assert_almost_equal(stationarizer.fit_transform(signal), expected) + assert_almost_equal(stationarizer.fit_transform(X), expected) @pytest.mark.parametrize("operation, expected", diff --git a/gtda/time_series/tests/test_target.py b/gtda/time_series/tests/test_target.py index 42a09a8c0..9d03c4fb8 100644 --- a/gtda/time_series/tests/test_target.py +++ b/gtda/time_series/tests/test_target.py @@ -5,50 +5,72 @@ from numpy.testing import assert_almost_equal import pytest -from 
gtda.time_series.target import Labeller +from gtda.time_series import Labeller signal = np.asarray([np.sin(x / 2) + 2 for x in range(0, 20)]) -X = np.tile(np.arange(10), reps=2).reshape(-1, 1) +X = np.tile(np.arange(10), reps=2) + + +@pytest.mark.parametrize("size", [0, -1]) +def test_labeller_params(size): + labeller = Labeller(size=size) + with pytest.raises(ValueError): + labeller.fit(signal) def test_labeller_shape(): - width = 3 - labeller = Labeller(width=width, func=np.std, func_params={}, + size = 4 + labeller = Labeller(size=size, func=np.std, func_params={}, percentiles=None, n_steps_future=1) signal_transformed = labeller.fit_transform(signal) - assert signal_transformed.shape == (20-(width+1)+1, 1) + assert signal_transformed.shape == (20 - size + 1,) def test_labeller_transformed(): - width = 5 + size = 6 n_steps_future = 1 - labeller = Labeller(width=width, func=np.max, func_params={}, + labeller = Labeller(size=size, func=np.max, func_params={}, percentiles=None, n_steps_future=n_steps_future) x, y = labeller.fit_transform_resample(X, X) - assert_almost_equal(x, X[(width-1):-n_steps_future]) + assert_almost_equal(x, X[(size - 2):-n_steps_future]) + assert len(x) == len(y) def test_labeller_resampled(): - width = 5 - n_steps_future = 1 - labeller = Labeller(width=width, func=np.max, func_params={}, - percentiles=None, n_steps_future=n_steps_future) + size = 6 + labeller = Labeller(size=size, func=np.max, func_params={}, + percentiles=None, n_steps_future=1) x, y = labeller.fit_transform_resample(X, X) assert_almost_equal(y, np.array([5, 6, 7, 8, 9, 9, 9, 9, 9, 9, 5, 6, 7, 8, 9])) + assert len(x) == len(y) + + # Test behaviour when n_steps_future = size - 1 + labeller.set_params(n_steps_future=size - 1) + x, y = labeller.fit_transform_resample(X, X) + assert_almost_equal(y, np.array([5, 6, 7, 8, 9, 9, 9, + 9, 9, 9, 5, 6, 7, 8, 9])) + assert len(x) == len(y) + + # Test behaviour when n_steps_future > size - 1 + labeller.set_params(n_steps_future=size) + x, y = labeller.fit_transform_resample(X, X) + assert_almost_equal(y, np.array([6, 7, 8, 9, 9, 9, 9, + 9, 9, 5, 6, 7, 8, 9])) + assert len(x) == len(y) def test_labeller_with_percentage(): - width = 5 + size = 6 n_steps_future = 1 - labeller = Labeller(width=width, func=np.max, func_params={}, + labeller = Labeller(size=size, func=np.max, func_params={}, percentiles=[100], n_steps_future=n_steps_future) labeller.fit(X) assert np.max(X) == labeller.thresholds_[0] def test_labeller_invalid_percentage(): - labeller = Labeller(width=5, func=np.max, func_params={}, + labeller = Labeller(size=6, func=np.max, func_params={}, percentiles=[101], n_steps_future=2) with pytest.raises(ValueError): labeller.fit_transform_resample(X, signal) diff --git a/gtda/utils/__init__.py b/gtda/utils/__init__.py index 919720e5a..7c4db39f1 100644 --- a/gtda/utils/__init__.py +++ b/gtda/utils/__init__.py @@ -1,11 +1,11 @@ -"""The module :mod:`gtda.utils` implements hyperparameter and input -validation functions.""" - -from .validation import check_diagrams, check_point_clouds, validate_params +"""The module :mod:`gtda.utils` includes various utilities.""" +from .validation import check_collection, check_point_clouds, check_diagrams, \ + validate_params __all__ = [ - 'check_diagrams', - 'check_point_clouds', - 'validate_params' -] + "check_collection", + "check_point_clouds", + "check_diagrams", + "validate_params" + ] diff --git a/gtda/utils/tests/test_validation.py b/gtda/utils/tests/test_validation.py index 378b2e59a..f26441399 100644 --- 
a/gtda/utils/tests/test_validation.py +++ b/gtda/utils/tests/test_validation.py @@ -1,12 +1,15 @@ """Tests for validation functions.""" # License: GNU AGPLv3 +from numbers import Integral + import numpy as np import pytest from sklearn.exceptions import DataDimensionalityWarning -from gtda.utils.validation import check_diagrams, validate_params, \ - check_point_clouds +from gtda.utils import check_collection, check_point_clouds, check_diagrams, \ + validate_params +from gtda.utils.intervals import Interval # Testing for validate_params @@ -36,34 +39,93 @@ def test_validate_params_list(): references[parameter_name]['of'].""" references = { 'par1': {'type': list, 'of': {'type': float, 'in': [1., 2.]}} - } + } parameters = {'par1': [1.]} validate_params(parameters, references) -# Testing check_diagrams -# Test for the wrong array key value -def test_inputs_keys_V(): - X = np.array([[[1, 1, 0], [2, 2, -1]]]) +def test_validate_params_tuple_of_types(): + references = { + 'n_coefficients': {'type': (type(None), list, int), + 'in': Interval(1, np.inf, closed='left'), + 'of': {'type': Integral, + 'in': Interval(1, np.inf, closed='left')}} + } + parameters = {'n_coefficients': None} + + validate_params(parameters, references) + + parameters['n_coefficients'] = 1 + validate_params(parameters, references) + + parameters['n_coefficients'] = 1. + with pytest.raises(TypeError): + validate_params(parameters, references) + + parameters['n_coefficients'] = 0 + with pytest.raises(ValueError): + validate_params(parameters, references) + + parameters['n_coefficients'] = [1, 2] + validate_params(parameters, references) + + parameters['n_coefficients'] = [1., 2.] + with pytest.raises(TypeError): + validate_params(parameters, references) + + parameters['n_coefficients'] = [0, 2] with pytest.raises(ValueError): + validate_params(parameters, references) + + +@pytest.mark.parametrize("bad_dim", [-1, 0.2]) +def test_check_diagrams_invalid_homology_dimensions(bad_dim): + X = np.array([[[1, 1, 0], [2, 2, bad_dim]]]) + with pytest.raises( + ValueError, + match="Homology dimensions should be positive integers" + ): + check_diagrams(X) + + +def test_check_diagrams_inf_mixed_with_finite_homology_dimensions(): + X = np.array([[[1, 1, 0], [2, 2, np.inf]]]) + with pytest.raises( + ValueError, + match="numpy.inf is a valid homology dimension" + ): check_diagrams(X) # Test for the wrong structure dimension -def test_inputs_arrayStruc_V(): +def test_check_diagrams_bad_input_dimension(): X = np.array([[[[1, 1, 0], [2, 2, 1]]]]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Input should be a 3D ndarray"): + check_diagrams(X) + + +# Test that axis 2 has length 3 +def test_check_diagrams_bad_axis_2_length(): + X = np.array([[[1, 1, 0, 4], [2, 2, 1, 4]]]) + + with pytest.raises(ValueError, match="with a 3rd dimension of 3"): + check_diagrams(X) + + +def test_check_diagrams_points_below_diagonal(): + X = np.array([[[1, 0, 0], [2, 2, 1]]]) + + with pytest.raises(ValueError, match="should be above the diagonal"): check_diagrams(X) # Testing check_point_clouds # Create several kinds of inputs class CreateInputs: - def __init__( - self, n_samples, n_1, n_2, n_samples_extra, n_1_extra, n_2_extra - ): + def __init__(self, n_samples, n_1, n_2, n_samples_extra, n_1_extra, + n_2_extra): N = n_samples * n_1 * n_2 n_1_rectang = n_1 + 1 n_2_rectang = n_2 - 1 @@ -142,7 +204,7 @@ def test_check_point_clouds_value_err_finite(): # Check that we error on 1d array input with pytest.raises(ValueError): - 
check_point_clouds(np.asarray(ex.X_list_tot)) + check_point_clouds(np.asarray(ex.X_list_tot, dtype=object)) # Check that we error on 2d array input with pytest.raises(ValueError): @@ -173,11 +235,6 @@ def test_check_point_clouds_warn_finite(): with pytest.warns(DataDimensionalityWarning): check_point_clouds(ex.X_list) - # Check that we throw warnings on list input when arrays have different - # number of columns - with pytest.warns(DataDimensionalityWarning): - check_point_clouds(ex.X_list_rectang_diff_cols) - def test_check_point_clouds_regular_inf(): """Cases in which part of the input is infinite and no warnings or errors @@ -238,7 +295,7 @@ def test_check_point_clouds_regular_nan(): @pytest.mark.parametrize("force_all_finite", [True, False]) def test_check_point_clouds_value_err_nan(force_all_finite): - """Cases in which part of the input is nan and we throw a + """Cases in which part of the input is NaN and we throw a ValueError.""" ex = CreateInputs( @@ -259,3 +316,21 @@ def test_check_point_clouds_value_err_nan(force_all_finite): with pytest.raises(ValueError): check_point_clouds( ex.X_list_rectang, force_all_finite=force_all_finite) + + +def test_check_collection_ragged_array(): + X = np.array([np.arange(2), np.arange(3)], dtype=object) + with pytest.raises(ValueError): + check_collection(X) + + +def test_check_collection_array_of_list(): + X = np.array([list(range(2)), list(range(3))], dtype=object) + with pytest.raises(ValueError): + check_collection(X) + + +def test_check_collection_list_of_list(): + X = [list(range(2)), list(range(3))] + Xnew = check_collection(X) + assert np.array_equal(np.array(X[0]), Xnew[0]) diff --git a/gtda/utils/validation.py b/gtda/utils/validation.py index bc5df763e..32cfa6872 100644 --- a/gtda/utils/validation.py +++ b/gtda/utils/validation.py @@ -6,9 +6,23 @@ from warnings import warn import numpy as np - -from sklearn.utils.validation import check_array +from scipy.sparse import issparse from sklearn.exceptions import DataDimensionalityWarning +from sklearn.utils.validation import check_array + + +def _check_array_mod(X, **kwargs): + """Modified version of :func:`sklearn.utils.validation.check_array. When + keyword parameter `force_all_finite` is set to False, NaNs are not + accepted but infinity is.""" + if not kwargs.get('force_all_finite', True): + Xnew = check_array(X, **kwargs) + if np.isnan(Xnew if not issparse(Xnew) else Xnew.data).any(): + raise ValueError("Input contains NaNs. Only finite values and " + "infinity are allowed when parameter " + "`force_all_finite` is False.") + return Xnew + return check_array(X, **kwargs) def check_diagrams(X, copy=False): @@ -33,47 +47,42 @@ def check_diagrams(X, copy=False): The converted and validated array of persistence diagrams. """ - X_array = np.asarray(X) - if X_array.ndim == 0: - raise ValueError( - f"Expected 3D array, got scalar array instead:\narray={X_array}.") + X_array = _check_array_mod(X, ensure_2d=False, allow_nd=True, + force_all_finite=False, copy=copy) if X_array.ndim != 3: raise ValueError( - f"Input should be a 3D ndarray, the shape is {X_array.shape}.") + f"Input should be a 3D ndarray, the shape is {X_array.shape}." + ) if X_array.shape[2] != 3: raise ValueError( f"Input should be a 3D ndarray with a 3rd dimension of 3 " - f"components, but there are {X_array.shape[2]} components.") + f"components, but there are {X_array.shape[2]} components." 
+ ) - X_array = X_array.astype(float, copy=False) - homology_dimensions = sorted(list(set(X_array[0, :, 2]))) + homology_dimensions = sorted(np.unique(X_array[0, :, 2])) for dim in homology_dimensions: if dim == np.inf: if len(homology_dimensions) != 1: raise ValueError( - f"np.inf is a valid homology dimension for a stacked " + f"numpy.inf is a valid homology dimension for a stacked " f"diagram but it should be the only one: " - f"homology_dimensions = {homology_dimensions}.") + f"homology_dimensions = {homology_dimensions}." + ) else: - if dim != int(dim): - raise ValueError( - f"All homology dimensions should be integer valued: " - f"{dim} can't be cast to an int of the same value.") - if dim != np.abs(dim): + if (dim != int(dim)) or (dim < 0): raise ValueError( - f"All homology dimensions should be integer valued: " - f"{dim} can't be cast to an int of the same value.") + f"Homology dimensions should be positive integers or " + f"numpy.inf: {dim} can't be cast to an int of the same " + f"value." + ) - n_points_above_diag = np.sum(X_array[:, :, 1] >= X_array[:, :, 0]) - n_points_global = X_array.shape[0] * X_array.shape[1] - if n_points_above_diag != n_points_global: + n_points_below_diag = np.sum(X_array[:, :, 1] < X_array[:, :, 0]) + if n_points_below_diag: raise ValueError( f"All points of all persistence diagrams should be above the " - f"diagonal, i.e. X[:,:,1] >= X[:,:,0]. " - f"{n_points_global - n_points_above_diag} points are under the " - f"diagonal.") - if copy: - X_array = np.copy(X_array) + f"diagonal, i.e. X[:, :, 1] >= X[:, :, 0]. {n_points_below_diag} " + f"points are below the diagonal." + ) return X_array @@ -83,45 +92,47 @@ def check_graph(X): return X -def _validate_params_single(parameter, reference, name): - if reference is None: - return - - ref_type = reference.get('type', None) - - # Check that parameter has the correct type - if (ref_type is not None) and (not isinstance(parameter, ref_type)): - raise TypeError( - f"Parameter `{name}` is of type {type(parameter)} while " - f"it should be of type {ref_type}.") - - # If the reference type parameter is not list, tuple, np.ndarray or dict, - # the checks are performed on the parameter object directly. - elif ref_type not in [list, tuple, np.ndarray, dict]: - ref_in = reference.get('in', None) - ref_other = reference.get('other', None) - if parameter is not None: - if (ref_in is not None) and (parameter not in ref_in): - raise ValueError( - f"Parameter `{name}` is {parameter}, which is not in " - f"{ref_in}.") - # Perform any other checks via the callable ref_others - if ref_other is not None: - return ref_other(parameter) - - # Explicitly return the type of reference if one of list, tuple, np.ndarray - # or dict. - else: - return ref_type - - def _validate_params(parameters, references, rec_name=None): + types_tuple = (list, tuple, np.ndarray, dict) + + def _validate_params_single(_parameter, _reference, _name): + if _reference is None: + return + + _ref_type = _reference.get('type', None) + + # Check that _parameter has the correct type + if not ((_ref_type is None) or isinstance(_parameter, _ref_type)): + raise TypeError(f"Parameter `{_name}` is of type " + f"{type(_parameter)} while it should be of type " + f"{_ref_type}.") + + # If neither the reference type is list, tuple, np.ndarray or dict, + # nor _parameter is an instance of one of these types, the checks are + # performed on _parameter directly. 
+ elif not ((_ref_type in types_tuple) + or isinstance(_parameter, types_tuple)): + ref_in = _reference.get('in', None) + ref_other = _reference.get('other', None) + if _parameter is not None: + if not ((ref_in is None) or _parameter in ref_in): + raise ValueError(f"Parameter `{_name}` is {_parameter}, " + f"which is not in {ref_in}.") + # Perform any other checks via the callable ref_others + if ref_other is not None: + return ref_other(_parameter) + + # Explicitly return the type of _reference if one of list, tuple, + # np.ndarray or dict. + else: + return _ref_type + for name, parameter in parameters.items(): if name not in references.keys(): name_extras = "" if rec_name is None else f" in `{rec_name}`" - raise KeyError( - f"`{name}`{name_extras} is not an available parameter. " - f"Available parameters are in {list(references.keys())}.") + raise KeyError(f"`{name}`{name_extras} is not an available " + f"parameter. Available parameters are in " + f"{tuple(references.keys())}.") reference = references[name] ref_type = _validate_params_single(parameter, reference, name) @@ -131,8 +142,8 @@ def _validate_params(parameters, references, rec_name=None): _validate_params(parameter, ref_of, rec_name=name) else: # List, tuple or ndarray type for i, parameter_elem in enumerate(parameter): - _validate_params_single( - parameter_elem, ref_of, f"{name}[{i}]") + _validate_params_single(parameter_elem, ref_of, + f"{name}[{i}]") def validate_params(parameters, references, exclude=None): @@ -156,9 +167,9 @@ def validate_params(parameters, references, exclude=None): - ``'type'``, mapping to a class or tuple of classes. ``parameter`` is checked to be an instance of this class or tuple of classes. - - ``'in'``, mapping to a dictionary, when the value of ``'type'`` is + - ``'in'``, mapping to an object, when the value of ``'type'`` is not one of ``list``, ``tuple``, ``numpy.ndarray`` or ``dict``. - Letting ``ref_in`` denote that dictionary, the following check is + Letting ``ref_in`` denote that object, the following check is performed: ``parameter in ref_in``. - ``'of'``, mapping to a dictionary, when the value of ``'type'`` @@ -176,7 +187,7 @@ def validate_params(parameters, references, exclude=None): - ``'other'``, which should map to a callable defining custom checks on ``parameter``. - exclude : list of str, or None, optional, default: ``None`` + exclude : list or None, optional, default: ``None`` List of parameter names which are among the keys in `parameters` but should be excluded from validation. ``None`` is equivalent to passing the empty list. @@ -188,32 +199,13 @@ def validate_params(parameters, references, exclude=None): return _validate_params(parameters_, references) -def _check_array_mod(X, **kwargs): - """Modified version of :func:`~sklearn.utils.validation.check_array. When - keyword parameter `force_all_finite` is set to False, NaNs are not - accepted but infinity is.""" - if not kwargs['force_all_finite']: - Xnew = check_array(X, **kwargs) - if np.isnan(Xnew).any(): - raise ValueError( - "Input contains NaN. Only finite values and infinity are " - "allowed when parameter `force_all_finite` is False.") - return Xnew - return check_array(X, **kwargs) - - def check_point_clouds(X, distance_matrices=False, **kwargs): """Input validation on arrays or lists representing collections of point clouds or of distance/adjacency matrices. 
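Review note: the reference-dictionary schema documented in the validate_params hunk above ('type', 'in', 'of', 'other') can be exercised directly. The snippet below mirrors the pattern used in the new tests; the parameter name is purely illustrative.

from gtda.utils import validate_params

# `par1` must be a list whose elements are floats belonging to {1.0, 2.0}.
references = {'par1': {'type': list, 'of': {'type': float, 'in': [1., 2.]}}}

validate_params({'par1': [1., 2.]}, references)  # passes silently

try:
    validate_params({'par1': [1, 2]}, references)  # ints instead of floats
except TypeError as err:
    print(err)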
The input is checked to be either a single 3D array using a single call - to :func:`~sklearn.utils.validation.check_array`, or a list of 2D arrays by - calling :func:`~sklearn.utils.validation.check_array` on each entry. In - the latter case, warnings are issued when not all point clouds are in - the same Euclidean space. - - Conversions and copies may be triggered as per - :func:`~gtda.utils.validation.check_list_of_arrays`. + to :func:`sklearn.utils.validation.check_array`, or a list of 2D arrays by + calling :func:`sklearn.utils.validation.check_array` on each entry. Parameters ---------- @@ -225,16 +217,16 @@ def check_point_clouds(X, distance_matrices=False, **kwargs): concrete point clouds in Euclidean space. In the first case, entries are allowed to be infinite unless otherwise specified in `kwargs`. - kwargs + **kwargs Keyword arguments accepted by - :func:`~sklearn.utils.validation.check_array`, with the following + :func:`sklearn.utils.validation.check_array`, with the following caveats: 1) `ensure_2d` and `allow_nd` are ignored; 2) if not passed explicitly, `force_all_finite` is set to be the boolean negation of `distance_matrices`; 3) when `force_all_finite` is set to ``False``, NaN inputs are not allowed; 4) `accept_sparse` and `accept_large_sparse` are only meaningful in the case of lists of 2D arrays, in which case they are passed to individual instances of - :func:`~sklearn.utils.validation.check_array` validating each entry + :func:`sklearn.utils.validation.check_array` validating each entry in the list. Returns @@ -257,8 +249,10 @@ def check_point_clouds(X, distance_matrices=False, **kwargs): else: extra_2D = "" raise ValueError( - f"Input must be a single 3D array or a list of 2D arrays. " - f"Array of dimension {X.ndim} passed." + extra_2D) + f"Input must be a single 3D array or a list of 2D arrays or " + f"sparse matrices. Structure of dimension {X.ndim} passed." + + extra_2D + ) if (X.shape[1] != X.shape[2]) and distance_matrices: raise ValueError( f"Input array X must have X.shape[1] == X.shape[2]: " @@ -269,20 +263,22 @@ def check_point_clouds(X, distance_matrices=False, **kwargs): "consistent with a collection of distance/adjacency " "matrices, but the input is being treated as a collection " "of vectors in Euclidean space.", - DataDimensionalityWarning, stacklevel=2) - Xnew = _check_array_mod(X, **kwargs_, allow_nd=True) + DataDimensionalityWarning, stacklevel=2 + ) + Xnew = _check_array_mod(X, allow_nd=True, **kwargs_) else: has_check_failed = False messages = [] Xnew = [] for i, x in enumerate(X): try: - xnew = _check_array_mod(x, **kwargs_, ensure_2d=True) - if distance_matrices: + xnew = _check_array_mod(x, ensure_2d=True, **kwargs_) + if distance_matrices and not issparse(xnew): if not x.shape[0] == x.shape[1]: raise ValueError( f"All arrays must be square: {x.shape[0]} rows " - f"and {x.shape[1]} columns found in this array.") + f"and {x.shape[1]} columns found in this array." + ) Xnew.append(xnew) except ValueError as e: has_check_failed = True @@ -290,20 +286,67 @@ def check_point_clouds(X, distance_matrices=False, **kwargs): if has_check_failed: raise ValueError( "The following errors were raised by the inputs:\n\n" + - "\n\n".join(messages)) + "\n\n".join(messages) + ) if not distance_matrices: if reduce(and_, (x.shape[0] == x.shape[1] for x in X), True): warn( - "All arrays are square. 
This is consistent with a " - "collection of distance/adjacency matrices, but the input " - "is being treated as a collection of vectors in Euclidean " - "space.", DataDimensionalityWarning, stacklevel=2) + "All arrays/matrices are square. This is consistent with " + "a collection of distance/adjacency matrices, but the " + "entries will be treated as collections of vectors in " + "Euclidean space.", DataDimensionalityWarning, + stacklevel=2 + ) - ref_dim = X[0].shape[1] # Embedding dimension of first sample - if not reduce(and_, (x.shape[1] == ref_dim for x in X[1:]), True): - warn( - "Not all point clouds have the same embedding dimension.", - DataDimensionalityWarning, stacklevel=2) + ref_dim = X[0].shape # Shape of first sample + if reduce(and_, (x.shape == ref_dim for x in X[1:]), True): + Xnew = np.asarray(Xnew) + + return Xnew + + +def check_collection(X, **kwargs): + """Generic input validation on arrays or lists of arrays. + + Parameters + ---------- + X : object + Input object to check / convert. + + **kwargs + Keyword arguments accepted by + :func:`sklearn.utils.validation.check_array`, with the following + caveats: 1) `ensure_2d` and `allow_nd` are ignored; 2) when + `force_all_finite` is set to ``False``, NaN inputs are not allowed. + + Returns + ------- + Xnew : ndarray or list + The converted and validated object. + + """ + kwargs_ = kwargs.copy() + kwargs_.pop('allow_nd', None) + kwargs_.pop('ensure_2d', None) + if hasattr(X, 'shape') and hasattr(X, 'ndim'): + Xnew = _check_array_mod(X, ensure_2d=True, allow_nd=True, **kwargs_) + else: + has_check_failed = False + messages = [] + Xnew = [] + for i, x in enumerate(X): + try: + xnew = _check_array_mod(x, ensure_2d=False, allow_nd=True, + **kwargs_) + Xnew.append(xnew) + except ValueError as e: + has_check_failed = True + messages.append(f"Entry {i}:\n{e}") + if has_check_failed: + raise ValueError( + "The following errors were raised by the inputs:\n\n" + + "\n\n".join(messages) + ) return Xnew diff --git a/requirements.txt b/requirements.txt index 320ecb400..281569679 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ -numpy >= 1.17.0 -scipy >= 0.17.0 -joblib >= 0.13 -scikit-learn >= 0.22.0 -python-igraph >= 0.7.1.post6 -plotly >= 4.4.1 -ipywidgets >= 7.5.1 \ No newline at end of file +numpy >= 1.19.1 +scipy >= 1.5.0 +joblib >= 0.16.0 +scikit-learn >= 0.23.1 +pyflagser >= 0.4.1 +python-igraph >= 0.8.2 +plotly >= 4.8.2 +ipywidgets >= 7.5.1 diff --git a/setup.py b/setup.py index 1c2dd5e0e..3c76b4075 100755 --- a/setup.py +++ b/setup.py @@ -13,68 +13,63 @@ from setuptools.command.build_ext import build_ext -version_file = os.path.join('gtda', '_version.py') +version_file = os.path.join("gtda", "_version.py") with open(version_file) as f: exec(f.read()) -with open('requirements.txt') as f: +with open("requirements.txt") as f: requirements = f.read().splitlines() -DISTNAME = 'giotto-tda' -DESCRIPTION = 'Toolbox for Machine Learning using Topological Data Analysis.' -with codecs.open('README.rst', encoding='utf-8-sig') as f: +DISTNAME = "giotto-tda" +DESCRIPTION = "Toolbox for Machine Learning using Topological Data Analysis." 
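Review note: a quick illustration of the new check_collection helper added above, mirroring the behaviour pinned down by the test_check_collection_* tests: a list of 1D sequences of different lengths is accepted and converted entry by entry, while a ragged object array is rejected.

import numpy as np
from gtda.utils import check_collection

# Two univariate time series of different lengths, passed as a list.
X_list = [list(range(2)), list(range(3))]
X_checked = check_collection(X_list)
print([x.shape for x in X_checked])  # [(2,), (3,)]

# The same data packed into a ragged object array raises a ValueError.
try:
    check_collection(np.array([np.arange(2), np.arange(3)], dtype=object))
except ValueError as err:
    print(err)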
+with codecs.open("README.rst", encoding="utf-8-sig") as f: LONG_DESCRIPTION = f.read() -LONG_DESCRIPTION_TYPE = 'text/x-rst' -MAINTAINER = 'Umberto Lupo, Lewis Tunstall' -MAINTAINER_EMAIL = 'maintainers@giotto.ai' -URL = 'https://github.com/giotto-ai/giotto-tda' -LICENSE = 'GNU AGPLv3' -DOWNLOAD_URL = 'https://github.com/giotto-ai/giotto-tda/tarball/v0.2.2' +LONG_DESCRIPTION_TYPE = "text/x-rst" +MAINTAINER = "Umberto Lupo, Lewis Tunstall" +MAINTAINER_EMAIL = "maintainers@giotto.ai" +URL = "https://github.com/giotto-ai/giotto-tda" +LICENSE = "GNU AGPLv3" +DOWNLOAD_URL = "https://github.com/giotto-ai/giotto-tda/tarball/v0.3.0" VERSION = __version__ # noqa -CLASSIFIERS = ['Intended Audience :: Science/Research', - 'Intended Audience :: Developers', - 'License :: OSI Approved', - 'Programming Language :: C++', - 'Programming Language :: Python', - 'Topic :: Software Development', - 'Topic :: Scientific/Engineering', - 'Operating System :: Microsoft :: Windows', - 'Operating System :: POSIX', - 'Operating System :: Unix', - 'Operating System :: MacOS', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8'] -KEYWORDS = 'machine learning, topological data analysis, persistent ' + \ - 'homology, persistence diagrams, Mapper' +CLASSIFIERS = ["Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "License :: OSI Approved", + "Programming Language :: C++", + "Programming Language :: Python", + "Topic :: Software Development", + "Topic :: Scientific/Engineering", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX", + "Operating System :: Unix", + "Operating System :: MacOS", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8"] +KEYWORDS = "machine learning, topological data analysis, persistent " \ + "homology, persistence diagrams, Mapper" INSTALL_REQUIRES = requirements -EXTRAS_REQUIRE = { - 'tests': [ - 'pytest', - 'pytest-cov', - 'pytest-azurepipelines', - 'pytest-benchmark', - 'jupyter_contrib_nbextensions', - 'flake8', - 'hypothesis'], - 'doc': [ - 'openml', - 'sphinx', - 'nbconvert', - 'sphinx-issues', - 'sphinx_rtd_theme', - 'numpydoc'], - 'examples': [ - 'jupyter', - 'pandas', - 'openml', - 'matplotlib'] -} +EXTRAS_REQUIRE = {"tests": ["pandas", + "pytest", + "pytest-cov", + "pytest-azurepipelines", + "pytest-benchmark", + "jupyter_contrib_nbextensions", + "flake8", + "hypothesis"], + "doc": ["openml", + "sphinx", + "nbconvert", + "sphinx-issues", + "sphinx_rtd_theme", + "numpydoc"], + "examples": ["jupyter", + "pandas", + "openml", + "matplotlib"]} def combine_requirements(base_keys): - return list( - set(k for v in base_keys for k in EXTRAS_REQUIRE[v])) + return list(set(k for v in base_keys for k in EXTRAS_REQUIRE[v])) EXTRAS_REQUIRE["dev"] = combine_requirements( @@ -82,7 +77,7 @@ def combine_requirements(base_keys): class CMakeExtension(Extension): - def __init__(self, name, sourcedir=''): + def __init__(self, name, sourcedir=""): Extension.__init__(self, name, sources=[]) self.sourcedir = os.path.abspath(sourcedir) @@ -90,16 +85,17 @@ def __init__(self, name, sourcedir=''): class CMakeBuild(build_ext): def run(self): try: - out = subprocess.check_output(['cmake', '--version']) + out = subprocess.check_output(["cmake", "--version"]) except OSError: - raise RuntimeError("CMake must be installed to build the " - " following extensions: " + - " , ".join(e.name for e in self.extensions)) + raise 
RuntimeError( + f"CMake must be installed to build the following extensions: " + f"{', '.join(e.name for e in self.extensions)}" + ) if platform.system() == "Windows": - cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', + cmake_version = LooseVersion(re.search(r"version\s*([\d.]+)", out.decode()).group(1)) - if cmake_version < '3.1.0': + if cmake_version < "3.1.0": raise RuntimeError("CMake >= 3.1.0 is required on Windows") self.install_dependencies() @@ -108,46 +104,36 @@ def run(self): self.build_extension(ext) def install_dependencies(self): - dir_start = os.getcwd() - dir_pybind11 = os.path.join(dir_start, - 'gtda', 'externals', 'pybind11') - if os.path.exists(dir_pybind11): - return 0 - os.mkdir(dir_pybind11) - subprocess.check_call(['git', 'clone', - 'https://github.com/pybind/pybind11.git', - dir_pybind11]) - - subprocess.check_call(['git', 'submodule', 'update', - '--init', '--recursive']) + subprocess.check_call(["git", "submodule", "update", + "--init", "--recursive"]) def build_extension(self, ext): extdir = os.path.abspath(os.path.join(os.path.dirname( - self.get_ext_fullpath(ext.name)), 'gtda', 'externals', 'modules')) - cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir, - '-DPYTHON_EXECUTABLE=' + sys.executable] + self.get_ext_fullpath(ext.name)), "gtda", "externals", "modules")) + cmake_args = [f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}", + f"-DPYTHON_EXECUTABLE={sys.executable}"] - cfg = 'Debug' if self.debug else 'Release' - build_args = ['--config', cfg] + cfg = "Debug" if self.debug else "Release" + build_args = ["--config", cfg] - if platform.system() == 'Windows': - cmake_args += [f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}' - f'={extdir}'] + if platform.system() == "Windows": + cmake_args += [f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}" + f"={extdir}"] if sys.maxsize > 2**32: - cmake_args += ['-A', 'x64'] - build_args += ['--', '/m'] + cmake_args += ["-A", "x64"] + build_args += ["--", "/m"] else: - cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg] - build_args += ['--', '-j2'] + cmake_args += [f"-DCMAKE_BUILD_TYPE={cfg}"] + build_args += ["--", "-j2"] env = os.environ.copy() - env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format( - env.get('CXXFLAGS', ''), self.distribution.get_version()) + env["CXXFLAGS"] = f"{env.get('CXXFLAGS', '')} -DVERSION_INFO="\ + f"\\'{self.distribution.get_version()}\\'" if not os.path.exists(self.build_temp): os.makedirs(self.build_temp) - subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, + subprocess.check_call(["cmake", ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env) - subprocess.check_call(['cmake', '--build', '.'] + build_args, + subprocess.check_call(["cmake", "--build", "."] + build_args, cwd=self.build_temp) @@ -167,5 +153,5 @@ def build_extension(self, ext): keywords=KEYWORDS, install_requires=INSTALL_REQUIRES, extras_require=EXTRAS_REQUIRE, - ext_modules=[CMakeExtension('gtda')], + ext_modules=[CMakeExtension("gtda")], cmdclass=dict(build_ext=CMakeBuild))
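Review note: as a closing illustration of the renamed time-series API introduced in this diff (SlidingWindow and Labeller now take size, and TakensEmbedding acts on collections of windows, as suggested in its Notes section), here is a hedged end-to-end sketch of the forecasting pattern described there. The window lengths, strides and embedding parameters are illustrative choices, not values taken from the library.

import numpy as np
from gtda.time_series import Labeller, SlidingWindow, TakensEmbedding

# A single univariate time series (illustrative parameters throughout).
signal = np.sin(np.arange(400) / 10)

# Regression target: standard deviation of the next window (Labeller defaults).
labeller = Labeller(size=20, func=np.std, n_steps_future=1)
signal_cut, labels = labeller.fit_transform_resample(signal, signal)

# Cut the trimmed series into sliding windows, then embed each window
# independently into a small point cloud.
windows = SlidingWindow(size=20, stride=5)
X_windows = windows.fit_transform(signal_cut)
y_windows = windows.resample(labels)  # labels aligned with the window ends

X_point_clouds = TakensEmbedding(time_delay=2, dimension=3).fit_transform(X_windows)
print(X_windows.shape, y_windows.shape, X_point_clouds.shape)

The resulting collection of point clouds can then be passed to a homology transformer such as gtda.homology.VietorisRipsPersistence, exactly as the TakensEmbedding Notes section suggests.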