From dbb98ecdd1cfee036c4af5fdbd7899d28d838d4d Mon Sep 17 00:00:00 2001 From: Umberto Lupo <46537483+ulupo@users.noreply.github.com> Date: Mon, 23 Mar 2020 15:46:31 +0100 Subject: [PATCH 1/3] Add @rth as a contributor and code author for v0.2.0 (#376) (#377) * Add @rth to code authors and list of contributors for v0.2.0 --- CODE_AUTHORS | 1 + doc/release.rst | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CODE_AUTHORS b/CODE_AUTHORS index feee72cc8..9a6bb4bec 100644 --- a/CODE_AUTHORS +++ b/CODE_AUTHORS @@ -11,3 +11,4 @@ Alessio Ghiraldello, amg28@protonmail.com Adélie Garin, adelie.garin@epfl.ch Anibal Medina-Mardones, anibal.medinamardones@epfl.ch Wojciech Reise, reisewojciech@gmail.com +Roman Yurchak, roman.yurchak@symerio.com diff --git a/doc/release.rst b/doc/release.rst index c48262261..d9caf3bb8 100644 --- a/doc/release.rst +++ b/doc/release.rst @@ -140,7 +140,7 @@ Thanks to our Contributors This release contains contributions from many people: -Umberto Lupo, Guillaume Tauzin, Wojciech Reise, Julian Burella Pérez, Lewis Tunstall, Anibal Medina-Mardones, and Adélie Garin. +Umberto Lupo, Guillaume Tauzin, Wojciech Reise, Julian Burella Pérez, Roman Yurchak, Lewis Tunstall, Anibal Medina-Mardones, and Adélie Garin. We are also grateful to all who filed issues or helped resolve them, asked and answered questions, and were part of inspiring discussions. From 72e31c2292adea17b9ea6694b6da3a117c0edf86 Mon Sep 17 00:00:00 2001 From: Umberto Lupo <46537483+ulupo@users.noreply.github.com> Date: Mon, 23 Mar 2020 18:20:17 +0100 Subject: [PATCH 2/3] Merge twine hotfixes post 0.2.0 (#380) * Add twine check to azure-pipelines.yml * Fix formatting and duplicate issues in README.rst --- README.rst | 8 +++++--- azure-pipelines.yml | 7 ++++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index d6938665f..e4ca2c40b 100644 --- a/README.rst +++ b/README.rst @@ -38,7 +38,9 @@ and the `Institute of Reconfigurable & Embedded Digital Systems (REDS) `_. If you need a different distribution license, please contact the `L2F team`_. @@ -96,7 +98,7 @@ the same environment. Developer installation ---------------------- -Please consult the `relevant page `_ +Please consult the `dedicated page `_ for detailed instructions on how to build ``giotto-tda`` from sources across different platforms. .. _contributing-section: @@ -106,14 +108,14 @@ Contributing We welcome new contributors of all experience levels. The Giotto community goals are to be helpful, welcoming, and effective. To learn more about -making a contribution to ``giotto-tda``, please consult the `relevant page +making a contribution to ``giotto-tda``, please consult `the relevant page `_. 
Testing ------- After installation, you can launch the test suite from outside the -source directory:: +source directory :: pytest gtda diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 2909d2857..33a984f4f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -200,6 +200,12 @@ jobs: contents: 'dist/*' targetFolder: '$(Build.ArtifactStagingDirectory)' + - script: | + set -e + pip install twine + twine check dist/* + displayName: 'Check distribution with twine' + - task: PublishBuildArtifacts@1 displayName: 'Create download link' inputs: @@ -208,7 +214,6 @@ jobs: - bash: | set -e - pip install twine twine upload -u giotto-learn -p $(pypi_psw) --skip-existing dist/* condition: eq(variables['nightly_check'], 'true') displayName: 'Upload nightly wheels to PyPI' From 08619b00f0bf19033162487667fa6e7540468cff Mon Sep 17 00:00:00 2001 From: Umberto Lupo <46537483+ulupo@users.noreply.github.com> Date: Wed, 8 Apr 2020 14:47:21 +0200 Subject: [PATCH 3/3] Create v0.2.1 (#392) * Add twine check to CI, attempt fix in README (#379) * Tests for plotting functions and methods, other increases in test coverage (#384) * Vectorization2 (#378) * Fix azure on windows (#388) * Allow more general input to ripser when metric='precomputed', improve/refactor check_point_clouds and add tests (#386) * Add citing entry to README.rst and doc/faq.rst (#389) * Fix markdown subsection formatting (#390) * Prepare release 0.2.1 (#391) Co-authored-by: Anibal M. Medina-Mardones Co-authored-by: REDS institute Co-authored-by: Guillaume Tauzin --- .azure-ci/docker_scripts.sh | 2 +- .azure-ci/install_boost.py | 23 ++ .coveragerc | 12 +- CMakeLists.txt | 2 +- README.rst | 23 +- azure-pipelines.yml | 12 +- doc/faq.rst | 21 +- doc/library.rst | 4 +- doc/release.rst | 39 +++ doc/theory/glossary.tex | 256 ++++++++++-------- examples/voids_on_the_plane.ipynb | 2 +- gtda/_version.py | 2 +- gtda/diagrams/distance.py | 5 +- gtda/diagrams/representations.py | 185 +++++++------ ...es.py => test_features_representations.py} | 79 ++++-- gtda/diagrams/tests/test_preprocessing.py | 77 ++++-- gtda/externals/python/ripser_interface.py | 8 +- gtda/graphs/tests/test_geodesic_distance.py | 13 +- gtda/graphs/tests/test_kneighbors.py | 8 +- gtda/graphs/tests/test_transition.py | 2 +- gtda/homology/simplicial.py | 75 ++--- gtda/homology/tests/test_cubical.py | 9 + gtda/homology/tests/test_simplicial.py | 65 +++-- gtda/images/tests/test_filtrations.py | 25 +- gtda/images/tests/test_preprocessing.py | 38 ++- gtda/point_clouds/tests/test_rescaling.py | 35 ++- gtda/time_series/tests/test_embedding.py | 16 +- gtda/utils/testing.py | 0 gtda/utils/tests/test_validation.py | 207 +++++++++++++- gtda/utils/validation.py | 117 ++++++-- setup.py | 4 +- 31 files changed, 973 insertions(+), 393 deletions(-) create mode 100644 .azure-ci/install_boost.py rename gtda/diagrams/tests/{test_features.py => test_features_representations.py} (70%) delete mode 100644 gtda/utils/testing.py diff --git a/.azure-ci/docker_scripts.sh b/.azure-ci/docker_scripts.sh index 4ad956407..73caa1524 100755 --- a/.azure-ci/docker_scripts.sh +++ b/.azure-ci/docker_scripts.sh @@ -39,7 +39,7 @@ cd /io pip install -e ".[dev]" # Test dev install with pytest -pytest gtda --cov --cov-report xml +pytest gtda --no-cov --no-coverage-upload # Uninstall giotto-tda/giotto-tda-nightly dev pip uninstall -y giotto-tda diff --git a/.azure-ci/install_boost.py b/.azure-ci/install_boost.py new file mode 100644 index 000000000..1f21a74c4 --- /dev/null +++ b/.azure-ci/install_boost.py @@ -0,0 
+1,23 @@ +#!/usr/bin/env python + +import os +from pathlib import Path +import urllib.request +import shutil +import zipfile + + +url = "https://dl.bintray.com/boostorg/release/1.72.0/source/boost_1_72_0.zip" +boost_folder = r"C:\local" + +Path(boost_folder).mkdir(parents=True, exist_ok=True) +zip_file = os.path.join(boost_folder, "1_72_0.zip") + +with urllib.request.urlopen(url) as response, \ + open(zip_file, 'wb') as out_file: + shutil.copyfileobj(response, out_file) + +with zipfile.ZipFile(zip_file, 'r') as zip_ref: + zip_ref.extractall(boost_folder) + +os.remove(zip_file) diff --git a/.coveragerc b/.coveragerc index e3bd56f66..af5cd07ce 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,14 +1,12 @@ [run] omit = */gtda/externals/* - **/setup.py - */gtda/compose/* - */gtda/datasets/* - */gtda/images/* - */gtda/neural_network/* - */gtda/model_selection/* + *tests* + **/gtda/utils/intervals.py + **/gtda/utils/_docs.py **/base.py **/pipeline.py - **/_version.py \ No newline at end of file + **/setup.py + **/_version.py diff --git a/CMakeLists.txt b/CMakeLists.txt index f611d750a..062b89a0b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/gtda/externals/pybind11) set(BINDINGS_DIR "gtda/externals/bindings") include(cmake/HelperBoost.cmake) -include_directories(${Boost_INCLUDE_DIR}) +include_directories(${Boost_INCLUDE_DIRS}) find_package(OpenMP) diff --git a/README.rst b/README.rst index e4ca2c40b..cea8acc71 100644 --- a/README.rst +++ b/README.rst @@ -120,12 +120,33 @@ source directory :: pytest gtda Important links ---------------- +=============== - Official source code repo: https://github.com/giotto-ai/giotto-tda - Download releases: https://pypi.org/project/giotto-tda/ - Issue tracker: https://github.com/giotto-ai/giotto-tda/issues + +Citing giotto-tda +================= + +If you use ``giotto-tda`` in a scientific publication, we would appreciate citations to the following paper: + + `giotto-tda: A Topological Data Analysis Toolkit for Machine Learning and Data Exploration `_, Tauzin *et al*, arXiv:2004.02551, 2020. + +You can use the following BibTeX entry: + +.. 
code:: RST + + @misc{tauzin2020giottotda, + title={giotto-tda: A Topological Data Analysis Toolkit for Machine Learning and Data Exploration}, + author={Guillaume Tauzin and Umberto Lupo and Lewis Tunstall and Julian Burella Pérez and Matteo Caorsi and Anibal Medina-Mardones and Alberto Dassatti and Kathryn Hess}, + year={2020}, + eprint={2004.02551}, + archivePrefix={arXiv}, + primaryClass={cs.LG} + } + Community ========= diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 33a984f4f..881f05b33 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -38,7 +38,7 @@ jobs: - task: Cache@2 inputs: - key: '"ccache-wheels-v2020.03.23" | $(Agent.OS) | "$(python.version)"' + key: '"ccache-wheels-v2020.04.07" | $(Agent.OS) | "$(python.version)"' path: $(CCACHE_DIR) displayName: ccache @@ -133,7 +133,7 @@ jobs: - task: Cache@2 inputs: - key: '"ccache-v2020.03.23" | $(Agent.OS) | "$(python.version)"' + key: '"ccache-v2020.04.07" | $(Agent.OS) | "$(python.version)"' path: $(CCACHE_DIR) displayName: ccache @@ -154,7 +154,7 @@ jobs: - script: | set -e pytest gtda --cov --cov-report xml - displayName: 'Test dev install with pytest' + displayName: 'Test dev install with pytest, upload coverage report' - script: | set -e @@ -246,13 +246,17 @@ jobs: condition: eq(variables['nightly_check'], 'true') displayName: 'Change name to giotto-tda-nightly' + - script: | + python .azure-ci/install_boost.py || exit /b + displayName: 'Install boost' + - script: | python -m pip install --upgrade pip setuptools python -m pip install -e ".[dev]" displayName: 'Install dev environment' - script: | - pytest gtda --cov --cov-report xml || exit /b + pytest gtda --no-cov --no-coverage-upload || exit /b displayName: 'Test dev install with pytest' - script: | diff --git a/doc/faq.rst b/doc/faq.rst index a27285501..93f183ac8 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -10,6 +10,25 @@ I am a researcher. Can I use ``giotto-tda`` in my project? Of course! The `license `_ is very permissive. For more information, please contact the `L2F team`_. +How do I cite ``giotto-tda``? +----------------------------- +We would appreciate citations to the following paper: + + `giotto-tda: A Topological Data Analysis Toolkit for Machine Learning and Data Exploration `_, Tauzin *et al*, arXiv:2004.02551, 2020. + +You can use the following BibTeX entry: + +.. code:: RST + + @misc{tauzin2020giottotda, + title={giotto-tda: A Topological Data Analysis Toolkit for Machine Learning and Data Exploration}, + author={Guillaume Tauzin and Umberto Lupo and Lewis Tunstall and Julian Burella Pérez and Matteo Caorsi and Anibal Medina-Mardones and Alberto Dassatti and Kathryn Hess}, + year={2020}, + eprint={2004.02551}, + archivePrefix={arXiv}, + primaryClass={cs.LG} + } + I cannot install ``giotto-tda`` ------------------------------- @@ -26,4 +45,4 @@ There are many TDA libraries available. How is ``giotto-tda`` different? ``giotto-tda`` is oriented towards machine learning (for details, see the :ref:`guiding principles `). This philosophy is in contrast with other reference libraries, like `GUDHI `_, which provide more low-level functionality at the expense of being less adapted to e.g. batch processing, or of -being tightly integrated with ``scikit-learn``. \ No newline at end of file +being tightly integrated with ``scikit-learn``. diff --git a/doc/library.rst b/doc/library.rst index 76bfc1be0..7c73509f4 100644 --- a/doc/library.rst +++ b/doc/library.rst @@ -114,5 +114,5 @@ What's new .. 
include:: release.rst - :start-after: Release 0.2.0 - :end-before: Release 0.1.4 + :start-after: Release 0.2.1 + :end-before: Release 0.2.0 diff --git a/doc/release.rst b/doc/release.rst index d9caf3bb8..43ebed123 100644 --- a/doc/release.rst +++ b/doc/release.rst @@ -5,6 +5,45 @@ Release Notes .. _stable: +************* +Release 0.2.1 +************* + +Major Features and Improvements +=============================== + +- The theory glossary has been improved to include the notions of vectorization, kernel and amplitude for persistence diagrams. +- The ``ripser`` function in ``gtda.externals.python.ripser_interface`` no longer uses scikit-learn's ``pairwise_distances`` when + ``metric`` is ``'precomputed'``, thus allowing square arrays with negative entries or infinities to be passed. +- ``check_point_clouds`` in ``gtda.utils.validation`` now checks for square array input when the input should be a collection of + distance-type matrices. Warnings guide the user to correctly setting the ``distance_matrices`` parameter. ``force_all_finite=False`` + no longer means accepting NaN input (only infinite input is accepted). +- ``VietorisRipsPersistence`` in ``gtda.homology.simplicial`` no longer masks out infinite entries in the input to be fed to + ``ripser``. +- The docstrings for ``check_point_clouds`` and ``VietorisRipsPersistence`` have been improved to reflect these changes and the + extra level of generality for ``ripser``. + +Bug Fixes +========= + +- The variable used to indicate the location of Boost headers has been renamed from ``Boost_INCLUDE_DIR`` to ``Boost_INCLUDE_DIRS`` + to address developer installation issues in some Linux systems. + +Backwards-Incompatible Changes +============================== + +- The keyword parameter ``distance_matrix`` in ``check_point_clouds`` has been renamed to ``distance_matrices``. + +Thanks to our Contributors +========================== + +This release contains contributions from many people: + +Umberto Lupo, Anibal Medina-Mardones, Julian Burella Pérez, Guillaume Tauzin, and Wojciech Reise. + +We are also grateful to all who filed issues or helped resolve them, asked and answered questions, and were part of +inspiring discussions. + ************* Release 0.2.0 ************* diff --git a/doc/theory/glossary.tex b/doc/theory/glossary.tex index 7494377d0..0a9c8f518 100644 --- a/doc/theory/glossary.tex +++ b/doc/theory/glossary.tex @@ -15,14 +15,11 @@ linkcolor=blue, urlcolor=blue} - \begin{document} - + \title{Theory Glossary} \maketitle - \bibliography{bibliography} - \section{Symbols} \begin{tabular}{ l l} @@ -33,7 +30,7 @@ $\mathbb R^d$ & The vector space of $d$-tuples of real numbers. \\ $\Delta$ & The %\hyperref[multiset]{multiset} - multiset $ \lbrace (s, s) \mid s \in \mathbb{R} \rbrace $ with multiplicity $ ( s,s ) \mapsto +\infty$. + multiset $ \lbrace (s, s) \mid s \in \mathbb{R} \rbrace $ with multiplicity $ ( s,s ) \mapsto +\infty$. \end{tabular} \section{Homology} @@ -42,7 +39,7 @@ An \textit{elementary interval} $I_a$ is a subset of $\mathbb{R}$ of the form $[a, a+1]$ or $[a,a] = \{a\}$ for some $a \in \mathbb{R}$. These two types are called respectively \textit{non-degenerate} and \textit{degenerate}. To a non-degenerate elementary interval we assign two degenerate elementary intervals \begin{equation*} - d^+ I_a = [a+1, a+1] \qquad \text{and} \qquad d^- I_a = [a, a]. + d^+ I_a = \lbrack a+1, a+1 \rbrack \qquad \text{and} \qquad d^- I_a = \lbrack a, a \rbrack. 
\end{equation*} An \textit{elementary cube} is a subset of the form \begin{equation*} @@ -65,9 +62,9 @@ A set $\{v_0, \dots, v_n\} \subset \mathbb{R}^N$ is said to be \textit{geometrically independent} if the vectors $\{v_0-v_1, \dots, v_0-v_n\}$ are linearly independent. In this case, we refer to their convex closure as a \textit{simplex}, explicitly \begin{equation*} - \lbrack v_0, \ldots , v_n \rbrack = \left\{ \sum c_i (v_0 - v_i)\ \big|\ c_1+\dots+c_n = 1,\ c_i \geq 0 \right\} + \lbrack v_0, \dots , v_n \rbrack = \left\{ \sum c_i (v_0 - v_i)\ \big|\ c_1+\dots+c_n = 1,\ c_i \geq 0 \right\} \end{equation*} - and to $n$ as its \textit{dimension}. The $i$\textit{-th face} of $[v_0, \dots, v_n]$ is defined by + and to $n$ as its \textit{dimension}. The $i$\textit{-th face} of $\lbrack v_0, \dots, v_n \rbrack$ is defined by \begin{equation*} d_i[v_0, \ldots, v_n] = [v_0, \dots, \widehat{v}_i, \dots, v_n] \end{equation*} @@ -104,7 +101,7 @@ An \textit{ordered simplicial complex} is an % \hyperref[abstract_simplicial_complex]{abstract simplicial complex} - abstract simplicial complex where the set of vertices is equipped with a partial order such that the restriction of this partial order to any simplex is a total order. We denote an $n$-simplex using its ordered vertices by $[v_0, \dots, v_n]$. + abstract simplicial complex where the set of vertices is equipped with a partial order such that the restriction of this partial order to any simplex is a total order. We denote an $n$-simplex using its ordered vertices by $\lbrack v_0, \dots, v_n \rbrack$. A \textit{simplicial map} between ordered simplicial complexes is a simplicial map $f$ between their underlying simplicial complexes preserving the order, i.e., $v \leq w$ implies $f(v) \leq f(w)$. @@ -240,7 +237,7 @@ % \hyperref[filtered_complex]{filtered complex} filtered complex $VR_s(X)$ that contains a subset of $X$ as a simplex if all pairwise distances in the subset are less than or equal to $s$, explicitly \begin{equation*} - VR_s(X) = \Big\{ [v_0,\dots,v_n]\ \Big|\ \forall i,j\ \,d(v_i, v_j) \leq s \Big\}. + VR_s(X) = \Big\{ \lbrack v_0,\dots,v_n \rbrack \ \Big|\ \forall i,j\ \,d(v_i, v_j) \leq s \Big\}. \end{equation*} The \textit{Vietoris-Rips persistence} of $(X, d)$ is the % \hyperref[persistent_simplicial_(co)homology]{persistent simplicial (co)homology} @@ -263,11 +260,11 @@ % \hyperref[filtered_complex]{filtered complex} filtered complex $\check{C}_s(X)$ that is empty if $s<0$ and, if $s \geq 0$, contains a subset of $X$ as a simplex if the balls of radius $s$ with centers in the subset have a non-empty intersection, explicitly \begin{equation*} - \check{C}_s(X) = \Big\{ [v_0,\dots,v_n]\ \Big|\ \bigcap_{i=0}^n B_s(x_i) \neq \emptyset \Big\}. + \check{C}_s(X) = \Big\{ \lbrack v_0,\dots,v_n \rbrack \ \Big|\ \bigcap_{i=0}^n B_s(x_i) \neq \emptyset \Big\}. \end{equation*} The \textit{\v Cech persistence (co)homology} of $(X,d)$ is the - % \hyperref[persistent_simplicial_(co)homology]{persistent simplicial (co)homo-logy} - persistent simplicial (co)homo-logy of $\check{C}_s(X)$. + % \hyperref[persistent_simplicial_(co)homology]{persistent simplicial (co)homology} + persistent simplicial (co)homology of $\check{C}_s(X)$. \subsection*{Multiset} \label{multiset} @@ -307,76 +304,69 @@ \begin{equation*} \sup_{x \in D_1 \cup \Delta} ||x - \gamma(x)||_{\infty.} \end{equation*} + + The set of persistence diagrams together with any of the distances above is a + %\hyperref[metric_space]{metric space}. + metric space. 
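For instance, if $D_1 = \lbrace (0, 2) \rbrace$ and $D_2$ is empty (a toy check, using only the notation above), any matching $\gamma$ must send $(0, 2)$ to a point of $\Delta$; the closest such point in the $\ell^\infty$ norm is $(1, 1)$, so the bottleneck distance between $D_1$ and $D_2$ equals $\Vert (0, 2) - (1, 1) \Vert_\infty = 1$.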
\paragraph{\\ Reference:} \cite{kerber2017geometry}

- \subsection*{Persistence landscape} \label{persistence_landscape}
-
- A \textit{persistence landscape} is a set $\{\lambda_k\}_{k \in \mathbb N}$ of functions
- \begin{equation*}
- \lambda : \mathbb R \to \overline{\mathbb R}
- \end{equation*}
- where $\lambda_k$ is referred to as the $k$\textit{-layer of the persistence landscape}.
+ \subsection*{Persistence landscape} \label{persistence_landscape}

 Let $\{(b_i, d_i)\}_{i \in I}$ be a
- % \hyperref[persistence_diagram]{persistence diagram}
- persistence diagram. Its \textit{associated persistence landscape} $\lambda$ is defined by letting $\lambda_k$ be the $k$-th largest value of the set $\{\Lambda_i(t)\}_{i \in I}$ where
- \begin{equation*}
- \Lambda_i(t) = \left[ \min \{t-b_i, d_i-t\}\right]_+
- \end{equation*}
- and $c_+ := \max(c,0)$.
-
- Intuitively, we can describe the set of graphs of a persistence landscape by first joining each of the points in the multiset to the diagonal via a horizontal as well as a vertical line, then clockwise rotating the figure 45 degrees and rescaling it by $1/\sqrt{2}$.
-
- \paragraph{\\ Reference:} \cite{bubenik2015statistical}
-
- \subsection*{Persistence landscape norm} \label{persistence_landscape_norm}
-
- Given a function $f : \mathbb R \to \overline{\mathbb R}$ define
- \begin{equation*}
- ||f||_p = \left( \int_{\mathbb R} f^p(x)\, dx \right)^{1/p}
- \end{equation*}
- whenever the right hand side exists and is finite.
-
- The $p$\textit{-norm} of a
- % \hyperref[persistence_landscape]{persistence landscape}
- persistence landscape $\lambda = \{\lambda_k\}_{k \in \mathbb N}$ is defined to be
-
- \begin{equation*}
- ||\lambda||_p = \left( \sum_{i \in \mathbb N} ||\lambda_i||^p_p \right)^{1/p}
- \end{equation*}
- whenever the right hand side exists and is finite.
+ %\hyperref[persistence_diagram]{persistence diagram}
+ persistence diagram. Its \textit{persistence landscape} is the set $\{\lambda_k\}_{k \in \mathbb N}$ of functions
+ \begin{equation*}
+ \lambda_k : \mathbb R \to \overline{\mathbb R}
+ \end{equation*}
+ defined by letting $\lambda_k(t)$ be the $k$-th largest value of the set $\{\Lambda_i(t)\}_{i \in I}$ where
+ \begin{equation*}
+ \Lambda_i(t) = \left[ \min \{t-b_i, d_i-t\}\right]_+
+ \end{equation*}
+ and $c_+ := \max(c,0)$. The function $\lambda_k$ is referred to as the \textit{$k$-layer of the persistence landscape}.
+
+ We describe the graph of each $\lambda_k$ intuitively. For each $i \in I$, draw an isosceles triangle with base the interval $(b_i, d_i)$ on the horizontal $t$-axis, and sides with slope 1 and $-1$. This subdivides the plane into a number of polygonal regions. Label each of these regions by the number of triangles containing it. If $P_k$ is the union of the polygonal regions with values at least $k$, then the graph of $\lambda_k$ is the upper contour of $P_k$, with $\lambda_k(a) = 0$ if the vertical line $t=a$ does not intersect $P_k$.
+
+ The persistence landscape construction defines a
+ %\hyperref[vectorization_kernel_and_amplitude]{vectorization}
+ vectorization of the set of persistence diagrams with target the vector space of real-valued functions on $\mathbb N \times \mathbb R$.
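For instance, if $D = \lbrace (0, 2) \rbrace$ consists of a single point, then $\lambda_1(t) = \left[ \min \{t, 2-t\} \right]_+$ is a tent-shaped function which vanishes outside $(0, 2)$ and peaks at $\lambda_1(1) = 1$, while $\lambda_k = 0$ for all $k \geq 2$.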
For any $p = 1,\dots,\infty$ we can restrict attention to persistence diagrams $D$ whose associated persistence landscape $\lambda$ is + %\hyperref[lp_norm]{$p$-integrable} + $p$-integrable, that is to say, + \begin{equation} \label{equation:persistence_landscape_norm} + ||\lambda||_p = \left( \sum_{i \in \mathbb N} ||\lambda_i||^p_p \right)^{1/p} + \end{equation} + where + \begin{equation*} + ||\lambda_i||_p = \left( \int_{\mathbb R} \lambda_i^p(x)\, dx \right)^{1/p} + \end{equation*} + is finite. In this case we refer to \eqref{equation:persistence_landscape_norm} as the + %\hyperref[vectorization_kernel_and_amplitude]{amplitude} + \textit{landscape} $p$-\textit{amplitude} of $D$. - \paragraph{\\ References:} \cite{stein2011functional, bubenik2015statistical} + \paragraph{\\ References:} \cite{bubenik2015statistical} \subsection*{Weighted silhouette} \label{weighted_silhouette} - Let $D = {(b_i, d_i)}_{i \in I}$ be a - % \hyperref[persistence_diagram] {persistence diagram} - persistence diagram. A \textit{weighted silhouette} associated to $D$ is a continuous function $\phi : \mathbb R \to \mathbb R$ of the form + Let $D = \{(b_i, d_i)\}_{i \in I}$ be a + %\hyperref[persistence_diagram]{persistence diagram} + persistence diagram and $w = \{w_i\}_{i \in I}$ a set of positive real numbers. The \textit{silhouette of $D$ weighted by $w$} is the function $\phi : \mathbb R \to \mathbb R$ defined by + \begin{equation*} + \phi(t) = \frac{\sum_{i \in I}w_i \Lambda_i(t)}{\sum_{i \in I}w_i}, + \end{equation*} + where \begin{equation*} - \phi(t) = \frac{\sum_{i \in I}w_i \Lambda_i(t)}{\sum_{i \in I}w_i}, - \end{equation*} - where $\{w_i\}_{i \in I}$ is a set of positive real numbers and - \begin{equation*} \label{equation:lambda_for_persistence_landscapes} - \Lambda_i(t) = \left[ \min \{t-b_i, d_i-t\}\right]_+ - \end{equation*} - with $c_+ := \max(c,0)$. The particular choice $w_i = \vert d_i - b_i \vert^p$ for $0 < p \leq \infty$ is referred to as \textit{power-weighted silhouettes}. + \Lambda_i(t) = \left[ \min \{t-b_i, d_i-t\}\right]_+ + \end{equation*} + and $c_+ := \max(c,0)$. When $w_i = \vert d_i - b_i \vert^p$ for $0 < p \leq \infty$ we refer to $\phi$ as the \textit{$p$-power-weighted silhouette} of $D$. The silhouette construction defines a + %\hyperref[vectorization_kernel_and_amplitude]{vectorization} + vectorization of the set of persistence diagrams with target the vector space of continuous real-valued functions on $\mathbb R$. \paragraph{\\ References:} \cite{chazal2014stochastic} - \subsection*{Amplitude} - \label{amplitude} - - Given a function assigning a real number to a pair of persistence diagrams, we define the \textit{amplitude} of a persistence diagram $D$ to be the value assigned to the pair $(D \cup \Delta, \Delta)$. Important examples of such functions are: %\hyperref[wasserstein_and_bottleneck_distance]{Wasserstein and bottleneck distances} - Wasserstein and bottleneck distances and - % \hyperref[persistence_landscape_norm]{landscape distance} - landscape distance. - \subsection*{Persistence entropy} \label{persistence_entropy} Intuitively, this is a measure of the entropy of the points in a - % \hyperref[persistence_diagram]{persistence diagram} + % \hyperref[persistence_diagram]{persistence diagram} persistence diagram. Precisely, let $D = \{(b_i, d_i)\}_{i \in I}$ be a persistence diagram with each $d_i < +\infty$. 
The \textit{persistence entropy} of $D$ is defined by
 \begin{equation*}
 E(D) = - \sum_{i \in I} p_i \log(p_i)
@@ -396,54 +386,66 @@

 The name is inspired from the case when the persistence diagram comes from persistent homology.

- \subsection*{Distances, inner products and kernels} \label{metric_inner_product_and_kernel}
+ \subsection*{Metric space} \label{metric_space}
+ A set $X$ with a function
+ \begin{equation*}
+ d : X \times X \to \mathbb R
+ \end{equation*}
+ is said to be a \textit{metric space} if the values of $d$ are all non-negative and for all $x,y,z \in X$
+ \begin{equation*}
+ d(x,y) = 0\ \Leftrightarrow\ x = y
+ \end{equation*}
+ \begin{equation*}
+ d(x,y) = d(y,x)
+ \end{equation*}
+ \begin{equation*}
+ d(x,z) \leq d(x,y) + d(y, z).
+ \end{equation*}
+ In this case, $d$ is referred to as the \textit{metric} or the \textit{distance function}.

- A set $X$ with a function
- \begin{equation*}
- d : X \times X \to \mathbb R
- \end{equation*}
- is called a \textit{metric space} if the values of $d$ are all non-negative and for all $x,y,z \in X$
- \begin{equation*}
- d(x,y) = 0\ \Leftrightarrow\ x = y
- \end{equation*}
- \begin{equation*}
- d(x,y) = d(y,x)
- \end{equation*}
- \begin{equation*}
- d(x,z) \leq d(x,y) + d(y, z).
- \end{equation*}
- In this case the $d$ is referred to as the \textit{metric} or the \textit{distance function}.
+ \subsection*{Inner product and norm} \label{inner_product_and_norm}

 A vector space $V$ together with a function
- \begin{equation*}
- \langle -, - \rangle : V \times V \to \mathbb R
- \end{equation*}
- is called and \textit{inner product space} if for all $u,v,w \in V$
- \begin{equation*}
- u \neq 0\ \Rightarrow\ \langle u, u \rangle > 0
+ \begin{equation*}
+ \langle -, - \rangle : V \times V \to \mathbb R
 \end{equation*}
+ is said to be an \textit{inner product space} if for all $u,v,w \in V$ and $a \in \mathbb R$
+ \begin{equation*}
+ u \neq 0\ \Rightarrow\ \langle u, u \rangle > 0
+ \end{equation*}
+ \begin{equation*}
+ \langle u, v\rangle = \langle v, u\rangle
 \end{equation*}
- \begin{equation*}
- \langle u, v\rangle = \langle v, u\rangle
- \end{equation*}
- \begin{equation*}
- \langle au+v, w \rangle = a\langle u, w \rangle + \langle v, w \rangle.
- \end{equation*}
- In this case the function $\langle -, - \rangle$ is referred to as the \textit{inner product} and the function given by
- \begin{equation*}
- ||u|| = \sqrt{\langle u, u \rangle}
+ \begin{equation*}
+ \langle au+v, w \rangle = a\langle u, w \rangle + \langle v, w \rangle.
+ \end{equation*}
+ The function $\langle -, - \rangle$ is referred to as the \textit{inner product}.
 \end{equation*}
- as its associated \textit{norm}. An inner product space is naturally a metric space with distance function
 \begin{equation*}
- d(u,v) = ||u-v||.
- \end{equation*}

- A \textit{kernel} on a set $X$ is a function
- \begin{equation*}
- k : X \times X
- \end{equation*}
- for which there exists a function $\phi : X \to V$ to an inner product space such that
- \begin{equation*}
- k(x, y) = \langle \phi(x), \phi(y) \rangle.
+ A vector space $V$ together with a function
+ \begin{equation*}
+ ||-|| : V \to \mathbb R
+ \end{equation*}
+ is said to be a \textit{normed space} if the values of $||-||$ are all non-negative and for all $u,v \in V$ and $a \in \mathbb R$
+ \begin{equation*}
+ ||u|| = 0\ \Leftrightarrow\ u = 0
+ \end{equation*}
+ \begin{equation*}
+ ||a u || = |a|\, ||u||
+ \end{equation*}
+ \begin{equation*}
+ ||u+v|| \leq ||u|| + ||v||.
+ \end{equation*}
+ The function $||-||$ is referred to as the \textit{norm}.
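For instance, $||x||_\infty = \max_i |x_i|$ defines a norm on $\mathbb R^n$: positivity and homogeneity are immediate, and the triangle inequality follows coordinatewise from $|x_i + y_i| \leq |x_i| + |y_i|$. This is the norm appearing in the definition of the bottleneck distance above.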
+
+ An inner product space is naturally a normed space with
+ \begin{equation*}
+ ||u|| = \sqrt{\langle u, u \rangle}
+ \end{equation*}
+ and a normed space is naturally a
+ %\hyperref[metric_space]{metric space}
+ metric space with distance function
+ \begin{equation*}
+ d(u,v) = ||u-v||.
 \end{equation*}

 \subsection*{Euclidean distance and norm} \label{euclidean_distance_and_norm}
@@ -454,7 +456,37 @@
 \begin{equation*}
 \langle x, y \rangle = x_1 y_1 + \cdots + x_n y_n.
 \end{equation*}
- The associated norm and distance function are referred to as \textit{Euclidean norm} and \textit{Euclidean distance}.
+ This inner product is referred to as the \textit{dot product}, and the associated norm and distance function are respectively named the \textit{Euclidean norm} and \textit{Euclidean distance}.
+
+ \subsection*{Vectorization, kernel and amplitude} \label{vectorization_kernel_and_amplitude}
+
+ Let $X$ be a set, for example, the set of all
+ %\hyperref[persistence_diagram]{persistence diagrams}
+ persistence diagrams. A \textit{vectorization} for $X$ is a function
+ \begin{equation*}
+ \phi : X \to V
+ \end{equation*}
+ where $V$ is a vector space. A \textit{kernel} on the set $X$ is a function
+ \begin{equation*}
+ k : X \times X \to \mathbb R
+ \end{equation*}
+ for which there exists a vectorization $\phi : X \to V$ with $V$ an
+ %\hyperref[inner_product_and_norm]{inner product space}
+ inner product space such that
+ \begin{equation*}
+ k(x,y) = \langle \phi(x), \phi(y) \rangle
+ \end{equation*}
+ for each $x,y \in X$. Similarly, an \textit{amplitude} on $X$ is a function
+ \begin{equation*}
+ A : X \to \mathbb R
+ \end{equation*}
+ for which there exists a vectorization $\phi : X \to V$ with $V$ a
+ %\hyperref[inner_product_and_norm]{normed space}
+ normed space such that
+ \begin{equation*}
+ A(x) = ||\phi(x)||
+ \end{equation*}
+ for all $x \in X$.

 \subsection*{Finite metric spaces and point clouds} \label{finite_metric_spaces_and_point_clouds}
@@ -530,12 +562,10 @@
 \paragraph{\\ References:} \cite{milnor1997topology,guillemin2010differential}

 \subsection*{Compact subset} \label{compact_subset}
-
 A subset $K$ of a metric space $(X,d)$ is said to be \textit{bounded} if there exists a real number $D$ such that for each pair of elements in $K$ the distance between them is less than $D$. It is said to be \textit{complete} if for any $x \in X$ it is the case that $x \in K$ if for any $\epsilon > 0$ the intersection between $K$ and $\{y \,;\ d(x,y) < \epsilon \}$ is not empty. It is said to be \textit{compact} if it is both bounded and complete.
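As a minimal illustration of the vectorization and amplitude definitions in code (a NumPy sketch only, with an arbitrary toy diagram and sampling grid of my choosing, and independent of the implementation in gtda.diagrams), the landscape p-amplitude of a diagram can be approximated as follows:

import numpy as np

# Toy persistence diagram: two (birth, death) pairs in one homology dimension.
diagram = np.array([[0.0, 2.0], [1.0, 4.0]])
t = np.linspace(0.0, 5.0, 5001)  # sampling grid for the filtration parameter

# Lambda_i(t) = [min(t - b_i, d_i - t)]_+ : one "tent" function per diagram point.
tents = np.maximum(np.minimum(t - diagram[:, [0]], diagram[:, [1]] - t), 0.0)

# lambda_k(t) is the k-th largest tent value at each t.
landscape = -np.sort(-tents, axis=0)

# Landscape p-amplitude: (sum_k ||lambda_k||_p^p)^(1/p), via the trapezoidal rule.
p = 2
amplitude = np.trapz(landscape ** p, t, axis=1).sum() ** (1 / p)
print(amplitude)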
- + \section{Bibliography} \bibliography{bibliography}{} \bibliographystyle{alpha} - -\end{document} +\end{document} \ No newline at end of file diff --git a/examples/voids_on_the_plane.ipynb b/examples/voids_on_the_plane.ipynb index 52ee302db..19e7c24e1 100644 --- a/examples/voids_on_the_plane.ipynb +++ b/examples/voids_on_the_plane.ipynb @@ -8,7 +8,7 @@ "\n", "The classic example of a two-dimensional homology class is the \"void\" surrounded by a sphere in three-dimensional space.\n", "Challenge question: **Can two-dimensional topological voids arise from point clouds in two-dimensional space?**\n", - "We will answer this question programmatically by computing Vietoris–Rips persistence homology of random point clouds in the square $[0, 1] \\times [0, 1] \\subset \\mathbb{R}^2$.\n", + "We will answer this question programmatically by computing Vietoris–Rips persistent homology of random point clouds in the square $[0, 1] \\times [0, 1] \\subset \\mathbb{R}^2$.\n", "\n", "If you are looking at a static version of this notebook and would like to run its contents, head over to [github](https://github.com/giotto-ai/giotto-tda/blob/master/examples/voids_on_the_plane.ipynb).\n", "\n", diff --git a/gtda/_version.py b/gtda/_version.py index 9d0c2f8e6..7ffe6f6dd 100644 --- a/gtda/_version.py +++ b/gtda/_version.py @@ -19,4 +19,4 @@ # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = '0.2.0' +__version__ = '0.2.1' diff --git a/gtda/diagrams/distance.py b/gtda/diagrams/distance.py index 32c45fd6f..ae37f50b7 100644 --- a/gtda/diagrams/distance.py +++ b/gtda/diagrams/distance.py @@ -102,9 +102,8 @@ class PairwiseDistance(BaseEstimator, TransformerMixin): See also -------- - Amplitude, Scaler, Filtering, \ - BettiCurve, PersistenceLandscape, \ - HeatKernel, Silhouette, \ + Amplitude, Scaler, Filtering, BettiCurve, PersistenceLandscape, \ + PersistenceImage, HeatKernel, Silhouette, \ gtda.homology.VietorisRipsPersistence Notes diff --git a/gtda/diagrams/representations.py b/gtda/diagrams/representations.py index bda0386fa..05f8fdf36 100644 --- a/gtda/diagrams/representations.py +++ b/gtda/diagrams/representations.py @@ -179,51 +179,51 @@ def plot(self, Xt, sample=0, homology_dimensions=None): for dim in homology_dimensions: if dim not in self.homology_dimensions_: raise ValueError( - f'All homology dimensions must be in ' - f'self.homology_dimensions_ which is ' - f'{self.homology_dimensions_}. {dim} is not.') + f"All homology dimensions must be in " + f"self.homology_dimensions_ which is " + f"{self.homology_dimensions_}. 
{dim} is not.") else: homology_dimensions_arr = np.array( self.homology_dimensions_) ix = np.flatnonzero(homology_dimensions_arr == dim)[0] _homology_dimensions.append((ix, dim)) - layout = { - "xaxis1": { - "title": "Filtration parameter", - "side": "bottom", - "type": "linear", - "ticks": "outside", - "anchor": "x1", - "showline": True, - "zeroline": True, - "showexponent": "all", - "exponentformat": "e" - }, - "yaxis1": { - "title": "Betti number", - "side": "left", - "type": "linear", - "ticks": "outside", - "anchor": "y1", - "showline": True, - "zeroline": True, - "showexponent": "all", - "exponentformat": "e" - }, - "plot_bgcolor": "white" - } + layout = dict( + xaxis1=dict( + title="Filtration parameter", + side="bottom", + type="linear", + ticks="outside", + anchor="x1", + showline=True, + zeroline=True, + showexponent="all", + exponentformat="e" + ), + yaxis1=dict( + title="Betti number", + side="left", + type="linear", + ticks="outside", + anchor="y1", + showline=True, + zeroline=True, + showexponent="all", + exponentformat="e" + ), + plot_bgcolor="white" + ) fig = gobj.Figure(layout=layout) - fig.update_xaxes(zeroline=True, linewidth=1, linecolor='black', + fig.update_xaxes(zeroline=True, linewidth=1, linecolor="black", mirror=False) - fig.update_yaxes(zeroline=True, linewidth=1, linecolor='black', + fig.update_yaxes(zeroline=True, linewidth=1, linecolor="black", mirror=False) for ix, dim in _homology_dimensions: fig.add_trace(gobj.Scatter(x=self.samplings_[dim], y=Xt[sample][ix], mode='lines', showlegend=True, - name=f'H{int(dim)}')) + name=f"H{int(dim)}")) fig.show() @@ -395,38 +395,38 @@ def plot(self, Xt, sample=0, homology_dimensions=None): for dim in homology_dimensions: if dim not in self.homology_dimensions_: raise ValueError( - f'All homology dimensions must be in ' - f'self.homology_dimensions_ which is ' - f'{self.homology_dimensions_}. {dim} is not.') + f"All homology dimensions must be in " + f"self.homology_dimensions_ which is " + f"{self.homology_dimensions_}. 
{dim} is not.") else: homology_dimensions_arr = np.array( self.homology_dimensions_) ix = np.flatnonzero(homology_dimensions_arr == dim)[0] _homology_dimensions.append((ix, dim)) - layout = { - "xaxis1": { - "side": "bottom", - "type": "linear", - "ticks": "outside", - "anchor": "y1", - "showline": True, - "zeroline": True, - "showexponent": "all", - "exponentformat": "e" - }, - "yaxis1": { - "side": "left", - "type": "linear", - "ticks": "outside", - "anchor": "x1", - "showline": True, - "zeroline": True, - "showexponent": "all", - "exponentformat": "e" - }, - "plot_bgcolor": "white" - } + layout = dict( + xaxis1=dict( + side="bottom", + type="linear", + ticks="outside", + anchor="y1", + showline=True, + zeroline=True, + showexponent="all", + exponentformat="e" + ), + yaxis1=dict( + side="left", + type="linear", + ticks="outside", + anchor="x1", + showline=True, + zeroline=True, + showexponent="all", + exponentformat="e" + ), + plot_bgcolor="white" + ) Xt_sample = Xt[sample] for ix, dim in _homology_dimensions: @@ -434,9 +434,9 @@ def plot(self, Xt, sample=0, homology_dimensions=None): layout_dim['title'] = "Persistence landscape for homology " + \ "dimension {}".format(int(dim)) fig = gobj.Figure(layout=layout_dim) - fig.update_xaxes(zeroline=True, linewidth=1, linecolor='black', + fig.update_xaxes(zeroline=True, linewidth=1, linecolor="black", mirror=False) - fig.update_yaxes(zeroline=True, linewidth=1, linecolor='black', + fig.update_yaxes(zeroline=True, linewidth=1, linecolor="black", mirror=False) n_layers = Xt_sample.shape[1] @@ -598,8 +598,7 @@ def transform(self, X, y=None): transpose((1, 0, 2, 3)) return Xt - def plot(self, Xt, sample=0, homology_dimension_ix=0, - colorscale='blues'): + def plot(self, Xt, sample=0, homology_dimension_ix=0, colorscale='blues'): """Plot a single channel – corresponding to a given homology dimension – in a sample from a collection of heat kernel images. @@ -1014,50 +1013,50 @@ def plot(self, Xt, sample=0, homology_dimensions=None): for dim in homology_dimensions: if dim not in self.homology_dimensions_: raise ValueError( - f'All homology dimensions must be in ' - f'self.homology_dimensions_ which is ' - f'{self.homology_dimensions_}. {dim} is not.') + f"All homology dimensions must be in " + f"self.homology_dimensions_ which is " + f"{self.homology_dimensions_}. 
{dim} is not.") else: homology_dimensions_arr = np.array( self.homology_dimensions_) ix = np.flatnonzero(homology_dimensions_arr == dim)[0] _homology_dimensions.append((ix, dim)) - layout = { - "xaxis1": { - "title": "Filtration parameter", - "side": "bottom", - "type": "linear", - "ticks": "outside", - "anchor": "x1", - "showline": True, - "zeroline": True, - "showexponent": "all", - "exponentformat": "e" - }, - "yaxis1": { - "side": "left", - "type": "linear", - "ticks": "outside", - "anchor": "y1", - "showline": True, - "zeroline": True, - "showexponent": "all", - "exponentformat": "e" - }, - "plot_bgcolor": "white" - } + layout = dict( + xaxis1=dict( + title="Filtration parameter", + side="bottom", + type="linear", + ticks="outside", + anchor="x1", + showline=True, + zeroline=True, + showexponent="all", + exponentformat="e" + ), + yaxis1=dict( + side="left", + type="linear", + ticks="outside", + anchor="y1", + showline=True, + zeroline=True, + showexponent="all", + exponentformat="e" + ), + plot_bgcolor="white" + ) fig = gobj.Figure(layout=layout) - fig.update_xaxes(zeroline=True, linewidth=1, linecolor='black', + fig.update_xaxes(zeroline=True, linewidth=1, linecolor="black", mirror=False) - fig.update_yaxes(zeroline=True, linewidth=1, linecolor='black', + fig.update_yaxes(zeroline=True, linewidth=1, linecolor="black", mirror=False) for ix, dim in _homology_dimensions: fig.add_trace(gobj.Scatter(x=self.samplings_[dim], y=Xt[sample][ix], - mode='lines', showlegend=True, - hoverinfo='none', - name=f'H{int(dim)}')) + mode="lines", showlegend=True, + hoverinfo="none", + name=f"H{int(dim)}")) fig.show() diff --git a/gtda/diagrams/tests/test_features.py b/gtda/diagrams/tests/test_features_representations.py similarity index 70% rename from gtda/diagrams/tests/test_features.py rename to gtda/diagrams/tests/test_features_representations.py index b0ab05518..0c036d684 100644 --- a/gtda/diagrams/tests/test_features.py +++ b/gtda/diagrams/tests/test_features_representations.py @@ -1,7 +1,8 @@ -"""Testing for features""" +"""Testing for features and vector representations.""" # License: GNU AGPLv3 import numpy as np +import plotly.io as pio import pytest from hypothesis import given from hypothesis.extra.numpy import arrays, array_shapes @@ -9,35 +10,76 @@ from numpy.testing import assert_almost_equal from sklearn.exceptions import NotFittedError -from gtda.diagrams import PersistenceEntropy, HeatKernel, \ - PersistenceImage, Silhouette +from gtda.diagrams import PersistenceEntropy, BettiCurve, \ + PersistenceLandscape, HeatKernel, PersistenceImage, Silhouette -diagram = np.array([[[0, 1, 0], [2, 3, 0], [4, 6, 1], [2, 6, 1]]]) +pio.renderers.default = 'plotly_mimetype' +X = np.array([[[0., 1., 0.], [2., 3., 0.], [4., 6., 1.], [2., 6., 1.]]]) -def test_pe_not_fitted(): - pe = PersistenceEntropy() + +def test_not_fitted(): + with pytest.raises(NotFittedError): + PersistenceEntropy().transform(X) + + with pytest.raises(NotFittedError): + BettiCurve().transform(X) + + with pytest.raises(NotFittedError): + PersistenceLandscape().transform(X) + + with pytest.raises(NotFittedError): + HeatKernel().transform(X) + + with pytest.raises(NotFittedError): + PersistenceImage().transform(X) with pytest.raises(NotFittedError): - pe.transform(diagram) + Silhouette().transform(X) + + +@pytest.mark.parametrize('hom_dim_ix', [0, 1]) +def test_fit_transform_plot_one_hom_dim(hom_dim_ix): + HeatKernel().fit_transform_plot( + X, sample=0, homology_dimension_ix=hom_dim_ix) + PersistenceImage().fit_transform_plot( + X, 
sample=0, homology_dimension_ix=hom_dim_ix) + + +@pytest.mark.parametrize('hom_dims', [None, (0,), (1,), (0, 1)]) +def test_fit_transform_plot_many_hom_dims(hom_dims): + BettiCurve().fit_transform_plot( + X, sample=0, homology_dimensions=hom_dims) + PersistenceLandscape().fit_transform_plot( + X, sample=0, homology_dimensions=hom_dims) + Silhouette().fit_transform_plot( + X, sample=0, homology_dimensions=hom_dims) def test_pe_transform(): pe = PersistenceEntropy() diagram_res = np.array([[0.69314718, 0.63651417]]) - assert_almost_equal(pe.fit_transform(diagram), diagram_res) + assert_almost_equal(pe.fit_transform(X), diagram_res) -def test_pi_not_fitted(): - pi = PersistenceImage(sigma=1) - with pytest.raises(NotFittedError): - pi.transform(diagram) +@pytest.mark.parametrize('n_bins', range(10, 51, 10)) +def test_bc_transform_shape(n_bins): + bc = BettiCurve(n_bins=n_bins) + X_res = bc.fit_transform(X) + assert X_res.shape == (1, bc._n_dimensions, n_bins) + + +@pytest.mark.parametrize('n_bins', range(10, 51, 10)) +@pytest.mark.parametrize('n_layers', range(1, 10)) +def test_pl_transform_shape(n_bins, n_layers): + pl = PersistenceLandscape(n_bins=n_bins, n_layers=n_layers) + X_res = pl.fit_transform(X) + assert X_res.shape == (1, pl._n_dimensions, n_layers, n_bins) @given(X=arrays(dtype=np.float, unique=True, - elements=integers(min_value=-1e10, - max_value=1e6), + elements=integers(min_value=-1e10, max_value=1e6), shape=array_shapes(min_dims=1, max_dims=1, min_side=11))) def test_pi_null(X): """Test that, if one trivial diagram (all pts on the diagonal) is provided, @@ -74,7 +116,7 @@ def test_silhouette_transform(): 0.05, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0., 0.05, 0.1, 0.15, 0.2, 0.25, 0.2, 0.15, 0.1, 0.05, 0.]) - assert_almost_equal(sht.fit_transform(diagram)[0][0], X_sht_res) + assert_almost_equal(sht.fit_transform(X)[0][0], X_sht_res) def test_silhouette_big_order(): @@ -111,7 +153,7 @@ def test_silhouette_big_order(): def _validate_distinct(X): - """Check if, in X, there is any persistence diagram for which all births + """Check if, in X, there is any persistence X for which all births and deaths are equal.""" unique_values = [np.unique(x[:, 0:2]) for x in X] if np.any([len(u) < 2 for u in unique_values]): @@ -164,13 +206,13 @@ def test_hk_positive(pts, dims): @given(pts_gen, dims_gen) def test_hk_big_sigma(pts, dims): - """ We expect that with a huge sigma, the diagrams are so diluted that + """We expect that with a huge sigma, the diagrams are so diluted that they are almost 0. 
Effectively, verifies that the smoothing is applied.""" n_bins = 10 x = get_input(pts, dims) hk = HeatKernel(sigma=100*np.max(np.abs(x)), n_bins=n_bins) - x_t = hk.fit(x).transform(x) + x_t = hk.fit_transform(x) assert np.all(np.abs(x_t) <= 1e-4) @@ -186,7 +228,6 @@ def test_hk_with_diag_points(pts): diag_points = np.array([[[2, 2, 0], [3, 3, 0], [7, 7, 0]]]) x_with_diag_points = np.concatenate([x, diag_points], axis=1) - # X_total = np.concatenate([X,X_with_diag_points], axis =0) hk = hk.fit(x_with_diag_points) x_t, x_with_diag_points_t = [hk.transform(x_) diff --git a/gtda/diagrams/tests/test_preprocessing.py b/gtda/diagrams/tests/test_preprocessing.py index 13ec6f910..f145a9b03 100644 --- a/gtda/diagrams/tests/test_preprocessing.py +++ b/gtda/diagrams/tests/test_preprocessing.py @@ -1,11 +1,15 @@ -"""Testing for ForgetDimension and Scaler.""" +"""Testing of preprocessing tools for persistence diagrams.""" # License: GNU AGPLv3 import numpy as np +import plotly.io as pio import pytest +from numpy.testing import assert_almost_equal from sklearn.exceptions import NotFittedError -from gtda.diagrams import ForgetDimension, Scaler +from gtda.diagrams import ForgetDimension, Scaler, Filtering + +pio.renderers.default = 'plotly_mimetype' X_1 = np.array([[[0., 0.36905774, 0], [0., 0.37293977, 0], @@ -209,38 +213,71 @@ def test_not_fitted(): - dst = ForgetDimension() - dsc = Scaler() + with pytest.raises(NotFittedError): + ForgetDimension().transform(X_1) with pytest.raises(NotFittedError): - dst.transform(X_1) + Scaler().transform(X_1) with pytest.raises(NotFittedError): - dsc.transform(X_1) + Scaler().inverse_transform(X_1) with pytest.raises(NotFittedError): - dsc.inverse_transform(X_1) + Filtering().transform(X_1) + + +def test_forg_fit_transform_plot(): + ForgetDimension().fit_transform_plot(X_1, sample=0) + + +@pytest.mark.parametrize('hom_dims', [None, (0,), (1,)]) +def test_fit_transform_plot(hom_dims): + Scaler().fit_transform_plot( + X_1, sample=0, homology_dimensions=hom_dims) + + Filtering().fit_transform_plot( + X_1, sample=0, homology_dimensions=hom_dims) @pytest.mark.parametrize('X', [X_1, X_2]) -def test_dst_transform(X): - dst = ForgetDimension() - X_res = dst.fit_transform(X) +def test_forg_transform_shape(X): + forg = ForgetDimension() + X_res = forg.fit_transform(X) assert X_res.shape == X.shape -parameters = [('wasserstein', {'p': 2}), - ('betti', {'n_bins': 10}), - ('bottleneck', None)] +parameters_sc = [('wasserstein', {'p': 2}), + ('betti', {'n_bins': 10}), + ('bottleneck', None)] -@pytest.mark.parametrize(('metric', 'metric_params'), parameters) +@pytest.mark.parametrize(('metric', 'metric_params'), parameters_sc) @pytest.mark.parametrize('X', [X_1, X_2]) -def test_dd_transform(X, metric, metric_params): - dsc = Scaler(metric=metric, metric_params=metric_params, n_jobs=1) - X_res = dsc.fit_transform(X) +def test_sc_transform_shape(X, metric, metric_params): + sc = Scaler(metric=metric, metric_params=metric_params, n_jobs=1) + X_res = sc.fit_transform(X) assert X_res.shape == X.shape - dsc = Scaler(metric=metric, metric_params=metric_params, n_jobs=1) - X_inv_res = dsc.fit(X_res).inverse_transform(X_res) - assert X_inv_res.shape == X.shape + X_inv_res = sc.inverse_transform(X_res) + assert_almost_equal(X_inv_res, X) + + +@pytest.mark.parametrize('X', [X_1, X_2]) +def test_filt_transform_zero(X): + filt = Filtering(epsilon=0.) 
+ X_res = filt.fit_transform(X[:, [0], :]) + assert_almost_equal(X_res, X[:, [0], :]) + + +lifetimes_1 = X_1[:, :, 1] - X_1[:, :, 0] +epsilons_1 = np.linspace(np.min(lifetimes_1), np.max(lifetimes_1), num=3) + + +@pytest.mark.parametrize('epsilon', epsilons_1) +def test_filt_transform(epsilon): + filt = Filtering(epsilon=epsilon) + X_res_1 = filt.fit_transform(X_1) + assert X_res_1.shape == X_1.shape + + lifetimes_res_1 = X_res_1[:, :, 1] - X_res_1[:, :, 0] + assert not ((lifetimes_res_1 > 0.) & (lifetimes_res_1 <= epsilon)).any() diff --git a/gtda/externals/python/ripser_interface.py b/gtda/externals/python/ripser_interface.py index a31c96ae0..8b28b688d 100644 --- a/gtda/externals/python/ripser_interface.py +++ b/gtda/externals/python/ripser_interface.py @@ -163,7 +163,8 @@ def ripser(X, maxdim=1, thresh=np.inf, coeff=2, metric="euclidean", ) if n_perm and n_perm < 0: raise Exception( - "Should be a strictly positive number of points in the greedy permutation" + "Should be a strictly positive number of points in the greedy " + "permutation" ) idx_perm = np.arange(X.shape[0]) @@ -175,7 +176,10 @@ def ripser(X, maxdim=1, thresh=np.inf, coeff=2, metric="euclidean", r_cover = lambdas[-1] dm = dperm2all[:, idx_perm] else: - dm = pairwise_distances(X, metric=metric) + if metric == 'precomputed': + dm = X + else: + dm = pairwise_distances(X, metric=metric) dperm2all = dm n_points = dm.shape[0] diff --git a/gtda/graphs/tests/test_geodesic_distance.py b/gtda/graphs/tests/test_geodesic_distance.py index 20b210237..29b41bae8 100644 --- a/gtda/graphs/tests/test_geodesic_distance.py +++ b/gtda/graphs/tests/test_geodesic_distance.py @@ -1,12 +1,15 @@ -"""Testing for GraphGeodesicDistance""" +"""Testing for GraphGeodesicDistance.""" import numpy as np +import plotly.io as pio import pytest from numpy.testing import assert_almost_equal from sklearn.exceptions import NotFittedError from gtda.graphs import GraphGeodesicDistance +pio.renderers.default = 'plotly_mimetype' + X_ggd = np.array([ np.array( [[0, 1, 3, 0, 0], @@ -22,14 +25,18 @@ [0, 0, 0, 0, 0]])]) -def test_graph_geodesic_distance_not_fitted(): +def test_ggd_not_fitted(): ggd = GraphGeodesicDistance() with pytest.raises(NotFittedError): ggd.transform(X_ggd) -def test_graph_geodesic_distance_transform(): +def test_ggd_fit_transform_plot(): + GraphGeodesicDistance().fit_transform_plot(X_ggd, sample=0) + + +def test_ggd_transform(): X_ggd_res = np.array([ [[0., 1., 3., 7., np.inf], [1., 0., 4., 8., np.inf], diff --git a/gtda/graphs/tests/test_kneighbors.py b/gtda/graphs/tests/test_kneighbors.py index d97eaf35e..bc7bd323e 100644 --- a/gtda/graphs/tests/test_kneighbors.py +++ b/gtda/graphs/tests/test_kneighbors.py @@ -1,4 +1,4 @@ -"""Testing for KNeighborsGraph""" +"""Testing for KNeighborsGraph.""" import numpy as np import pytest @@ -20,7 +20,7 @@ [0., 1., 1., 0.]]))]) -def test_kneighbors_graph_not_fitted(): +def test_kng_not_fitted(): kn_graph = KNeighborsGraph() with pytest.raises(NotFittedError): @@ -29,13 +29,13 @@ def test_kneighbors_graph_not_fitted(): @pytest.mark.parametrize(('n_neighbors', 'expected'), [(1, X_kng_res), (2, X_kng_res_k2)]) -def test_kneighbors_graph_transform(n_neighbors, expected): +def test_kng_transform(n_neighbors, expected): kn_graph = KNeighborsGraph(n_neighbors=n_neighbors) assert (kn_graph.fit_transform(X_kng)[0] != expected[0]).nnz == 0 -def test_parallel_kneighbors_graph_transform(): +def test_parallel_kng_transform(): kn_graph = KNeighborsGraph(n_jobs=1, n_neighbors=2) kn_graph_parallel = 
KNeighborsGraph(n_jobs=2, n_neighbors=2) diff --git a/gtda/graphs/tests/test_transition.py b/gtda/graphs/tests/test_transition.py index 66d426461..5f6cb3546 100644 --- a/gtda/graphs/tests/test_transition.py +++ b/gtda/graphs/tests/test_transition.py @@ -1,4 +1,4 @@ -"""Testing for TransitionGraph""" +"""Testing for TransitionGraph.""" import numpy as np import pytest diff --git a/gtda/homology/simplicial.py b/gtda/homology/simplicial.py index 1b6c0bc0c..624becc18 100644 --- a/gtda/homology/simplicial.py +++ b/gtda/homology/simplicial.py @@ -40,18 +40,18 @@ class VietorisRipsPersistence(BaseEstimator, TransformerMixin, PlotterMixin): ---------- metric : string or callable, optional, default: ``'euclidean'`` If set to ``'precomputed'``, input data is to be interpreted as a - collection of distance matrices. Otherwise, input data is to be - interpreted as a collection of point clouds (i.e. feature arrays), - and `metric` determines a rule with which to calculate distances - between pairs of instances (i.e. rows) in these arrays. - If `metric` is a string, it must be one of the options allowed by - :func:`scipy.spatial.distance.pdist` for its metric parameter, or a - metric listed in :obj:`sklearn.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`, - including "euclidean", "manhattan", or "cosine". - If `metric` is a callable function, it is called on each pair of - instances and the resulting value recorded. The callable should take - two arrays from the entry in `X` as input, and return a value - indicating the distance between them. + collection of distance matrices or of adjacency matrices of weighted + undirected graphs. Otherwise, input data is to be interpreted as a + collection of point clouds (i.e. feature arrays), and `metric` + determines a rule with which to calculate distances between pairs of + points (i.e. row vectors). If `metric` is a string, it must be one + of the options allowed by :func:`scipy.spatial.distance.pdist` for + its metric parameter, or a metric listed in + :obj:`sklearn.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`, including + ``'euclidean'``, ``'manhattan'`` or ``'cosine'``. If `metric` is a + callable, it should take pairs of vectors (1D arrays) as input and, for + each two vectors in a pair, it should return a scalar indicating the + distance/dissimilarity between them. homology_dimensions : list or tuple, optional, default: ``(0, 1)`` Dimensions (non-negative integers) of the topological features to be @@ -129,13 +129,12 @@ def __init__(self, metric='euclidean', homology_dimensions=(0, 1), self.n_jobs = n_jobs def _ripser_diagram(self, X): - Xdgms = ripser(X[X[:, 0] != np.inf], - maxdim=self._max_homology_dimension, + Xdgms = ripser(X, maxdim=self._max_homology_dimension, thresh=self.max_edge_length, coeff=self.coeff, metric=self.metric)['dgms'] if 0 in self._homology_dimensions: - Xdgms[0] = Xdgms[0][:-1, :] # Remove final death at np.inf + Xdgms[0] = Xdgms[0][:-1, :] # Remove one infinite bar # Add dimension as the third elements of each (b, d) tuple Xdgms = {dim: np.hstack([Xdgms[dim], @@ -153,14 +152,15 @@ def fit(self, X, y=None): Parameters ---------- X : ndarray or list - Input data representing a collection of point clouds or of distance - matrices. Can be either a 3D ndarray whose zeroth dimension has - size ``n_samples``, or a list containing ``n_samples`` 2D ndarrays. 
- If ``metric == 'precomputed'``, elements of `X` must be square - arrays representing distance matrices; otherwise, their rows are - interpreted as vectors in Euclidean space and, when `X` is a list, - warnings are issued when the number of columns (dimension of the - Euclidean space) differs among samples. + Input data representing a collection of point clouds if `metric` + was not set to ``'precomputed'``, and of distance matrices or + adjacency matrices of weighted undirected graphs otherwise. Can be + either a 3D ndarray whose zeroth dimension has size ``n_samples``, + or a list containing ``n_samples`` 2D ndarrays. If `metric` was + set to ``'precomputed'``, each entry of `X` must be a square + array and should be compatible with a filtration, i.e. the value + at index (i, j) should be no smaller than the values at diagonal + indices (i, i) and (j, j). y : None There is no need for a target in a transformer, yet the pipeline @@ -174,7 +174,7 @@ def fit(self, X, y=None): validate_params( self.get_params(), self._hyperparameters, exclude=['n_jobs']) self._is_precomputed = self.metric == 'precomputed' - check_point_clouds(X, distance_matrix=self._is_precomputed) + check_point_clouds(X, distance_matrices=self._is_precomputed) if self.infinity_values is None: self.infinity_values_ = self.max_edge_length @@ -200,14 +200,15 @@ def transform(self, X, y=None): Parameters ---------- X : ndarray or list - Input data representing a collection of point clouds or of distance - matrices. Can be either a 3D ndarray whose zeroth dimension has - size ``n_samples``, or a list containing ``n_samples`` 2D ndarrays. - If ``metric == 'precomputed'``, elements of `X` must be square - arrays representing distance matrices; otherwise, their rows are - interpreted as vectors in Euclidean space and, when `X` is a list, - warnings are issued when the number of columns (dimension of the - Euclidean space) differs among samples. + Input data representing a collection of point clouds if `metric` + was not set to ``'precomputed'``, and of distance matrices or + adjacency matrices of weighted undirected graphs otherwise. Can be + either a 3D ndarray whose zeroth dimension has size ``n_samples``, + or a list containing ``n_samples`` 2D ndarrays. If `metric` was + set to ``'precomputed'``, each entry of `X` must be a square + array and should be compatible with a filtration, i.e. the value + at index (i, j) should be no smaller than the values at diagonal + indices (i, i) and (j, j). 
y : None There is no need for a target in a transformer, yet the pipeline @@ -224,7 +225,7 @@ def transform(self, X, y=None): """ check_is_fitted(self) - X = check_point_clouds(X, distance_matrix=self._is_precomputed) + X = check_point_clouds(X, distance_matrices=self._is_precomputed) Xt = Parallel(n_jobs=self.n_jobs)( delayed(self._ripser_diagram)(x) for x in X) @@ -385,7 +386,7 @@ def _gudhi_diagram(self, X): for dim in self.homology_dimensions} if 0 in self._homology_dimensions: - Xdgms[0] = Xdgms[0][1:, :] # Remove final death at np.inf + Xdgms[0] = Xdgms[0][1:, :] # Remove one infinite bar # Add dimension as the third elements of each (b, d) tuple Xdgms = {dim: np.hstack([Xdgms[dim], @@ -424,7 +425,7 @@ def fit(self, X, y=None): validate_params( self.get_params(), self._hyperparameters, exclude=['n_jobs']) self._is_precomputed = self.metric == 'precomputed' - check_point_clouds(X, distance_matrix=self._is_precomputed) + check_point_clouds(X, distance_matrices=self._is_precomputed) if self.infinity_values is None: self.infinity_values_ = self.max_edge_length @@ -474,7 +475,7 @@ def transform(self, X, y=None): """ check_is_fitted(self) - X = check_point_clouds(X, distance_matrix=self._is_precomputed) + X = check_point_clouds(X, distance_matrices=self._is_precomputed) Xt = Parallel(n_jobs=self.n_jobs)( delayed(self._gudhi_diagram)(x) for x in X) @@ -607,7 +608,7 @@ def _gudhi_diagram(self, X): for dim in self.homology_dimensions} if 0 in self._homology_dimensions: - Xdgms[0] = Xdgms[0][1:, :] # Remove final death at np.inf + Xdgms[0] = Xdgms[0][1:, :] # Remove one infinite bar # Add dimension as the third elements of each (b, d) tuple Xdgms = {dim: np.hstack([Xdgms[dim], diff --git a/gtda/homology/tests/test_cubical.py b/gtda/homology/tests/test_cubical.py index 66d72d806..85b9fcc3d 100644 --- a/gtda/homology/tests/test_cubical.py +++ b/gtda/homology/tests/test_cubical.py @@ -2,12 +2,15 @@ # License: GNU AGPLv3 import numpy as np +import plotly.io as pio import pytest from numpy.testing import assert_almost_equal from sklearn.exceptions import NotFittedError from gtda.homology import CubicalPersistence +pio.renderers.default = 'plotly_mimetype' + X = np.array([[[2., 2.47942554], [2.47942554, 2.84147098], [2.98935825, 2.79848711], @@ -31,6 +34,12 @@ def test_cp_not_fitted(): cp.transform(X) +@pytest.mark.parametrize('hom_dims', [None, (0,), (1,), (0, 1)]) +def test_cp_fit_transform_plot(hom_dims): + CubicalPersistence().fit_transform_plot( + X, sample=0, homology_dimensions=hom_dims) + + @pytest.mark.parametrize("periodic_dimensions, expected", [(None, X_cp_res), (np.array([False, False]), X_cp_res), diff --git a/gtda/homology/tests/test_simplicial.py b/gtda/homology/tests/test_simplicial.py index 4af1633e1..e93472e7b 100644 --- a/gtda/homology/tests/test_simplicial.py +++ b/gtda/homology/tests/test_simplicial.py @@ -1,7 +1,8 @@ -"""Testing for persistent homology on grid.""" +"""Testing for simplicial persistent homology.""" # License: GNU AGPLv3 import numpy as np +import plotly.io as pio import pytest from numpy.testing import assert_almost_equal from sklearn.exceptions import NotFittedError @@ -9,7 +10,9 @@ from gtda.homology import VietorisRipsPersistence, SparseRipsPersistence, \ EuclideanCechPersistence -pc = np.array([[[2., 2.47942554], [2.47942554, 2.84147098], +pio.renderers.default = 'plotly_mimetype' + +X = np.array([[[2., 2.47942554], [2.47942554, 2.84147098], [2.98935825, 2.79848711], [2.79848711, 2.41211849], [2.41211849, 1.92484888]]]) @@ -19,17 +22,17 @@ def 
test_vrp_params(): vrp = VietorisRipsPersistence(metric=metric) with pytest.raises(ValueError): - vrp.fit_transform(pc) + vrp.fit_transform(X) def test_vrp_not_fitted(): vrp = VietorisRipsPersistence() with pytest.raises(NotFittedError): - vrp.transform(pc) + vrp.transform(X) -pc_vrp_res = np.array([[[0., 0.43094373, 0], [0., 0.5117411, 0], +X_vrp_res = np.array([[[0., 0.43094373, 0], [0., 0.5117411, 0], [0., 0.60077095, 0], [0., 0.62186205, 0], [0.69093919, 0.80131882, 1]]]) @@ -37,7 +40,20 @@ def test_vrp_not_fitted(): def test_vrp_transform(): vrp = VietorisRipsPersistence() - assert_almost_equal(vrp.fit_transform(pc), pc_vrp_res) + assert_almost_equal(vrp.fit_transform(X), X_vrp_res) + + +def test_vrp_list_of_arrays(): + X_2 = np.array([[0., 1.], [1., 2.]]) + X_list = [X[0].copy(), X_2] + vrp = VietorisRipsPersistence() + vrp.fit(X_list) + + +@pytest.mark.parametrize('hom_dims', [None, (0,), (1,), (0, 1)]) +def test_vrp_fit_transform_plot(hom_dims): + VietorisRipsPersistence().fit_transform_plot( + X, sample=0, homology_dimensions=hom_dims) def test_srp_params(): @@ -45,24 +61,24 @@ def test_srp_params(): vrp = SparseRipsPersistence(metric=metric) with pytest.raises(ValueError): - vrp.fit_transform(pc) + vrp.fit_transform(X) def test_srp_not_fitted(): srp = SparseRipsPersistence() with pytest.raises(NotFittedError): - srp.transform(pc) + srp.transform(X) -pc_srp_res_2 = np.array([[[0., 0.43094373, 0], [0., 0.5117411, 0], - [0., 0.60077095, 0], [0., 0.62186205, 0], - [0.69093919, 0.80131882, 1]]]) +X_srp_res_2 = np.array([[[0., 0.43094373, 0], [0., 0.5117411, 0], + [0., 0.60077095, 0], [0., 0.62186205, 0], + [0.69093919, 0.80131882, 1]]]) @pytest.mark.parametrize("epsilon, point_clouds, expected", - [(0.0, pc, pc_vrp_res), - (1.0, pc, pc_srp_res_2)]) + [(0.0, X, X_vrp_res), + (1.0, X, X_srp_res_2)]) def test_srp_transform(epsilon, point_clouds, expected): srp = SparseRipsPersistence(epsilon=epsilon) @@ -70,22 +86,28 @@ def test_srp_transform(epsilon, point_clouds, expected): np.sort(expected, axis=1)) +@pytest.mark.parametrize('hom_dims', [None, (0,), (1,), (0, 1)]) +def test_srp_fit_transform_plot(hom_dims): + SparseRipsPersistence().fit_transform_plot( + X, sample=0, homology_dimensions=hom_dims) + + def test_cp_params(): coeff = 'not_defined' cp = EuclideanCechPersistence(coeff=coeff) with pytest.raises(TypeError): - cp.fit_transform(pc) + cp.fit_transform(X) def test_cp_not_fitted(): cp = EuclideanCechPersistence() with pytest.raises(NotFittedError): - cp.transform(pc) + cp.transform(X) -pc_cp_res = np.array( +X_cp_res = np.array( [[[0., 0.31093103, 0.], [0., 0.30038548, 0.], [0., 0.25587055, 0.], [0., 0.21547186, 0.], [0.34546959, 0.41473758, 1.], [0.51976681, 0.55287585, 1.], @@ -96,11 +118,10 @@ def test_cp_not_fitted(): def test_cp_transform(): cp = EuclideanCechPersistence() - assert_almost_equal(cp.fit_transform(pc), pc_cp_res) + assert_almost_equal(cp.fit_transform(X), X_cp_res) -def test_vrp_list_of_arrays(): - pc_2 = np.array([[0, 1], [1, 2]]) - pc_list = [pc[0].copy(), pc_2] - vrp = VietorisRipsPersistence() - vrp.fit(pc_list) +@pytest.mark.parametrize('hom_dims', [None, (0,), (1,), (0, 1)]) +def test_cp_fit_transform_plot(hom_dims): + EuclideanCechPersistence().fit_transform_plot( + X, sample=0, homology_dimensions=hom_dims) diff --git a/gtda/images/tests/test_filtrations.py b/gtda/images/tests/test_filtrations.py index ce58a2421..022ebe020 100644 --- a/gtda/images/tests/test_filtrations.py +++ b/gtda/images/tests/test_filtrations.py @@ -2,12 +2,15 @@ # License: GNU 
AGPLv3 import numpy as np +import plotly.io as pio import pytest from numpy.testing import assert_almost_equal from sklearn.exceptions import NotFittedError from gtda.images import HeightFiltration, RadialFiltration, \ - DilationFiltration, ErosionFiltration, SignedDistanceFiltration + DilationFiltration, ErosionFiltration, SignedDistanceFiltration + +pio.renderers.default = 'plotly_mimetype' images_2D = np.stack([np.ones((3, 4)), np.concatenate([np.ones((3, 2)), np.zeros((3, 2))], @@ -78,6 +81,10 @@ def test_height_transform(direction, images, expected): expected) +def test_height_fit_transform_plot(): + HeightFiltration().fit_transform_plot(images_2D, sample=0) + + def test_radial_not_fitted(): radial = RadialFiltration() with pytest.raises(NotFittedError): @@ -130,6 +137,10 @@ def test_radial_transform(center, images, expected): expected) +def test_radial_fit_transform_plot(): + RadialFiltration().fit_transform_plot(images_2D, sample=0) + + def test_dilation_not_fitted(): dilation = DilationFiltration() with pytest.raises(NotFittedError): @@ -172,6 +183,10 @@ def test_dilation_transform(n_iterations, images, expected): expected) +def test_dilation_fit_transform_plot(): + DilationFiltration().fit_transform_plot(images_2D, sample=0) + + def test_erosion_not_fitted(): erosion = ErosionFiltration() with pytest.raises(NotFittedError): @@ -214,6 +229,10 @@ def test_erosion_transform(n_iterations, images, expected): expected) +def test_erosion_fit_transform_plot(): + ErosionFiltration().fit_transform_plot(images_2D, sample=0) + + def test_signed_not_fitted(): signed = SignedDistanceFiltration() with pytest.raises(NotFittedError): @@ -253,3 +272,7 @@ def test_signed_transform(n_iterations, images, expected): assert_almost_equal(signed.fit_transform(images), expected) + + +def test_signed_fit_transform_plot(): + SignedDistanceFiltration().fit_transform_plot(images_2D, sample=0) diff --git a/gtda/images/tests/test_preprocessing.py b/gtda/images/tests/test_preprocessing.py index 1f4ebe8d8..8fb9fb2eb 100644 --- a/gtda/images/tests/test_preprocessing.py +++ b/gtda/images/tests/test_preprocessing.py @@ -2,12 +2,15 @@ # License: GNU AGPLv3 import numpy as np +import plotly.io as pio import pytest from numpy.testing import assert_almost_equal, assert_equal from sklearn.exceptions import NotFittedError from gtda.images import Binarizer, Inverter, Padder, ImageToPointCloud +pio.renderers.default = 'plotly_mimetype' + images_2D = np.stack([ np.ones((7, 8)), np.concatenate([np.ones((7, 4)), np.zeros((7, 4))], axis=1), @@ -42,6 +45,10 @@ def test_binarizer_transform(threshold, expected): expected) +def test_binarizer_fit_transform_plot(): + Binarizer().fit_transform_plot(images_2D, sample=0) + + def test_inverter_not_fitted(): inverter = Inverter() with pytest.raises(NotFittedError): @@ -69,13 +76,17 @@ def test_inverter_transform(images, expected): expected) +def test_inverter_fit_transform_plot(): + Inverter().fit_transform_plot(images_2D, sample=0) + + def test_padder_not_fitted(): padder = Padder() with pytest.raises(NotFittedError): padder.transform(images_2D) -@pytest.mark.parametrize("images, paddings, ", +@pytest.mark.parametrize("images, paddings", [(images_2D, np.array([1, 1], dtype=np.int)), (images_2D, None), (images_3D, np.array([2, 2, 2], dtype=np.int))]) @@ -91,6 +102,10 @@ def test_padder_transform(images, paddings): expected_shape) +def test_padder_fit_transform_plot(): + Padder().fit_transform_plot(images_2D, sample=0) + + images_2D_small = np.stack([ np.ones((3, 2)), 
np.concatenate([np.ones((3, 1)), np.zeros((3, 1))], axis=1), @@ -127,6 +142,16 @@ def test_img2pc_not_fitted(): np.array([[]])]) +def compare_arrays_as_sets(a1, a2): + """ A helper function to compare two point_clouds. + They should have the same points, but not necessarily in the same order. + """ + def to_set_of_elements(a): + return set([tuple(p) for p in a]) + as1, as2 = [to_set_of_elements(a) for a in [a1, a2]] + return (as1 <= as2) and (as1 >= as2) + + @pytest.mark.parametrize("images, expected", [(images_2D_small, images_2D_img2pc), (images_3D_small, images_3D_img2pc)]) @@ -139,11 +164,6 @@ def test_img2pc_transform(images, expected): expected)) -def compare_arrays_as_sets(a1, a2): - """ A helper function to compare two point_clouds. - They should have the same points, but not necessarily in the same order. - """ - def to_set_of_elements(a): - return set([tuple(p) for p in a]) - as1, as2 = [to_set_of_elements(a) for a in [a1, a2]] - return (as1 <= as2) and (as1 >= as2) +@pytest.mark.parametrize("images", [images_2D, images_3D]) +def test_img2pc_fit_transform_plot(images): + ImageToPointCloud().fit_transform_plot(images, sample=0) diff --git a/gtda/point_clouds/tests/test_rescaling.py b/gtda/point_clouds/tests/test_rescaling.py index d026fcfab..93a47fcb8 100644 --- a/gtda/point_clouds/tests/test_rescaling.py +++ b/gtda/point_clouds/tests/test_rescaling.py @@ -1,43 +1,54 @@ -"""Testing for rescaling transfomers.""" +"""Testing for rescaling transformers.""" # License: GNU AGPLv3 import numpy as np +import plotly.io as pio import pytest from numpy.testing import assert_almost_equal from sklearn.exceptions import NotFittedError from gtda.point_clouds import ConsistentRescaling, ConsecutiveRescaling -Xr = np.array([[[0, 0], [1, 2], [5, 6]]]) +pio.renderers.default = 'plotly_mimetype' + +X = np.array([[[0, 0], [1, 2], [5, 6]]]) def test_consistent_not_fitted(): cr = ConsistentRescaling() with pytest.raises(NotFittedError): - cr.transform(Xr) + cr.transform(X) def test_consistent_transform(): cr = ConsistentRescaling() - Xres = np.array([[[0., 1., 2.19601308], - [1., 0., 1.59054146], - [2.19601308, 1.59054146, 0.]]]) + X_res = np.array([[[0., 1., 2.19601308], + [1., 0., 1.59054146], + [2.19601308, 1.59054146, 0.]]]) + + assert_almost_equal(cr.fit_transform(X), X_res) + - assert_almost_equal(cr.fit_transform(Xr), Xres) +def test_consistent_fit_transform_plot(): + ConsistentRescaling().fit_transform_plot(X, sample=0) def test_consecutive_not_fitted(): cr = ConsecutiveRescaling() with pytest.raises(NotFittedError): - cr.transform(Xr) + cr.transform(X) def test_consecutive_transform(): cr = ConsecutiveRescaling() - Xres = np.array([[[0., 0., 7.81024968], - [2.23606798, 0., 0.], - [7.81024968, 5.65685425, 0.]]]) + X_res = np.array([[[0., 0., 7.81024968], + [2.23606798, 0., 0.], + [7.81024968, 5.65685425, 0.]]]) + + assert_almost_equal(cr.fit_transform(X), X_res) + - assert_almost_equal(cr.fit_transform(Xr), Xres) +def test_consecutive_fit_transform_plot(): + ConsecutiveRescaling().fit_transform_plot(X, sample=0) diff --git a/gtda/time_series/tests/test_embedding.py b/gtda/time_series/tests/test_embedding.py index 9c3c7499d..7b9adf0ed 100644 --- a/gtda/time_series/tests/test_embedding.py +++ b/gtda/time_series/tests/test_embedding.py @@ -2,12 +2,14 @@ # License: GNU AGPLv3 import numpy as np +import plotly.io as pio import pytest from numpy.testing import assert_almost_equal from sklearn.exceptions import NotFittedError -from gtda.time_series import TakensEmbedding -from gtda.time_series import 
SlidingWindow +from gtda.time_series import SlidingWindow, TakensEmbedding + +pio.renderers.default = 'plotly_mimetype' signal = np.asarray([np.sin(x / 2) + 2 for x in range(0, 20)]) @@ -90,8 +92,8 @@ def test_window_params(): def test_window_transform(): windows = SlidingWindow(width=3, stride=2) - x_windows = windows.fit_transform(signal_embedded_search) - assert (x_windows.shape == (8, 4, 2)) + X_windows = windows.fit_transform(signal_embedded_search) + assert (X_windows.shape == (8, 4, 2)) def test_window_resample(): @@ -99,3 +101,9 @@ def test_window_resample(): windows.fit(y) y_resampled = windows.resample(y) assert_almost_equal(y_resampled, y[np.arange(3, 20, 2)]) + + +def test_window_plot(): + windows = SlidingWindow(width=3, stride=2) + X_windows = windows.fit_transform(signal_embedded_search) + windows.plot(X_windows, sample=0) diff --git a/gtda/utils/testing.py b/gtda/utils/testing.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/gtda/utils/tests/test_validation.py b/gtda/utils/tests/test_validation.py index dcf8becf3..378b2e59a 100644 --- a/gtda/utils/tests/test_validation.py +++ b/gtda/utils/tests/test_validation.py @@ -3,8 +3,10 @@ import numpy as np import pytest +from sklearn.exceptions import DataDimensionalityWarning -from gtda.utils.validation import check_diagrams, validate_params +from gtda.utils.validation import check_diagrams, validate_params, \ + check_point_clouds # Testing for validate_params @@ -54,3 +56,206 @@ def test_inputs_arrayStruc_V(): with pytest.raises(ValueError): check_diagrams(X) + + +# Testing check_point_clouds +# Create several kinds of inputs +class CreateInputs: + def __init__( + self, n_samples, n_1, n_2, n_samples_extra, n_1_extra, n_2_extra + ): + N = n_samples * n_1 * n_2 + n_1_rectang = n_1 + 1 + n_2_rectang = n_2 - 1 + N_rectang = n_samples * n_1_rectang * n_2_rectang + + self.X = np.arange(N, dtype=float).reshape(n_samples, n_1, n_2) + self.X_rectang = np.arange(N_rectang, dtype=float). \ + reshape(n_samples, n_1_rectang, n_2_rectang) + + self.X_list = [] + self.X_list_rectang = [] + for i in range(n_samples): + self.X_list.append(self.X[i].copy()) + self.X_list_rectang.append(self.X_rectang[i].copy()) + + # List example where not all 2D arrays have the same no. of rows + self.X_list_rectang_diff_rows = \ + self.X_list_rectang[:-1] + [self.X_list_rectang[-1][:-1, :]] + + # List example where not all 2D arrays have the same no. of columns + self.X_list_rectang_diff_cols = \ + self.X_list_rectang[:-1] + [self.X_list_rectang[-1][:, :-1]] + + N_extra = n_samples_extra * n_1_extra * n_2_extra + X_extra = np.arange(N_extra, dtype=float). 
\ + reshape(n_samples_extra, n_1_extra, n_2_extra) + X_list_extra = [] + for i in range(n_samples_extra): + X_list_extra.append(X_extra[i].copy()) + self.X_list_tot = self.X_list + X_list_extra + + def insert_inf(self): + # Replace first entries with np.inf + self.X[0, 0, 0] = np.inf + self.X_rectang[0, 0, 0] = np.inf + self.X_list[0][0, 0] = np.inf + self.X_list_rectang[0][0, 0] = np.inf + return self + + def insert_nan(self): + # Replace first entries with np.nan + self.X[0, 0, 0] = np.nan + self.X_rectang[0, 0, 0] = np.nan + self.X_list[0][0, 0] = np.nan + self.X_list_rectang[0][0, 0] = np.nan + return self + + +n_samples = 2 +n_1 = 5 +n_2 = 5 +n_samples_extra = 1 +n_1_extra = 6 +n_2_extra = 6 + + +def test_check_point_clouds_regular_finite(): + """Cases in which the input is finite and no warnings or errors should be + thrown by check_point_clouds.""" + + ex = CreateInputs( + n_samples, n_1, n_2, n_samples_extra, n_1_extra, n_2_extra) + check_point_clouds(ex.X_rectang) + check_point_clouds(ex.X_list_rectang) + check_point_clouds(ex.X_list_rectang_diff_rows) + check_point_clouds(ex.X, distance_matrices=True) + check_point_clouds(ex.X_list, distance_matrices=True) + check_point_clouds(ex.X_list_tot, distance_matrices=True) + + +def test_check_point_clouds_value_err_finite(): + """Cases in which the input is finite but we throw a ValueError.""" + + ex = CreateInputs( + n_samples, n_1, n_2, n_samples_extra, n_1_extra, n_2_extra) + + # Check that we error on 1d array input + with pytest.raises(ValueError): + check_point_clouds(np.asarray(ex.X_list_tot)) + + # Check that we error on 2d array input + with pytest.raises(ValueError): + check_point_clouds(ex.X[0]) + + # Check that we throw errors when arrays are not square and + # distance_matrices is True. + # 1) Array input + with pytest.raises(ValueError): + check_point_clouds(ex.X_rectang, distance_matrices=True) + # 2) List input + with pytest.raises(ValueError): + check_point_clouds(ex.X_list_rectang, distance_matrices=True) + + +def test_check_point_clouds_warn_finite(): + """Cases in which the input is finite but we throw warnings.""" + + ex = CreateInputs( + n_samples, n_1, n_2, n_samples_extra, n_1_extra, n_2_extra) + + # Check that we throw warnings when arrays are square and distance_matrices + # is False + # 1) Array input + with pytest.warns(DataDimensionalityWarning): + check_point_clouds(ex.X) + # 2) List input + with pytest.warns(DataDimensionalityWarning): + check_point_clouds(ex.X_list) + + # Check that we throw warnings on list input when arrays have different + # number of columns + with pytest.warns(DataDimensionalityWarning): + check_point_clouds(ex.X_list_rectang_diff_cols) + + +def test_check_point_clouds_regular_inf(): + """Cases in which part of the input is infinite and no warnings or errors + should be thrown by check_point_clouds.""" + + ex = CreateInputs( + n_samples, n_1, n_2, n_samples_extra, n_1_extra, n_2_extra).\ + insert_inf() + + check_point_clouds(ex.X, distance_matrices=True) + check_point_clouds(ex.X_list, distance_matrices=True) + check_point_clouds(ex.X_rectang, force_all_finite=False) + check_point_clouds(ex.X_list_rectang, force_all_finite=False) + + +def test_check_point_clouds_value_err_inf(): + """Cases in which part of the input is infinite and we throw a + ValueError.""" + + ex = CreateInputs( + n_samples, n_1, n_2, n_samples_extra, n_1_extra, n_2_extra).\ + insert_inf() + + # Check that, by default, np.inf is only accepted when distance_matrices + # is True. 
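The comment above states the intended default: with ``distance_matrices=True``, ``force_all_finite`` defaults to ``False`` and ``np.inf`` entries pass validation, while point-cloud input rejects them. A small sketch of that asymmetry, assuming the ``gtda.utils.validation`` layout introduced later in this patch::

    import numpy as np
    from gtda.utils.validation import check_point_clouds

    dm = np.zeros((1, 3, 3))
    dm[0, 0, 1] = dm[0, 1, 0] = np.inf  # a pair at infinite distance

    # Accepted: infinity is allowed by default for distance matrices.
    check_point_clouds(dm, distance_matrices=True)

    # Rejected: for point clouds, force_all_finite defaults to True.
    try:
        check_point_clouds(np.full((1, 4, 2), np.inf))
    except ValueError as err:
        print(err)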
+    # 1) Array input
+    with pytest.raises(ValueError):
+        check_point_clouds(ex.X_rectang)
+    # 2) List input
+    with pytest.raises(ValueError):
+        check_point_clouds(ex.X_list_rectang)
+
+    # Check that we error if we explicitly set force_all_finite to True
+    # 1) Array input
+    with pytest.raises(ValueError):
+        check_point_clouds(ex.X, distance_matrices=True, force_all_finite=True)
+    # 2) List input
+    with pytest.raises(ValueError):
+        check_point_clouds(
+            ex.X_list, distance_matrices=True, force_all_finite=True)
+
+
+def test_check_point_clouds_regular_nan():
+    """Cases in which part of the input is NaN and no warnings or errors
+    should be thrown by check_point_clouds."""
+
+    ex = CreateInputs(
+        n_samples, n_1, n_2, n_samples_extra, n_1_extra, n_2_extra).\
+        insert_nan()
+
+    check_point_clouds(ex.X, distance_matrices=True,
+                       force_all_finite='allow-nan')
+    check_point_clouds(
+        ex.X_list, distance_matrices=True, force_all_finite='allow-nan')
+    check_point_clouds(ex.X_rectang, force_all_finite='allow-nan')
+    check_point_clouds(ex.X_list_rectang, force_all_finite='allow-nan')
+
+
+@pytest.mark.parametrize("force_all_finite", [True, False])
+def test_check_point_clouds_value_err_nan(force_all_finite):
+    """Cases in which part of the input is NaN and we throw a
+    ValueError."""
+
+    ex = CreateInputs(
+        n_samples, n_1, n_2, n_samples_extra, n_1_extra, n_2_extra).\
+        insert_nan()
+
+    # Check that we error when force_all_finite is True or False
+    # 1) Array input
+    with pytest.raises(ValueError):
+        check_point_clouds(
+            ex.X, distance_matrices=True, force_all_finite=force_all_finite)
+    with pytest.raises(ValueError):
+        check_point_clouds(ex.X_rectang, force_all_finite=force_all_finite)
+    # 2) List input
+    with pytest.raises(ValueError):
+        check_point_clouds(ex.X_list, distance_matrices=True,
+                           force_all_finite=force_all_finite)
+    with pytest.raises(ValueError):
+        check_point_clouds(
+            ex.X_list_rectang, force_all_finite=force_all_finite)
diff --git a/gtda/utils/validation.py b/gtda/utils/validation.py
index a96ba460a..bc5df763e 100644
--- a/gtda/utils/validation.py
+++ b/gtda/utils/validation.py
@@ -6,7 +6,9 @@
 from warnings import warn

 import numpy as np
+
 from sklearn.utils.validation import check_array
+from sklearn.exceptions import DataDimensionalityWarning


 def check_diagrams(X, copy=False):
@@ -186,9 +188,23 @@ def validate_params(parameters, references, exclude=None):
     return _validate_params(parameters_, references)


-def check_point_clouds(X, distance_matrix=False, **kwargs):
-    """Input validation on an array or list representing a collection of point
-    clouds or distance matrices.
+def _check_array_mod(X, **kwargs):
+    """Modified version of :func:`~sklearn.utils.validation.check_array`. When
+    keyword parameter `force_all_finite` is set to False, NaNs are not
+    accepted but infinity is."""
+    if not kwargs['force_all_finite']:
+        Xnew = check_array(X, **kwargs)
+        if np.isnan(Xnew).any():
+            raise ValueError(
+                "Input contains NaN. Only finite values and infinity are "
+                "allowed when parameter `force_all_finite` is False.")
+        return Xnew
+    return check_array(X, **kwargs)
+
+
+def check_point_clouds(X, distance_matrices=False, **kwargs):
+    """Input validation on arrays or lists representing collections of point
+    clouds or of distance/adjacency matrices.
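``_check_array_mod`` above tightens scikit-learn's ``force_all_finite=False`` so that infinity is admitted but NaN never is; accepting NaNs requires ``'allow-nan'`` explicitly. An illustrative sketch of the three modes, mirroring the tests earlier in this patch::

    import numpy as np
    from gtda.utils.validation import check_point_clouds

    pc = np.zeros((1, 4, 2))
    pc[0, 0, 0] = np.nan

    check_point_clouds(pc, force_all_finite='allow-nan')  # NaN accepted

    # NaN rejected both when finiteness is enforced and when inf is allowed.
    for faf in (True, False):
        try:
            check_point_clouds(pc, force_all_finite=faf)
        except ValueError:
            print(f"force_all_finite={faf}: NaN rejected")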
The input is checked to be either a single 3D array using a single call to :func:`~sklearn.utils.validation.check_array`, or a list of 2D arrays by @@ -204,14 +220,22 @@ def check_point_clouds(X, distance_matrix=False, **kwargs): X : object Input object to check / convert. - distance_matrix : bool, optional, default: ``False`` + distance_matrices : bool, optional, default: ``False`` Whether the input represents a collection of distance matrices or of concrete point clouds in Euclidean space. In the first case, entries are allowed to be infinite unless otherwise specified in `kwargs`. kwargs Keyword arguments accepted by - :func:`~gtda.utils.validation.check_list_of_arrays`. + :func:`~sklearn.utils.validation.check_array`, with the following + caveats: 1) `ensure_2d` and `allow_nd` are ignored; 2) if not passed + explicitly, `force_all_finite` is set to be the boolean negation of + `distance_matrices`; 3) when `force_all_finite` is set to ``False``, + NaN inputs are not allowed; 4) `accept_sparse` and + `accept_large_sparse` are only meaningful in the case of lists of 2D + arrays, in which case they are passed to individual instances of + :func:`~sklearn.utils.validation.check_array` validating each entry + in the list. Returns ------- @@ -219,30 +243,67 @@ def check_point_clouds(X, distance_matrix=False, **kwargs): The converted and validated object. """ - kwargs_ = {'force_all_finite': not distance_matrix} + kwargs_ = {'force_all_finite': not distance_matrices} kwargs_.update(kwargs) - if hasattr(X, 'shape'): + kwargs_.pop('allow_nd', None) + kwargs_.pop('ensure_2d', None) + if hasattr(X, 'shape') and hasattr(X, 'ndim'): if X.ndim != 3: - raise ValueError("ndarray input must be 3D.") - return check_array(X, allow_nd=True, **kwargs_) + if X.ndim == 2: + extra_2D = \ + "\nReshape your input X using X.reshape(1, *X.shape) or " \ + "X[None, :, :] if X is a single point cloud/distance " \ + "matrix/adjacency matrix of a weighted graph." + else: + extra_2D = "" + raise ValueError( + f"Input must be a single 3D array or a list of 2D arrays. " + f"Array of dimension {X.ndim} passed." + extra_2D) + if (X.shape[1] != X.shape[2]) and distance_matrices: + raise ValueError( + f"Input array X must have X.shape[1] == X.shape[2]: " + f"{X.shape[1]} != {X.shape[2]} passed.") + elif (X.shape[1] == X.shape[2]) and not distance_matrices: + warn( + "Input array X has X.shape[1] == X.shape[2]. 
This is " + "consistent with a collection of distance/adjacency " + "matrices, but the input is being treated as a collection " + "of vectors in Euclidean space.", + DataDimensionalityWarning, stacklevel=2) + Xnew = _check_array_mod(X, **kwargs_, allow_nd=True) else: - if not distance_matrix: - reference = X[0].shape[1] # Embedding dimension of first sample - if not reduce( - and_, (x.shape[1] == reference for x in X[1:]), True): - warn("Not all point clouds have the same embedding dimension.") - - has_check_failed = False - messages = [] - Xnew = [] - for i, x in enumerate(X): - try: - Xnew.append(check_array(x, **kwargs_)) - messages = [''] - except ValueError as e: - has_check_failed = True - messages.append(str(e)) - if has_check_failed: - raise ValueError("The following errors were raised by the inputs: \n" - "\n".join(messages)) + has_check_failed = False + messages = [] + Xnew = [] + for i, x in enumerate(X): + try: + xnew = _check_array_mod(x, **kwargs_, ensure_2d=True) + if distance_matrices: + if not x.shape[0] == x.shape[1]: + raise ValueError( + f"All arrays must be square: {x.shape[0]} rows " + f"and {x.shape[1]} columns found in this array.") + Xnew.append(xnew) + except ValueError as e: + has_check_failed = True + messages.append(f"Entry {i}:\n{e}") + if has_check_failed: + raise ValueError( + "The following errors were raised by the inputs:\n\n" + + "\n\n".join(messages)) + + if not distance_matrices: + if reduce(and_, (x.shape[0] == x.shape[1] for x in X), True): + warn( + "All arrays are square. This is consistent with a " + "collection of distance/adjacency matrices, but the input " + "is being treated as a collection of vectors in Euclidean " + "space.", DataDimensionalityWarning, stacklevel=2) + + ref_dim = X[0].shape[1] # Embedding dimension of first sample + if not reduce(and_, (x.shape[1] == ref_dim for x in X[1:]), True): + warn( + "Not all point clouds have the same embedding dimension.", + DataDimensionalityWarning, stacklevel=2) + return Xnew diff --git a/setup.py b/setup.py index 7d5e09e4f..a90c4784c 100755 --- a/setup.py +++ b/setup.py @@ -29,8 +29,8 @@ MAINTAINER_EMAIL = 'maintainers@giotto.ai' URL = 'https://github.com/giotto-ai/giotto-tda' LICENSE = 'GNU AGPLv3' -DOWNLOAD_URL = 'https://github.com/giotto-ai/giotto-tda/tarball/v0.2.0' -VERSION = __version__ # noqa +DOWNLOAD_URL = 'https://github.com/giotto-ai/giotto-tda/tarball/v0.2.1' +VERSION = __version__ # noqa CLASSIFIERS = ['Intended Audience :: Science/Research', 'Intended Audience :: Developers', 'License :: OSI Approved',