diff --git a/.azure-ci/docker_scripts.sh b/.azure-ci/docker_scripts.sh index 4ad956407..73caa1524 100755 --- a/.azure-ci/docker_scripts.sh +++ b/.azure-ci/docker_scripts.sh @@ -39,7 +39,7 @@ cd /io pip install -e ".[dev]" # Test dev install with pytest -pytest gtda --cov --cov-report xml +pytest gtda --no-cov --no-coverage-upload # Uninstall giotto-tda/giotto-tda-nightly dev pip uninstall -y giotto-tda diff --git a/.azure-ci/install_boost.py b/.azure-ci/install_boost.py new file mode 100644 index 000000000..1f21a74c4 --- /dev/null +++ b/.azure-ci/install_boost.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python + +import os +from pathlib import Path +import urllib.request +import shutil +import zipfile + + +url = "https://dl.bintray.com/boostorg/release/1.72.0/source/boost_1_72_0.zip" +boost_folder = r"C:\local" + +Path(boost_folder).mkdir(parents=True, exist_ok=True) +zip_file = os.path.join(boost_folder, "1_72_0.zip") + +with urllib.request.urlopen(url) as response, \ + open(zip_file, 'wb') as out_file: + shutil.copyfileobj(response, out_file) + +with zipfile.ZipFile(zip_file, 'r') as zip_ref: + zip_ref.extractall(boost_folder) + +os.remove(zip_file) diff --git a/.coveragerc b/.coveragerc index e3bd56f66..af5cd07ce 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,14 +1,12 @@ [run] omit = */gtda/externals/* - **/setup.py - */gtda/compose/* - */gtda/datasets/* - */gtda/images/* - */gtda/neural_network/* - */gtda/model_selection/* + *tests* + **/gtda/utils/intervals.py + **/gtda/utils/_docs.py **/base.py **/pipeline.py - **/_version.py \ No newline at end of file + **/setup.py + **/_version.py diff --git a/CMakeLists.txt b/CMakeLists.txt index f611d750a..062b89a0b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/gtda/externals/pybind11) set(BINDINGS_DIR "gtda/externals/bindings") include(cmake/HelperBoost.cmake) -include_directories(${Boost_INCLUDE_DIR}) +include_directories(${Boost_INCLUDE_DIRS}) 
find_package(OpenMP) diff --git a/README.rst b/README.rst index e4ca2c40b..cea8acc71 100644 --- a/README.rst +++ b/README.rst @@ -120,12 +120,33 @@ source directory :: pytest gtda Important links ---------------- +=============== - Official source code repo: https://github.com/giotto-ai/giotto-tda - Download releases: https://pypi.org/project/giotto-tda/ - Issue tracker: https://github.com/giotto-ai/giotto-tda/issues + +Citing giotto-tda +================= + +If you use ``giotto-tda`` in a scientific publication, we would appreciate citations to the following paper: + + `giotto-tda: A Topological Data Analysis Toolkit for Machine Learning and Data Exploration `_, Tauzin *et al*, arXiv:2004.02551, 2020. + +You can use the following BibTeX entry: + +.. code:: RST + + @misc{tauzin2020giottotda, + title={giotto-tda: A Topological Data Analysis Toolkit for Machine Learning and Data Exploration}, + author={Guillaume Tauzin and Umberto Lupo and Lewis Tunstall and Julian Burella Pérez and Matteo Caorsi and Anibal Medina-Mardones and Alberto Dassatti and Kathryn Hess}, + year={2020}, + eprint={2004.02551}, + archivePrefix={arXiv}, + primaryClass={cs.LG} + } + Community ========= diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 33a984f4f..881f05b33 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -38,7 +38,7 @@ jobs: - task: Cache@2 inputs: - key: '"ccache-wheels-v2020.03.23" | $(Agent.OS) | "$(python.version)"' + key: '"ccache-wheels-v2020.04.07" | $(Agent.OS) | "$(python.version)"' path: $(CCACHE_DIR) displayName: ccache @@ -133,7 +133,7 @@ jobs: - task: Cache@2 inputs: - key: '"ccache-v2020.03.23" | $(Agent.OS) | "$(python.version)"' + key: '"ccache-v2020.04.07" | $(Agent.OS) | "$(python.version)"' path: $(CCACHE_DIR) displayName: ccache @@ -154,7 +154,7 @@ jobs: - script: | set -e pytest gtda --cov --cov-report xml - displayName: 'Test dev install with pytest' + displayName: 'Test dev install with pytest, upload coverage report' - script: | set 
-e @@ -246,13 +246,17 @@ jobs: condition: eq(variables['nightly_check'], 'true') displayName: 'Change name to giotto-tda-nightly' + - script: | + python .azure-ci/install_boost.py || exit /b + displayName: 'Install boost' + - script: | python -m pip install --upgrade pip setuptools python -m pip install -e ".[dev]" displayName: 'Install dev environment' - script: | - pytest gtda --cov --cov-report xml || exit /b + pytest gtda --no-cov --no-coverage-upload || exit /b displayName: 'Test dev install with pytest' - script: | diff --git a/doc/faq.rst b/doc/faq.rst index a27285501..93f183ac8 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -10,6 +10,25 @@ I am a researcher. Can I use ``giotto-tda`` in my project? Of course! The `license `_ is very permissive. For more information, please contact the `L2F team`_. +How do I cite ``giotto-tda``? +----------------------------- +We would appreciate citations to the following paper: + + `giotto-tda: A Topological Data Analysis Toolkit for Machine Learning and Data Exploration `_, Tauzin *et al*, arXiv:2004.02551, 2020. + +You can use the following BibTeX entry: + +.. code:: RST + + @misc{tauzin2020giottotda, + title={giotto-tda: A Topological Data Analysis Toolkit for Machine Learning and Data Exploration}, + author={Guillaume Tauzin and Umberto Lupo and Lewis Tunstall and Julian Burella Pérez and Matteo Caorsi and Anibal Medina-Mardones and Alberto Dassatti and Kathryn Hess}, + year={2020}, + eprint={2004.02551}, + archivePrefix={arXiv}, + primaryClass={cs.LG} + } + I cannot install ``giotto-tda`` ------------------------------- @@ -26,4 +45,4 @@ There are many TDA libraries available. How is ``giotto-tda`` different? ``giotto-tda`` is oriented towards machine learning (for details, see the :ref:`guiding principles `). This philosophy is in contrast with other reference libraries, like `GUDHI `_, which provide more low-level functionality at the expense of being less adapted to e.g. 
batch processing, or of -being tightly integrated with ``scikit-learn``. \ No newline at end of file +being tightly integrated with ``scikit-learn``. diff --git a/doc/library.rst b/doc/library.rst index 76bfc1be0..7c73509f4 100644 --- a/doc/library.rst +++ b/doc/library.rst @@ -114,5 +114,5 @@ What's new .. include:: release.rst - :start-after: Release 0.2.0 - :end-before: Release 0.1.4 + :start-after: Release 0.2.1 + :end-before: Release 0.2.0 diff --git a/doc/release.rst b/doc/release.rst index d9caf3bb8..43ebed123 100644 --- a/doc/release.rst +++ b/doc/release.rst @@ -5,6 +5,45 @@ Release Notes .. _stable: +************* +Release 0.2.1 +************* + +Major Features and Improvements +=============================== + +- The theory glossary has been improved to include the notions of vectorization, kernel and amplitude for persistence diagrams. +- The ``ripser`` function in ``gtda.externals.python.ripser_interface`` no longer uses scikit-learn's ``pairwise_distances`` when + ``metric`` is ``'precomputed'``, thus allowing square arrays with negative entries or infinities to be passed. +- ``check_point_clouds`` in ``gtda.utils.validation`` now checks for square array input when the input should be a collection of + distance-type matrices. Warnings guide the user to correctly setting the ``distance_matrices`` parameter. ``force_all_finite=False`` + no longer means accepting NaN input (only infinite input is accepted). +- ``VietorisRipsPersistence`` in ``gtda.homology.simplicial`` no longer masks out infinite entries in the input to be fed to + ``ripser``. +- The docstrings for ``check_point_clouds`` and ``VietorisRipsPersistence`` have been improved to reflect these changes and the + extra level of generality for ``ripser``. + +Bug Fixes +========= + +- The variable used to indicate the location of Boost headers has been renamed from ``Boost_INCLUDE_DIR`` to ``Boost_INCLUDE_DIRS`` + to address developer installation issues in some Linux systems. 
+ +Backwards-Incompatible Changes +============================== + +- The keyword parameter ``distance_matrix`` in ``check_point_clouds`` has been renamed to ``distance_matrices``. + +Thanks to our Contributors +========================== + +This release contains contributions from many people: + +Umberto Lupo, Anibal Medina-Mardones, Julian Burella Pérez, Guillaume Tauzin, and Wojciech Reise. + +We are also grateful to all who filed issues or helped resolve them, asked and answered questions, and were part of +inspiring discussions. + ************* Release 0.2.0 ************* diff --git a/doc/theory/glossary.tex b/doc/theory/glossary.tex index 7494377d0..0a9c8f518 100644 --- a/doc/theory/glossary.tex +++ b/doc/theory/glossary.tex @@ -15,14 +15,11 @@ linkcolor=blue, urlcolor=blue} - \begin{document} - + \title{Theory Glossary} \maketitle - \bibliography{bibliography} - \section{Symbols} \begin{tabular}{ l l} @@ -33,7 +30,7 @@ $\mathbb R^d$ & The vector space of $d$-tuples of real numbers. \\ $\Delta$ & The %\hyperref[multiset]{multiset} - multiset $ \lbrace (s, s) \mid s \in \mathbb{R} \rbrace $ with multiplicity $ ( s,s ) \mapsto +\infty$. + multiset $ \lbrace (s, s) \mid s \in \mathbb{R} \rbrace $ with multiplicity $ ( s,s ) \mapsto +\infty$. \end{tabular} \section{Homology} @@ -42,7 +39,7 @@ An \textit{elementary interval} $I_a$ is a subset of $\mathbb{R}$ of the form $[a, a+1]$ or $[a,a] = \{a\}$ for some $a \in \mathbb{R}$. These two types are called respectively \textit{non-degenerate} and \textit{degenerate}. To a non-degenerate elementary interval we assign two degenerate elementary intervals \begin{equation*} - d^+ I_a = [a+1, a+1] \qquad \text{and} \qquad d^- I_a = [a, a]. + d^+ I_a = \lbrack a+1, a+1 \rbrack \qquad \text{and} \qquad d^- I_a = \lbrack a, a \rbrack. 
\end{equation*} An \textit{elementary cube} is a subset of the form \begin{equation*} @@ -65,9 +62,9 @@ A set $\{v_0, \dots, v_n\} \subset \mathbb{R}^N$ is said to be \textit{geometrically independent} if the vectors $\{v_0-v_1, \dots, v_0-v_n\}$ are linearly independent. In this case, we refer to their convex closure as a \textit{simplex}, explicitly \begin{equation*} - \lbrack v_0, \ldots , v_n \rbrack = \left\{ \sum c_i (v_0 - v_i)\ \big|\ c_1+\dots+c_n = 1,\ c_i \geq 0 \right\} + \lbrack v_0, \dots , v_n \rbrack = \left\{ \sum c_i (v_0 - v_i)\ \big|\ c_1+\dots+c_n = 1,\ c_i \geq 0 \right\} \end{equation*} - and to $n$ as its \textit{dimension}. The $i$\textit{-th face} of $[v_0, \dots, v_n]$ is defined by + and to $n$ as its \textit{dimension}. The $i$\textit{-th face} of $\lbrack v_0, \dots, v_n \rbrack$ is defined by \begin{equation*} d_i[v_0, \ldots, v_n] = [v_0, \dots, \widehat{v}_i, \dots, v_n] \end{equation*} @@ -104,7 +101,7 @@ An \textit{ordered simplicial complex} is an % \hyperref[abstract_simplicial_complex]{abstract simplicial complex} - abstract simplicial complex where the set of vertices is equipped with a partial order such that the restriction of this partial order to any simplex is a total order. We denote an $n$-simplex using its ordered vertices by $[v_0, \dots, v_n]$. + abstract simplicial complex where the set of vertices is equipped with a partial order such that the restriction of this partial order to any simplex is a total order. We denote an $n$-simplex using its ordered vertices by $\lbrack v_0, \dots, v_n \rbrack$. A \textit{simplicial map} between ordered simplicial complexes is a simplicial map $f$ between their underlying simplicial complexes preserving the order, i.e., $v \leq w$ implies $f(v) \leq f(w)$. 
@@ -240,7 +237,7 @@ % \hyperref[filtered_complex]{filtered complex} filtered complex $VR_s(X)$ that contains a subset of $X$ as a simplex if all pairwise distances in the subset are less than or equal to $s$, explicitly \begin{equation*} - VR_s(X) = \Big\{ [v_0,\dots,v_n]\ \Big|\ \forall i,j\ \,d(v_i, v_j) \leq s \Big\}. + VR_s(X) = \Big\{ \lbrack v_0,\dots,v_n \rbrack \ \Big|\ \forall i,j\ \,d(v_i, v_j) \leq s \Big\}. \end{equation*} The \textit{Vietoris-Rips persistence} of $(X, d)$ is the % \hyperref[persistent_simplicial_(co)homology]{persistent simplicial (co)homology} @@ -263,11 +260,11 @@ % \hyperref[filtered_complex]{filtered complex} filtered complex $\check{C}_s(X)$ that is empty if $s<0$ and, if $s \geq 0$, contains a subset of $X$ as a simplex if the balls of radius $s$ with centers in the subset have a non-empty intersection, explicitly \begin{equation*} - \check{C}_s(X) = \Big\{ [v_0,\dots,v_n]\ \Big|\ \bigcap_{i=0}^n B_s(x_i) \neq \emptyset \Big\}. + \check{C}_s(X) = \Big\{ \lbrack v_0,\dots,v_n \rbrack \ \Big|\ \bigcap_{i=0}^n B_s(x_i) \neq \emptyset \Big\}. \end{equation*} The \textit{\v Cech persistence (co)homology} of $(X,d)$ is the - % \hyperref[persistent_simplicial_(co)homology]{persistent simplicial (co)homo-logy} - persistent simplicial (co)homo-logy of $\check{C}_s(X)$. + % \hyperref[persistent_simplicial_(co)homology]{persistent simplicial (co)homology} + persistent simplicial (co)homology of $\check{C}_s(X)$. \subsection*{Multiset} \label{multiset} @@ -307,76 +304,69 @@ \begin{equation*} \sup_{x \in D_1 \cup \Delta} ||x - \gamma(x)||_{\infty.} \end{equation*} + + The set of persistence diagrams together with any of the distances above is a + %\hyperref[metric_space]{metric space}. + metric space. 
\paragraph{\\ Reference:} \cite{kerber2017geometry} - \subsection*{Persistence landscape} \label{persistence_landscape} - - A \textit{persistence landscape} is a set $\{\lambda_k\}_{k \in \mathbb N}$ of functions - \begin{equation*} - \lambda : \mathbb R \to \overline{\mathbb R} - \end{equation*} - where $\lambda_k$ is referred to as the $k$\textit{-layer of the persistence landscape}. + \subsection*{Persistence landscape} \label{persistence_landscape} Let $\{(b_i, d_i)\}_{i \in I}$ be a - % \hyperref[persistence_diagram]{persistence diagram} - persistence diagram. Its \textit{associated persistence landscape} $\lambda$ is defined by letting $\lambda_k$ be the $k$-th largest value of the set $\{\Lambda_i(t)\}_ {i \in I}$ where - \begin{equation*} - \Lambda_i(t) = \left[ \min \{t-b_i, d_i-t\}\right]_+ - \end{equation*} - and $c_+ := \max(c,0)$. - - Intuitively, we can describe the set of graphs of a persistence landscape by first joining each of the points in the multiset to the diagonal via a horizontal as well as a vertical line, then clockwise rotating the figure 45 degrees and rescaling it by $1/\sqrt{2}$. - - \paragraph{\\ Reference:} \cite{bubenik2015statistical} - - \subsection*{Persistence landscape norm} \label{persistence_landscape_norm} - - Given a function $f : \mathbb R \to \overline{\mathbb R}$ define - \begin{equation*} - ||f||_p = \left( \int_{\mathbb R} f^p(x)\, dx \right)^{1/p} - \end{equation*} - whenever the right hand side exists and is finite. - - The $p$\textit{-norm} of a - % \hyperref[persistence_landscape]{persistence landscape} - persistence landscape $\lambda = \{\lambda_k\}_{k \in \mathbb N}$ is defined to be - - \begin{equation*} - ||\lambda||_p = \left( \sum_{i \in \mathbb N} ||\lambda_i||^p_p \right)^{1/p} - \end{equation*} - whenever the right hand side exists and is finite. + %\hyperref[persistence_diagram]{persistence diagram} + persistence diagram. 
Its \textit{persistence landscape} is the set $\{\lambda_k\}_{k \in \mathbb N}$ of functions + \begin{equation*} + \lambda_k : \mathbb R \to \overline{\mathbb R} + \end{equation*} + defined by letting $\lambda_k(t)$ be the $k$-th largest value of the set $\{\Lambda_i(t)\}_ {i \in I}$ where + \begin{equation*} + \Lambda_i(t) = \left[ \min \{t-b_i, d_i-t\}\right]_+ + \end{equation*} + and $c_+ := \max(c,0)$. The function $\lambda_k$ is referred to as the \textit{$k$-layer of the persistence landscape}. + + We describe the graph of each $\lambda_k$ intuitively. For each $i \in I$, draw an isosceles triangle with base the interval $(b_i, d_i)$ on the horizontal $t$-axis, and sides with slope 1 and $-1$. This subdivides the plane into a number of polygonal regions. Label each of these regions by the number of triangles containing it. If $P_k$ is the union of the polygonal regions with values at least $k$, then the graph of $\lambda_k$ is the upper contour of $P_k$, with $\lambda_k(a) = 0$ if the vertical line $t=a$ does not intersect $P_k$. + + The persistence landscape construction defines a + %\hyperref[vectorization_kernel_and_amplitude]{vectorization} + vectorization of the set of persistence diagrams with target the vector space of real-valued functions on $\mathbb N \times \mathbb R$. For any $p = 1,\dots,\infty$ we can restrict attention to persistence diagrams $D$ whose associated persistence landscape $\lambda$ is + %\hyperref[lp_norm]{$p$-integrable} + $p$-integrable, that is to say, + \begin{equation} \label{equation:persistence_landscape_norm} + ||\lambda||_p = \left( \sum_{i \in \mathbb N} ||\lambda_i||^p_p \right)^{1/p} + \end{equation} + where + \begin{equation*} + ||\lambda_i||_p = \left( \int_{\mathbb R} \lambda_i^p(x)\, dx \right)^{1/p} + \end{equation*} + is finite. In this case we refer to \eqref{equation:persistence_landscape_norm} as the + %\hyperref[vectorization_kernel_and_amplitude]{amplitude} + \textit{landscape} $p$-\textit{amplitude} of $D$. 
- \paragraph{\\ References:} \cite{stein2011functional, bubenik2015statistical} + \paragraph{\\ References:} \cite{bubenik2015statistical} \subsection*{Weighted silhouette} \label{weighted_silhouette} - Let $D = {(b_i, d_i)}_{i \in I}$ be a - % \hyperref[persistence_diagram] {persistence diagram} - persistence diagram. A \textit{weighted silhouette} associated to $D$ is a continuous function $\phi : \mathbb R \to \mathbb R$ of the form + Let $D = \{(b_i, d_i)\}_{i \in I}$ be a + %\hyperref[persistence_diagram]{persistence diagram} + persistence diagram and $w = \{w_i\}_{i \in I}$ a set of positive real numbers. The \textit{silhouette of $D$ weighted by $w$} is the function $\phi : \mathbb R \to \mathbb R$ defined by + \begin{equation*} + \phi(t) = \frac{\sum_{i \in I}w_i \Lambda_i(t)}{\sum_{i \in I}w_i}, + \end{equation*} + where \begin{equation*} - \phi(t) = \frac{\sum_{i \in I}w_i \Lambda_i(t)}{\sum_{i \in I}w_i}, - \end{equation*} - where $\{w_i\}_{i \in I}$ is a set of positive real numbers and - \begin{equation*} \label{equation:lambda_for_persistence_landscapes} - \Lambda_i(t) = \left[ \min \{t-b_i, d_i-t\}\right]_+ - \end{equation*} - with $c_+ := \max(c,0)$. The particular choice $w_i = \vert d_i - b_i \vert^p$ for $0 < p \leq \infty$ is referred to as \textit{power-weighted silhouettes}. + \Lambda_i(t) = \left[ \min \{t-b_i, d_i-t\}\right]_+ + \end{equation*} + and $c_+ := \max(c,0)$. When $w_i = \vert d_i - b_i \vert^p$ for $0 < p \leq \infty$ we refer to $\phi$ as the \textit{$p$-power-weighted silhouette} of $D$. The silhouette construction defines a + %\hyperref[vectorization_kernel_and_amplitude]{vectorization} + vectorization of the set of persistence diagrams with target the vector space of continuous real-valued functions on $\mathbb R$. 
\paragraph{\\ References:} \cite{chazal2014stochastic} - \subsection*{Amplitude} - \label{amplitude} - - Given a function assigning a real number to a pair of persistence diagrams, we define the \textit{amplitude} of a persistence diagram $D$ to be the value assigned to the pair $(D \cup \Delta, \Delta)$. Important examples of such functions are: %\hyperref[wasserstein_and_bottleneck_distance]{Wasserstein and bottleneck distances} - Wasserstein and bottleneck distances and - % \hyperref[persistence_landscape_norm]{landscape distance} - landscape distance. - \subsection*{Persistence entropy} \label{persistence_entropy} Intuitively, this is a measure of the entropy of the points in a - % \hyperref[persistence_diagram]{persistence diagram} + % \hyperref[persistence_diagram]{persistence diagram} persistence diagram. Precisely, let $D = \{(b_i, d_i)\}_{i \in I}$ be a persistence diagram with each $d_i < +\infty$. The \textit{persistence entropy} of $D$ is defined by \begin{equation*} E(D) = - \sum_{i \in I} p_i \log(p_i) @@ -396,54 +386,66 @@ The name is inspired from the case when the persistence diagram comes from persistent homology. - \subsection*{Distances, inner products and kernels} \label{metric_inner_product_and_kernel} + \subsection*{Metric space} \label{metric_space} + A set $X$ with a function + \begin{equation*} + d : X \times X \to \mathbb R + \end{equation*} + is said to be a \textit{metric space} if the values of $d$ are all non-negative and for all $x,y,z \in X$ + \begin{equation*} + d(x,y) = 0\ \Leftrightarrow\ x = y + \end{equation*} + \begin{equation*} + d(x,y) = d(y,x) + \end{equation*} + \begin{equation*} + d(x,z) \leq d(x,y) + d(y, z). + \end{equation*} + In this case $d$ is referred to as the \textit{metric} or the \textit{distance function}. 
- A set $X$ with a function - \begin{equation*} - d : X \times X \to \mathbb R - \end{equation*} - is called a \textit{metric space} if the values of $d$ are all non-negative and for all $x,y,z \in X$ - \begin{equation*} - d(x,y) = 0\ \Leftrightarrow\ x = y - \end{equation*} - \begin{equation*} - d(x,y) = d(y,x) - \end{equation*} - \begin{equation*} - d(x,z) \leq d(x,y) + d(y, z). - \end{equation*} - In this case the $d$ is referred to as the \textit{metric} or the \textit{distance function}. + \subsection*{Inner product and norm} \label{inner_product_and_norm} A vector space $V$ together with a function - \begin{equation*} - \langle -, - \rangle : V \times V \to \mathbb R - \end{equation*} - is called and \textit{inner product space} if for all $u,v,w \in V$ - \begin{equation*} - u \neq 0\ \Rightarrow\ \langle u, u \rangle > 0 + \begin{equation*} + \langle -, - \rangle : V \times V \to \mathbb R \end{equation*} - \begin{equation*} - \langle u, v\rangle = \langle v, u\rangle - \end{equation*} - \begin{equation*} - \langle au+v, w \rangle = a\langle u, w \rangle + \langle v, w \rangle. - \end{equation*} - In this case the function $\langle -, - \rangle$ is referred to as the \textit{inner product} and the function given by - \begin{equation*} - ||u|| = \sqrt{\langle u, u \rangle} + is said to be an \textit{inner product space} if for all $u,v,w \in V$ and $a \in \mathbb R$ + \begin{equation*} + u \neq 0\ \Rightarrow\ \langle u, u \rangle > 0 + \end{equation*} + \begin{equation*} + \langle u, v\rangle = \langle v, u\rangle \end{equation*} - as its associated \textit{norm}. An inner product space is naturally a metric space with distance function \begin{equation*} - d(u,v) = ||u-v||. - \end{equation*} + \langle au+v, w \rangle = a\langle u, w \rangle + \langle v, w \rangle. + \end{equation*} + The function $\langle -, - \rangle$ is referred to as the \textit{inner product}. 
- A \textit{kernel} on a set $X$ is a function - \begin{equation*} - k : X \times X - \end{equation*} - for which there exists a function $\phi : X \to V$ to an inner product space such that - \begin{equation*} - k(x, y) = \langle \phi(x), \phi(y) \rangle. + A vector space $V$ together with a function + \begin{equation*} + ||-|| : V \to \mathbb R + \end{equation*} + is said to be a \textit{normed space} if the values of $||-||$ are all non-negative and for all $u,v \in V$ and $a \in \mathbb R$ + \begin{equation*} + ||u|| = 0\ \Leftrightarrow\ u = 0 + \end{equation*} + \begin{equation*} + ||a u || = |a|\, ||u|| + \end{equation*} + \begin{equation*} + ||u+v|| \leq ||u|| + ||v||. + \end{equation*} + The function $||-||$ is referred to as the \textit{norm}. + + An inner product space is naturally a normed space with + \begin{equation*} + ||u|| = \sqrt{\langle u, u \rangle} + \end{equation*} + and a normed space is naturally a + %\hyperref[metric_space]{metric space} + metric space with distance function + \begin{equation*} + d(u,v) = ||u-v||. \end{equation*} \subsection*{Euclidean distance and norm} \label{euclidean_distance_and_norm} @@ -454,7 +456,37 @@ \begin{equation*} \langle x, y \rangle = (x_1-y_1)^2 + \cdots + (x_n-y_n)^2. \end{equation*} - The associated norm and distance function are referred to as \textit{Euclidean norm} and \textit{Euclidean distance}. + This inner product is referred to as the \textit{dot product} and the associated norm and distance function are respectively named \textit{Euclidean norm} and \textit{Euclidean distance}. + + \subsection*{Vectorization, kernel and amplitude} \label{vectorization_kernel_and_amplitude} + + Let $X$ be a set, for example, the set of all + %\hyperref[persistence_diagram]{persistence diagrams} + persistence diagrams. A \textit{vectorization} for $X$ is a function + \begin{equation*} + \phi : X \to V + \end{equation*} + where $V$ is a vector space. 
A \textit{kernel} on the set $X$ is a function + \begin{equation*} + k : X \times X \to \mathbb R + \end{equation*} + for which there exists a vectorization $\phi : X \to V$ with $V$ an + %\hyperref[inner_product_and_norm]{inner product space} + inner product space such that + \begin{equation*} + k(x,y) = \langle \phi(x), \phi(y) \rangle + \end{equation*} + for each $x,y \in X$. Similarly, an \textit{amplitude} on $X$ is a function + \begin{equation*} + A : X \to \mathbb R + \end{equation*} + for which there exists a vectorization $\phi : X \to V$ with $V$ a + %\hyperref[inner_product_and_norm]{normed space} + normed space such that + \begin{equation*} + A(x) = ||\phi(x)|| + \end{equation*} + for all $x \in X$. \subsection*{Finite metric spaces and point clouds} \label{finite_metric_spaces_and_point_clouds} @@ -530,12 +562,10 @@ \paragraph{\\ References:} \cite{milnor1997topology,guillemin2010differential} \subsection*{Compact subset} \label{compact_subset} - A subset $K$ of a metric space $(X,d)$ is said to be \textit{bounded} if there exist a real number $D$ such that for each pair of elements in $K$ the distance between them is less than $D$. It is said to be \textit{complete} if for any $x \in X$ it is the case that $x \in K$ if for any $\epsilon > 0$ the intersection between $K$ and $\{y \,;\ d(x,y) < \epsilon \}$ is not empty. It is said to be \textit{compact} if it is both bounded and complete. 
- + \section{Bibliography} \bibliography{bibliography}{} \bibliographystyle{alpha} - -\end{document} +\end{document} \ No newline at end of file diff --git a/examples/voids_on_the_plane.ipynb b/examples/voids_on_the_plane.ipynb index 52ee302db..19e7c24e1 100644 --- a/examples/voids_on_the_plane.ipynb +++ b/examples/voids_on_the_plane.ipynb @@ -8,7 +8,7 @@ "\n", "The classic example of a two-dimensional homology class is the \"void\" surrounded by a sphere in three-dimensional space.\n", "Challenge question: **Can two-dimensional topological voids arise from point clouds in two-dimensional space?**\n", - "We will answer this question programmatically by computing Vietoris–Rips persistence homology of random point clouds in the square $[0, 1] \\times [0, 1] \\subset \\mathbb{R}^2$.\n", + "We will answer this question programmatically by computing Vietoris–Rips persistent homology of random point clouds in the square $[0, 1] \\times [0, 1] \\subset \\mathbb{R}^2$.\n", "\n", "If you are looking at a static version of this notebook and would like to run its contents, head over to [github](https://github.com/giotto-ai/giotto-tda/blob/master/examples/voids_on_the_plane.ipynb).\n", "\n", diff --git a/gtda/_version.py b/gtda/_version.py index 9d0c2f8e6..7ffe6f6dd 100644 --- a/gtda/_version.py +++ b/gtda/_version.py @@ -19,4 +19,4 @@ # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = '0.2.0' +__version__ = '0.2.1' diff --git a/gtda/diagrams/distance.py b/gtda/diagrams/distance.py index 32c45fd6f..ae37f50b7 100644 --- a/gtda/diagrams/distance.py +++ b/gtda/diagrams/distance.py @@ -102,9 +102,8 @@ class PairwiseDistance(BaseEstimator, TransformerMixin): See also -------- - Amplitude, Scaler, Filtering, \ - BettiCurve, PersistenceLandscape, \ - HeatKernel, Silhouette, \ + Amplitude, Scaler, Filtering, BettiCurve, PersistenceLandscape, \ + PersistenceImage, HeatKernel, Silhouette, \ gtda.homology.VietorisRipsPersistence Notes diff --git 
a/gtda/diagrams/representations.py b/gtda/diagrams/representations.py index bda0386fa..05f8fdf36 100644 --- a/gtda/diagrams/representations.py +++ b/gtda/diagrams/representations.py @@ -179,51 +179,51 @@ def plot(self, Xt, sample=0, homology_dimensions=None): for dim in homology_dimensions: if dim not in self.homology_dimensions_: raise ValueError( - f'All homology dimensions must be in ' - f'self.homology_dimensions_ which is ' - f'{self.homology_dimensions_}. {dim} is not.') + f"All homology dimensions must be in " + f"self.homology_dimensions_ which is " + f"{self.homology_dimensions_}. {dim} is not.") else: homology_dimensions_arr = np.array( self.homology_dimensions_) ix = np.flatnonzero(homology_dimensions_arr == dim)[0] _homology_dimensions.append((ix, dim)) - layout = { - "xaxis1": { - "title": "Filtration parameter", - "side": "bottom", - "type": "linear", - "ticks": "outside", - "anchor": "x1", - "showline": True, - "zeroline": True, - "showexponent": "all", - "exponentformat": "e" - }, - "yaxis1": { - "title": "Betti number", - "side": "left", - "type": "linear", - "ticks": "outside", - "anchor": "y1", - "showline": True, - "zeroline": True, - "showexponent": "all", - "exponentformat": "e" - }, - "plot_bgcolor": "white" - } + layout = dict( + xaxis1=dict( + title="Filtration parameter", + side="bottom", + type="linear", + ticks="outside", + anchor="x1", + showline=True, + zeroline=True, + showexponent="all", + exponentformat="e" + ), + yaxis1=dict( + title="Betti number", + side="left", + type="linear", + ticks="outside", + anchor="y1", + showline=True, + zeroline=True, + showexponent="all", + exponentformat="e" + ), + plot_bgcolor="white" + ) fig = gobj.Figure(layout=layout) - fig.update_xaxes(zeroline=True, linewidth=1, linecolor='black', + fig.update_xaxes(zeroline=True, linewidth=1, linecolor="black", mirror=False) - fig.update_yaxes(zeroline=True, linewidth=1, linecolor='black', + fig.update_yaxes(zeroline=True, linewidth=1, linecolor="black", 
mirror=False) for ix, dim in _homology_dimensions: fig.add_trace(gobj.Scatter(x=self.samplings_[dim], y=Xt[sample][ix], mode='lines', showlegend=True, - name=f'H{int(dim)}')) + name=f"H{int(dim)}")) fig.show() @@ -395,38 +395,38 @@ def plot(self, Xt, sample=0, homology_dimensions=None): for dim in homology_dimensions: if dim not in self.homology_dimensions_: raise ValueError( - f'All homology dimensions must be in ' - f'self.homology_dimensions_ which is ' - f'{self.homology_dimensions_}. {dim} is not.') + f"All homology dimensions must be in " + f"self.homology_dimensions_ which is " + f"{self.homology_dimensions_}. {dim} is not.") else: homology_dimensions_arr = np.array( self.homology_dimensions_) ix = np.flatnonzero(homology_dimensions_arr == dim)[0] _homology_dimensions.append((ix, dim)) - layout = { - "xaxis1": { - "side": "bottom", - "type": "linear", - "ticks": "outside", - "anchor": "y1", - "showline": True, - "zeroline": True, - "showexponent": "all", - "exponentformat": "e" - }, - "yaxis1": { - "side": "left", - "type": "linear", - "ticks": "outside", - "anchor": "x1", - "showline": True, - "zeroline": True, - "showexponent": "all", - "exponentformat": "e" - }, - "plot_bgcolor": "white" - } + layout = dict( + xaxis1=dict( + side="bottom", + type="linear", + ticks="outside", + anchor="y1", + showline=True, + zeroline=True, + showexponent="all", + exponentformat="e" + ), + yaxis1=dict( + side="left", + type="linear", + ticks="outside", + anchor="x1", + showline=True, + zeroline=True, + showexponent="all", + exponentformat="e" + ), + plot_bgcolor="white" + ) Xt_sample = Xt[sample] for ix, dim in _homology_dimensions: @@ -434,9 +434,9 @@ def plot(self, Xt, sample=0, homology_dimensions=None): layout_dim['title'] = "Persistence landscape for homology " + \ "dimension {}".format(int(dim)) fig = gobj.Figure(layout=layout_dim) - fig.update_xaxes(zeroline=True, linewidth=1, linecolor='black', + fig.update_xaxes(zeroline=True, linewidth=1, linecolor="black", 
mirror=False) - fig.update_yaxes(zeroline=True, linewidth=1, linecolor='black', + fig.update_yaxes(zeroline=True, linewidth=1, linecolor="black", mirror=False) n_layers = Xt_sample.shape[1] @@ -598,8 +598,7 @@ def transform(self, X, y=None): transpose((1, 0, 2, 3)) return Xt - def plot(self, Xt, sample=0, homology_dimension_ix=0, - colorscale='blues'): + def plot(self, Xt, sample=0, homology_dimension_ix=0, colorscale='blues'): """Plot a single channel – corresponding to a given homology dimension – in a sample from a collection of heat kernel images. @@ -1014,50 +1013,50 @@ def plot(self, Xt, sample=0, homology_dimensions=None): for dim in homology_dimensions: if dim not in self.homology_dimensions_: raise ValueError( - f'All homology dimensions must be in ' - f'self.homology_dimensions_ which is ' - f'{self.homology_dimensions_}. {dim} is not.') + f"All homology dimensions must be in " + f"self.homology_dimensions_ which is " + f"{self.homology_dimensions_}. {dim} is not.") else: homology_dimensions_arr = np.array( self.homology_dimensions_) ix = np.flatnonzero(homology_dimensions_arr == dim)[0] _homology_dimensions.append((ix, dim)) - layout = { - "xaxis1": { - "title": "Filtration parameter", - "side": "bottom", - "type": "linear", - "ticks": "outside", - "anchor": "x1", - "showline": True, - "zeroline": True, - "showexponent": "all", - "exponentformat": "e" - }, - "yaxis1": { - "side": "left", - "type": "linear", - "ticks": "outside", - "anchor": "y1", - "showline": True, - "zeroline": True, - "showexponent": "all", - "exponentformat": "e" - }, - "plot_bgcolor": "white" - } + layout = dict( + xaxis1=dict( + title="Filtration parameter", + side="bottom", + type="linear", + ticks="outside", + anchor="x1", + showline=True, + zeroline=True, + showexponent="all", + exponentformat="e" + ), + yaxis1=dict( + side="left", + type="linear", + ticks="outside", + anchor="y1", + showline=True, + zeroline=True, + showexponent="all", + exponentformat="e" + ), + 
plot_bgcolor="white" + ) fig = gobj.Figure(layout=layout) - fig.update_xaxes(zeroline=True, linewidth=1, linecolor='black', + fig.update_xaxes(zeroline=True, linewidth=1, linecolor="black", mirror=False) - fig.update_yaxes(zeroline=True, linewidth=1, linecolor='black', + fig.update_yaxes(zeroline=True, linewidth=1, linecolor="black", mirror=False) for ix, dim in _homology_dimensions: fig.add_trace(gobj.Scatter(x=self.samplings_[dim], y=Xt[sample][ix], - mode='lines', showlegend=True, - hoverinfo='none', - name=f'H{int(dim)}')) + mode="lines", showlegend=True, + hoverinfo="none", + name=f"H{int(dim)}")) fig.show() diff --git a/gtda/diagrams/tests/test_features.py b/gtda/diagrams/tests/test_features_representations.py similarity index 70% rename from gtda/diagrams/tests/test_features.py rename to gtda/diagrams/tests/test_features_representations.py index b0ab05518..0c036d684 100644 --- a/gtda/diagrams/tests/test_features.py +++ b/gtda/diagrams/tests/test_features_representations.py @@ -1,7 +1,8 @@ -"""Testing for features""" +"""Testing for features and vector representations.""" # License: GNU AGPLv3 import numpy as np +import plotly.io as pio import pytest from hypothesis import given from hypothesis.extra.numpy import arrays, array_shapes @@ -9,35 +10,76 @@ from numpy.testing import assert_almost_equal from sklearn.exceptions import NotFittedError -from gtda.diagrams import PersistenceEntropy, HeatKernel, \ - PersistenceImage, Silhouette +from gtda.diagrams import PersistenceEntropy, BettiCurve, \ + PersistenceLandscape, HeatKernel, PersistenceImage, Silhouette -diagram = np.array([[[0, 1, 0], [2, 3, 0], [4, 6, 1], [2, 6, 1]]]) +pio.renderers.default = 'plotly_mimetype' +X = np.array([[[0., 1., 0.], [2., 3., 0.], [4., 6., 1.], [2., 6., 1.]]]) -def test_pe_not_fitted(): - pe = PersistenceEntropy() + +def test_not_fitted(): + with pytest.raises(NotFittedError): + PersistenceEntropy().transform(X) + + with pytest.raises(NotFittedError): + BettiCurve().transform(X) + 
+ with pytest.raises(NotFittedError): + PersistenceLandscape().transform(X) + + with pytest.raises(NotFittedError): + HeatKernel().transform(X) + + with pytest.raises(NotFittedError): + PersistenceImage().transform(X) with pytest.raises(NotFittedError): - pe.transform(diagram) + Silhouette().transform(X) + + +@pytest.mark.parametrize('hom_dim_ix', [0, 1]) +def test_fit_transform_plot_one_hom_dim(hom_dim_ix): + HeatKernel().fit_transform_plot( + X, sample=0, homology_dimension_ix=hom_dim_ix) + PersistenceImage().fit_transform_plot( + X, sample=0, homology_dimension_ix=hom_dim_ix) + + +@pytest.mark.parametrize('hom_dims', [None, (0,), (1,), (0, 1)]) +def test_fit_transform_plot_many_hom_dims(hom_dims): + BettiCurve().fit_transform_plot( + X, sample=0, homology_dimensions=hom_dims) + PersistenceLandscape().fit_transform_plot( + X, sample=0, homology_dimensions=hom_dims) + Silhouette().fit_transform_plot( + X, sample=0, homology_dimensions=hom_dims) def test_pe_transform(): pe = PersistenceEntropy() diagram_res = np.array([[0.69314718, 0.63651417]]) - assert_almost_equal(pe.fit_transform(diagram), diagram_res) + assert_almost_equal(pe.fit_transform(X), diagram_res) -def test_pi_not_fitted(): - pi = PersistenceImage(sigma=1) - with pytest.raises(NotFittedError): - pi.transform(diagram) +@pytest.mark.parametrize('n_bins', range(10, 51, 10)) +def test_bc_transform_shape(n_bins): + bc = BettiCurve(n_bins=n_bins) + X_res = bc.fit_transform(X) + assert X_res.shape == (1, bc._n_dimensions, n_bins) + + +@pytest.mark.parametrize('n_bins', range(10, 51, 10)) +@pytest.mark.parametrize('n_layers', range(1, 10)) +def test_pl_transform_shape(n_bins, n_layers): + pl = PersistenceLandscape(n_bins=n_bins, n_layers=n_layers) + X_res = pl.fit_transform(X) + assert X_res.shape == (1, pl._n_dimensions, n_layers, n_bins) @given(X=arrays(dtype=np.float, unique=True, - elements=integers(min_value=-1e10, - max_value=1e6), + elements=integers(min_value=-1e10, max_value=1e6), 
shape=array_shapes(min_dims=1, max_dims=1, min_side=11))) def test_pi_null(X): """Test that, if one trivial diagram (all pts on the diagonal) is provided, @@ -74,7 +116,7 @@ def test_silhouette_transform(): 0.05, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0., 0.05, 0.1, 0.15, 0.2, 0.25, 0.2, 0.15, 0.1, 0.05, 0.]) - assert_almost_equal(sht.fit_transform(diagram)[0][0], X_sht_res) + assert_almost_equal(sht.fit_transform(X)[0][0], X_sht_res) def test_silhouette_big_order(): @@ -111,7 +153,7 @@ def test_silhouette_big_order(): def _validate_distinct(X): - """Check if, in X, there is any persistence diagram for which all births + """Check if, in X, there is any persistence diagram for which all births and deaths are equal.""" unique_values = [np.unique(x[:, 0:2]) for x in X] if np.any([len(u) < 2 for u in unique_values]): @@ -164,13 +206,13 @@ def test_hk_positive(pts, dims): @given(pts_gen, dims_gen) def test_hk_big_sigma(pts, dims): - """ We expect that with a huge sigma, the diagrams are so diluted that + """We expect that with a huge sigma, the diagrams are so diluted that they are almost 0. 
Effectively, verifies that the smoothing is applied.""" n_bins = 10 x = get_input(pts, dims) hk = HeatKernel(sigma=100*np.max(np.abs(x)), n_bins=n_bins) - x_t = hk.fit(x).transform(x) + x_t = hk.fit_transform(x) assert np.all(np.abs(x_t) <= 1e-4) @@ -186,7 +228,6 @@ def test_hk_with_diag_points(pts): diag_points = np.array([[[2, 2, 0], [3, 3, 0], [7, 7, 0]]]) x_with_diag_points = np.concatenate([x, diag_points], axis=1) - # X_total = np.concatenate([X,X_with_diag_points], axis =0) hk = hk.fit(x_with_diag_points) x_t, x_with_diag_points_t = [hk.transform(x_) diff --git a/gtda/diagrams/tests/test_preprocessing.py b/gtda/diagrams/tests/test_preprocessing.py index 13ec6f910..f145a9b03 100644 --- a/gtda/diagrams/tests/test_preprocessing.py +++ b/gtda/diagrams/tests/test_preprocessing.py @@ -1,11 +1,15 @@ -"""Testing for ForgetDimension and Scaler.""" +"""Testing of preprocessing tools for persistence diagrams.""" # License: GNU AGPLv3 import numpy as np +import plotly.io as pio import pytest +from numpy.testing import assert_almost_equal from sklearn.exceptions import NotFittedError -from gtda.diagrams import ForgetDimension, Scaler +from gtda.diagrams import ForgetDimension, Scaler, Filtering + +pio.renderers.default = 'plotly_mimetype' X_1 = np.array([[[0., 0.36905774, 0], [0., 0.37293977, 0], @@ -209,38 +213,71 @@ def test_not_fitted(): - dst = ForgetDimension() - dsc = Scaler() + with pytest.raises(NotFittedError): + ForgetDimension().transform(X_1) with pytest.raises(NotFittedError): - dst.transform(X_1) + Scaler().transform(X_1) with pytest.raises(NotFittedError): - dsc.transform(X_1) + Scaler().inverse_transform(X_1) with pytest.raises(NotFittedError): - dsc.inverse_transform(X_1) + Filtering().transform(X_1) + + +def test_forg_fit_transform_plot(): + ForgetDimension().fit_transform_plot(X_1, sample=0) + + +@pytest.mark.parametrize('hom_dims', [None, (0,), (1,)]) +def test_fit_transform_plot(hom_dims): + Scaler().fit_transform_plot( + X_1, sample=0, 
homology_dimensions=hom_dims) + + Filtering().fit_transform_plot( + X_1, sample=0, homology_dimensions=hom_dims) @pytest.mark.parametrize('X', [X_1, X_2]) -def test_dst_transform(X): - dst = ForgetDimension() - X_res = dst.fit_transform(X) +def test_forg_transform_shape(X): + forg = ForgetDimension() + X_res = forg.fit_transform(X) assert X_res.shape == X.shape -parameters = [('wasserstein', {'p': 2}), - ('betti', {'n_bins': 10}), - ('bottleneck', None)] +parameters_sc = [('wasserstein', {'p': 2}), + ('betti', {'n_bins': 10}), + ('bottleneck', None)] -@pytest.mark.parametrize(('metric', 'metric_params'), parameters) +@pytest.mark.parametrize(('metric', 'metric_params'), parameters_sc) @pytest.mark.parametrize('X', [X_1, X_2]) -def test_dd_transform(X, metric, metric_params): - dsc = Scaler(metric=metric, metric_params=metric_params, n_jobs=1) - X_res = dsc.fit_transform(X) +def test_sc_transform_shape(X, metric, metric_params): + sc = Scaler(metric=metric, metric_params=metric_params, n_jobs=1) + X_res = sc.fit_transform(X) assert X_res.shape == X.shape - dsc = Scaler(metric=metric, metric_params=metric_params, n_jobs=1) - X_inv_res = dsc.fit(X_res).inverse_transform(X_res) - assert X_inv_res.shape == X.shape + X_inv_res = sc.inverse_transform(X_res) + assert_almost_equal(X_inv_res, X) + + +@pytest.mark.parametrize('X', [X_1, X_2]) +def test_filt_transform_zero(X): + filt = Filtering(epsilon=0.) + X_res = filt.fit_transform(X[:, [0], :]) + assert_almost_equal(X_res, X[:, [0], :]) + + +lifetimes_1 = X_1[:, :, 1] - X_1[:, :, 0] +epsilons_1 = np.linspace(np.min(lifetimes_1), np.max(lifetimes_1), num=3) + + +@pytest.mark.parametrize('epsilon', epsilons_1) +def test_filt_transform(epsilon): + filt = Filtering(epsilon=epsilon) + X_res_1 = filt.fit_transform(X_1) + assert X_res_1.shape == X_1.shape + + lifetimes_res_1 = X_res_1[:, :, 1] - X_res_1[:, :, 0] + assert not ((lifetimes_res_1 > 0.) 
& (lifetimes_res_1 <= epsilon)).any() diff --git a/gtda/externals/python/ripser_interface.py b/gtda/externals/python/ripser_interface.py index a31c96ae0..8b28b688d 100644 --- a/gtda/externals/python/ripser_interface.py +++ b/gtda/externals/python/ripser_interface.py @@ -163,7 +163,8 @@ def ripser(X, maxdim=1, thresh=np.inf, coeff=2, metric="euclidean", ) if n_perm and n_perm < 0: raise Exception( - "Should be a strictly positive number of points in the greedy permutation" + "Should be a strictly positive number of points in the greedy " + "permutation" ) idx_perm = np.arange(X.shape[0]) @@ -175,7 +176,10 @@ def ripser(X, maxdim=1, thresh=np.inf, coeff=2, metric="euclidean", r_cover = lambdas[-1] dm = dperm2all[:, idx_perm] else: - dm = pairwise_distances(X, metric=metric) + if metric == 'precomputed': + dm = X + else: + dm = pairwise_distances(X, metric=metric) dperm2all = dm n_points = dm.shape[0] diff --git a/gtda/graphs/tests/test_geodesic_distance.py b/gtda/graphs/tests/test_geodesic_distance.py index 20b210237..29b41bae8 100644 --- a/gtda/graphs/tests/test_geodesic_distance.py +++ b/gtda/graphs/tests/test_geodesic_distance.py @@ -1,12 +1,15 @@ -"""Testing for GraphGeodesicDistance""" +"""Testing for GraphGeodesicDistance.""" import numpy as np +import plotly.io as pio import pytest from numpy.testing import assert_almost_equal from sklearn.exceptions import NotFittedError from gtda.graphs import GraphGeodesicDistance +pio.renderers.default = 'plotly_mimetype' + X_ggd = np.array([ np.array( [[0, 1, 3, 0, 0], @@ -22,14 +25,18 @@ [0, 0, 0, 0, 0]])]) -def test_graph_geodesic_distance_not_fitted(): +def test_ggd_not_fitted(): ggd = GraphGeodesicDistance() with pytest.raises(NotFittedError): ggd.transform(X_ggd) -def test_graph_geodesic_distance_transform(): +def test_ggd_fit_transform_plot(): + GraphGeodesicDistance().fit_transform_plot(X_ggd, sample=0) + + +def test_ggd_transform(): X_ggd_res = np.array([ [[0., 1., 3., 7., np.inf], [1., 0., 4., 8., np.inf], diff 
--git a/gtda/graphs/tests/test_kneighbors.py b/gtda/graphs/tests/test_kneighbors.py index d97eaf35e..bc7bd323e 100644 --- a/gtda/graphs/tests/test_kneighbors.py +++ b/gtda/graphs/tests/test_kneighbors.py @@ -1,4 +1,4 @@ -"""Testing for KNeighborsGraph""" +"""Testing for KNeighborsGraph.""" import numpy as np import pytest @@ -20,7 +20,7 @@ [0., 1., 1., 0.]]))]) -def test_kneighbors_graph_not_fitted(): +def test_kng_not_fitted(): kn_graph = KNeighborsGraph() with pytest.raises(NotFittedError): @@ -29,13 +29,13 @@ def test_kneighbors_graph_not_fitted(): @pytest.mark.parametrize(('n_neighbors', 'expected'), [(1, X_kng_res), (2, X_kng_res_k2)]) -def test_kneighbors_graph_transform(n_neighbors, expected): +def test_kng_transform(n_neighbors, expected): kn_graph = KNeighborsGraph(n_neighbors=n_neighbors) assert (kn_graph.fit_transform(X_kng)[0] != expected[0]).nnz == 0 -def test_parallel_kneighbors_graph_transform(): +def test_parallel_kng_transform(): kn_graph = KNeighborsGraph(n_jobs=1, n_neighbors=2) kn_graph_parallel = KNeighborsGraph(n_jobs=2, n_neighbors=2) diff --git a/gtda/graphs/tests/test_transition.py b/gtda/graphs/tests/test_transition.py index 66d426461..5f6cb3546 100644 --- a/gtda/graphs/tests/test_transition.py +++ b/gtda/graphs/tests/test_transition.py @@ -1,4 +1,4 @@ -"""Testing for TransitionGraph""" +"""Testing for TransitionGraph.""" import numpy as np import pytest diff --git a/gtda/homology/simplicial.py b/gtda/homology/simplicial.py index 1b6c0bc0c..624becc18 100644 --- a/gtda/homology/simplicial.py +++ b/gtda/homology/simplicial.py @@ -40,18 +40,18 @@ class VietorisRipsPersistence(BaseEstimator, TransformerMixin, PlotterMixin): ---------- metric : string or callable, optional, default: ``'euclidean'`` If set to ``'precomputed'``, input data is to be interpreted as a - collection of distance matrices. Otherwise, input data is to be - interpreted as a collection of point clouds (i.e. 
feature arrays), - and `metric` determines a rule with which to calculate distances - between pairs of instances (i.e. rows) in these arrays. - If `metric` is a string, it must be one of the options allowed by - :func:`scipy.spatial.distance.pdist` for its metric parameter, or a - metric listed in :obj:`sklearn.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`, - including "euclidean", "manhattan", or "cosine". - If `metric` is a callable function, it is called on each pair of - instances and the resulting value recorded. The callable should take - two arrays from the entry in `X` as input, and return a value - indicating the distance between them. + collection of distance matrices or of adjacency matrices of weighted + undirected graphs. Otherwise, input data is to be interpreted as a + collection of point clouds (i.e. feature arrays), and `metric` + determines a rule with which to calculate distances between pairs of + points (i.e. row vectors). If `metric` is a string, it must be one + of the options allowed by :func:`scipy.spatial.distance.pdist` for + its metric parameter, or a metric listed in + :obj:`sklearn.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`, including + ``'euclidean'``, ``'manhattan'`` or ``'cosine'``. If `metric` is a + callable, it should take pairs of vectors (1D arrays) as input and, for + each two vectors in a pair, it should return a scalar indicating the + distance/dissimilarity between them. 
homology_dimensions : list or tuple, optional, default: ``(0, 1)`` Dimensions (non-negative integers) of the topological features to be @@ -129,13 +129,12 @@ def __init__(self, metric='euclidean', homology_dimensions=(0, 1), self.n_jobs = n_jobs def _ripser_diagram(self, X): - Xdgms = ripser(X[X[:, 0] != np.inf], - maxdim=self._max_homology_dimension, + Xdgms = ripser(X, maxdim=self._max_homology_dimension, thresh=self.max_edge_length, coeff=self.coeff, metric=self.metric)['dgms'] if 0 in self._homology_dimensions: - Xdgms[0] = Xdgms[0][:-1, :] # Remove final death at np.inf + Xdgms[0] = Xdgms[0][:-1, :] # Remove one infinite bar # Add dimension as the third elements of each (b, d) tuple Xdgms = {dim: np.hstack([Xdgms[dim], @@ -153,14 +152,15 @@ def fit(self, X, y=None): Parameters ---------- X : ndarray or list - Input data representing a collection of point clouds or of distance - matrices. Can be either a 3D ndarray whose zeroth dimension has - size ``n_samples``, or a list containing ``n_samples`` 2D ndarrays. - If ``metric == 'precomputed'``, elements of `X` must be square - arrays representing distance matrices; otherwise, their rows are - interpreted as vectors in Euclidean space and, when `X` is a list, - warnings are issued when the number of columns (dimension of the - Euclidean space) differs among samples. + Input data representing a collection of point clouds if `metric` + was not set to ``'precomputed'``, and of distance matrices or + adjacency matrices of weighted undirected graphs otherwise. Can be + either a 3D ndarray whose zeroth dimension has size ``n_samples``, + or a list containing ``n_samples`` 2D ndarrays. If `metric` was + set to ``'precomputed'``, each entry of `X` must be a square + array and should be compatible with a filtration, i.e. the value + at index (i, j) should be no smaller than the values at diagonal + indices (i, i) and (j, j). 
y : None There is no need for a target in a transformer, yet the pipeline @@ -174,7 +174,7 @@ def fit(self, X, y=None): validate_params( self.get_params(), self._hyperparameters, exclude=['n_jobs']) self._is_precomputed = self.metric == 'precomputed' - check_point_clouds(X, distance_matrix=self._is_precomputed) + check_point_clouds(X, distance_matrices=self._is_precomputed) if self.infinity_values is None: self.infinity_values_ = self.max_edge_length @@ -200,14 +200,15 @@ def transform(self, X, y=None): Parameters ---------- X : ndarray or list - Input data representing a collection of point clouds or of distance - matrices. Can be either a 3D ndarray whose zeroth dimension has - size ``n_samples``, or a list containing ``n_samples`` 2D ndarrays. - If ``metric == 'precomputed'``, elements of `X` must be square - arrays representing distance matrices; otherwise, their rows are - interpreted as vectors in Euclidean space and, when `X` is a list, - warnings are issued when the number of columns (dimension of the - Euclidean space) differs among samples. + Input data representing a collection of point clouds if `metric` + was not set to ``'precomputed'``, and of distance matrices or + adjacency matrices of weighted undirected graphs otherwise. Can be + either a 3D ndarray whose zeroth dimension has size ``n_samples``, + or a list containing ``n_samples`` 2D ndarrays. If `metric` was + set to ``'precomputed'``, each entry of `X` must be a square + array and should be compatible with a filtration, i.e. the value + at index (i, j) should be no smaller than the values at diagonal + indices (i, i) and (j, j). 
y : None There is no need for a target in a transformer, yet the pipeline @@ -224,7 +225,7 @@ def transform(self, X, y=None): """ check_is_fitted(self) - X = check_point_clouds(X, distance_matrix=self._is_precomputed) + X = check_point_clouds(X, distance_matrices=self._is_precomputed) Xt = Parallel(n_jobs=self.n_jobs)( delayed(self._ripser_diagram)(x) for x in X) @@ -385,7 +386,7 @@ def _gudhi_diagram(self, X): for dim in self.homology_dimensions} if 0 in self._homology_dimensions: - Xdgms[0] = Xdgms[0][1:, :] # Remove final death at np.inf + Xdgms[0] = Xdgms[0][1:, :] # Remove one infinite bar # Add dimension as the third elements of each (b, d) tuple Xdgms = {dim: np.hstack([Xdgms[dim], @@ -424,7 +425,7 @@ def fit(self, X, y=None): validate_params( self.get_params(), self._hyperparameters, exclude=['n_jobs']) self._is_precomputed = self.metric == 'precomputed' - check_point_clouds(X, distance_matrix=self._is_precomputed) + check_point_clouds(X, distance_matrices=self._is_precomputed) if self.infinity_values is None: self.infinity_values_ = self.max_edge_length @@ -474,7 +475,7 @@ def transform(self, X, y=None): """ check_is_fitted(self) - X = check_point_clouds(X, distance_matrix=self._is_precomputed) + X = check_point_clouds(X, distance_matrices=self._is_precomputed) Xt = Parallel(n_jobs=self.n_jobs)( delayed(self._gudhi_diagram)(x) for x in X) @@ -607,7 +608,7 @@ def _gudhi_diagram(self, X): for dim in self.homology_dimensions} if 0 in self._homology_dimensions: - Xdgms[0] = Xdgms[0][1:, :] # Remove final death at np.inf + Xdgms[0] = Xdgms[0][1:, :] # Remove one infinite bar # Add dimension as the third elements of each (b, d) tuple Xdgms = {dim: np.hstack([Xdgms[dim], diff --git a/gtda/homology/tests/test_cubical.py b/gtda/homology/tests/test_cubical.py index 66d72d806..85b9fcc3d 100644 --- a/gtda/homology/tests/test_cubical.py +++ b/gtda/homology/tests/test_cubical.py @@ -2,12 +2,15 @@ # License: GNU AGPLv3 import numpy as np +import plotly.io as pio import 
pytest from numpy.testing import assert_almost_equal from sklearn.exceptions import NotFittedError from gtda.homology import CubicalPersistence +pio.renderers.default = 'plotly_mimetype' + X = np.array([[[2., 2.47942554], [2.47942554, 2.84147098], [2.98935825, 2.79848711], @@ -31,6 +34,12 @@ def test_cp_not_fitted(): cp.transform(X) +@pytest.mark.parametrize('hom_dims', [None, (0,), (1,), (0, 1)]) +def test_cp_fit_transform_plot(hom_dims): + CubicalPersistence().fit_transform_plot( + X, sample=0, homology_dimensions=hom_dims) + + @pytest.mark.parametrize("periodic_dimensions, expected", [(None, X_cp_res), (np.array([False, False]), X_cp_res), diff --git a/gtda/homology/tests/test_simplicial.py b/gtda/homology/tests/test_simplicial.py index 4af1633e1..e93472e7b 100644 --- a/gtda/homology/tests/test_simplicial.py +++ b/gtda/homology/tests/test_simplicial.py @@ -1,7 +1,8 @@ -"""Testing for persistent homology on grid.""" +"""Testing for simplicial persistent homology.""" # License: GNU AGPLv3 import numpy as np +import plotly.io as pio import pytest from numpy.testing import assert_almost_equal from sklearn.exceptions import NotFittedError @@ -9,7 +10,9 @@ from gtda.homology import VietorisRipsPersistence, SparseRipsPersistence, \ EuclideanCechPersistence -pc = np.array([[[2., 2.47942554], [2.47942554, 2.84147098], +pio.renderers.default = 'plotly_mimetype' + +X = np.array([[[2., 2.47942554], [2.47942554, 2.84147098], [2.98935825, 2.79848711], [2.79848711, 2.41211849], [2.41211849, 1.92484888]]]) @@ -19,17 +22,17 @@ def test_vrp_params(): vrp = VietorisRipsPersistence(metric=metric) with pytest.raises(ValueError): - vrp.fit_transform(pc) + vrp.fit_transform(X) def test_vrp_not_fitted(): vrp = VietorisRipsPersistence() with pytest.raises(NotFittedError): - vrp.transform(pc) + vrp.transform(X) -pc_vrp_res = np.array([[[0., 0.43094373, 0], [0., 0.5117411, 0], +X_vrp_res = np.array([[[0., 0.43094373, 0], [0., 0.5117411, 0], [0., 0.60077095, 0], [0., 0.62186205, 0], 
[0.69093919, 0.80131882, 1]]]) @@ -37,7 +40,20 @@ def test_vrp_not_fitted(): def test_vrp_transform(): vrp = VietorisRipsPersistence() - assert_almost_equal(vrp.fit_transform(pc), pc_vrp_res) + assert_almost_equal(vrp.fit_transform(X), X_vrp_res) + + +def test_vrp_list_of_arrays(): + X_2 = np.array([[0., 1.], [1., 2.]]) + X_list = [X[0].copy(), X_2] + vrp = VietorisRipsPersistence() + vrp.fit(X_list) + + +@pytest.mark.parametrize('hom_dims', [None, (0,), (1,), (0, 1)]) +def test_vrp_fit_transform_plot(hom_dims): + VietorisRipsPersistence().fit_transform_plot( + X, sample=0, homology_dimensions=hom_dims) def test_srp_params(): @@ -45,24 +61,24 @@ def test_srp_params(): vrp = SparseRipsPersistence(metric=metric) with pytest.raises(ValueError): - vrp.fit_transform(pc) + vrp.fit_transform(X) def test_srp_not_fitted(): srp = SparseRipsPersistence() with pytest.raises(NotFittedError): - srp.transform(pc) + srp.transform(X) -pc_srp_res_2 = np.array([[[0., 0.43094373, 0], [0., 0.5117411, 0], - [0., 0.60077095, 0], [0., 0.62186205, 0], - [0.69093919, 0.80131882, 1]]]) +X_srp_res_2 = np.array([[[0., 0.43094373, 0], [0., 0.5117411, 0], + [0., 0.60077095, 0], [0., 0.62186205, 0], + [0.69093919, 0.80131882, 1]]]) @pytest.mark.parametrize("epsilon, point_clouds, expected", - [(0.0, pc, pc_vrp_res), - (1.0, pc, pc_srp_res_2)]) + [(0.0, X, X_vrp_res), + (1.0, X, X_srp_res_2)]) def test_srp_transform(epsilon, point_clouds, expected): srp = SparseRipsPersistence(epsilon=epsilon) @@ -70,22 +86,28 @@ def test_srp_transform(epsilon, point_clouds, expected): np.sort(expected, axis=1)) +@pytest.mark.parametrize('hom_dims', [None, (0,), (1,), (0, 1)]) +def test_srp_fit_transform_plot(hom_dims): + SparseRipsPersistence().fit_transform_plot( + X, sample=0, homology_dimensions=hom_dims) + + def test_cp_params(): coeff = 'not_defined' cp = EuclideanCechPersistence(coeff=coeff) with pytest.raises(TypeError): - cp.fit_transform(pc) + cp.fit_transform(X) def test_cp_not_fitted(): cp = 
EuclideanCechPersistence() with pytest.raises(NotFittedError): - cp.transform(pc) + cp.transform(X) -pc_cp_res = np.array( +X_cp_res = np.array( [[[0., 0.31093103, 0.], [0., 0.30038548, 0.], [0., 0.25587055, 0.], [0., 0.21547186, 0.], [0.34546959, 0.41473758, 1.], [0.51976681, 0.55287585, 1.], @@ -96,11 +118,10 @@ def test_cp_not_fitted(): def test_cp_transform(): cp = EuclideanCechPersistence() - assert_almost_equal(cp.fit_transform(pc), pc_cp_res) + assert_almost_equal(cp.fit_transform(X), X_cp_res) -def test_vrp_list_of_arrays(): - pc_2 = np.array([[0, 1], [1, 2]]) - pc_list = [pc[0].copy(), pc_2] - vrp = VietorisRipsPersistence() - vrp.fit(pc_list) +@pytest.mark.parametrize('hom_dims', [None, (0,), (1,), (0, 1)]) +def test_cp_fit_transform_plot(hom_dims): + EuclideanCechPersistence().fit_transform_plot( + X, sample=0, homology_dimensions=hom_dims) diff --git a/gtda/images/tests/test_filtrations.py b/gtda/images/tests/test_filtrations.py index ce58a2421..022ebe020 100644 --- a/gtda/images/tests/test_filtrations.py +++ b/gtda/images/tests/test_filtrations.py @@ -2,12 +2,15 @@ # License: GNU AGPLv3 import numpy as np +import plotly.io as pio import pytest from numpy.testing import assert_almost_equal from sklearn.exceptions import NotFittedError from gtda.images import HeightFiltration, RadialFiltration, \ - DilationFiltration, ErosionFiltration, SignedDistanceFiltration + DilationFiltration, ErosionFiltration, SignedDistanceFiltration + +pio.renderers.default = 'plotly_mimetype' images_2D = np.stack([np.ones((3, 4)), np.concatenate([np.ones((3, 2)), np.zeros((3, 2))], @@ -78,6 +81,10 @@ def test_height_transform(direction, images, expected): expected) +def test_height_fit_transform_plot(): + HeightFiltration().fit_transform_plot(images_2D, sample=0) + + def test_radial_not_fitted(): radial = RadialFiltration() with pytest.raises(NotFittedError): @@ -130,6 +137,10 @@ def test_radial_transform(center, images, expected): expected) +def 
test_radial_fit_transform_plot(): + RadialFiltration().fit_transform_plot(images_2D, sample=0) + + def test_dilation_not_fitted(): dilation = DilationFiltration() with pytest.raises(NotFittedError): @@ -172,6 +183,10 @@ def test_dilation_transform(n_iterations, images, expected): expected) +def test_dilation_fit_transform_plot(): + DilationFiltration().fit_transform_plot(images_2D, sample=0) + + def test_erosion_not_fitted(): erosion = ErosionFiltration() with pytest.raises(NotFittedError): @@ -214,6 +229,10 @@ def test_erosion_transform(n_iterations, images, expected): expected) +def test_erosion_fit_transform_plot(): + ErosionFiltration().fit_transform_plot(images_2D, sample=0) + + def test_signed_not_fitted(): signed = SignedDistanceFiltration() with pytest.raises(NotFittedError): @@ -253,3 +272,7 @@ def test_signed_transform(n_iterations, images, expected): assert_almost_equal(signed.fit_transform(images), expected) + + +def test_signed_fit_transform_plot(): + SignedDistanceFiltration().fit_transform_plot(images_2D, sample=0) diff --git a/gtda/images/tests/test_preprocessing.py b/gtda/images/tests/test_preprocessing.py index 1f4ebe8d8..8fb9fb2eb 100644 --- a/gtda/images/tests/test_preprocessing.py +++ b/gtda/images/tests/test_preprocessing.py @@ -2,12 +2,15 @@ # License: GNU AGPLv3 import numpy as np +import plotly.io as pio import pytest from numpy.testing import assert_almost_equal, assert_equal from sklearn.exceptions import NotFittedError from gtda.images import Binarizer, Inverter, Padder, ImageToPointCloud +pio.renderers.default = 'plotly_mimetype' + images_2D = np.stack([ np.ones((7, 8)), np.concatenate([np.ones((7, 4)), np.zeros((7, 4))], axis=1), @@ -42,6 +45,10 @@ def test_binarizer_transform(threshold, expected): expected) +def test_binarizer_fit_transform_plot(): + Binarizer().fit_transform_plot(images_2D, sample=0) + + def test_inverter_not_fitted(): inverter = Inverter() with pytest.raises(NotFittedError): @@ -69,13 +76,17 @@ def 
test_inverter_transform(images, expected): expected) +def test_inverter_fit_transform_plot(): + Inverter().fit_transform_plot(images_2D, sample=0) + + def test_padder_not_fitted(): padder = Padder() with pytest.raises(NotFittedError): padder.transform(images_2D) -@pytest.mark.parametrize("images, paddings, ", +@pytest.mark.parametrize("images, paddings", [(images_2D, np.array([1, 1], dtype=np.int)), (images_2D, None), (images_3D, np.array([2, 2, 2], dtype=np.int))]) @@ -91,6 +102,10 @@ def test_padder_transform(images, paddings): expected_shape) +def test_padder_fit_transform_plot(): + Padder().fit_transform_plot(images_2D, sample=0) + + images_2D_small = np.stack([ np.ones((3, 2)), np.concatenate([np.ones((3, 1)), np.zeros((3, 1))], axis=1), @@ -127,6 +142,16 @@ def test_img2pc_not_fitted(): np.array([[]])]) +def compare_arrays_as_sets(a1, a2): + """ A helper function to compare two point_clouds. + They should have the same points, but not necessarily in the same order. + """ + def to_set_of_elements(a): + return set([tuple(p) for p in a]) + as1, as2 = [to_set_of_elements(a) for a in [a1, a2]] + return (as1 <= as2) and (as1 >= as2) + + @pytest.mark.parametrize("images, expected", [(images_2D_small, images_2D_img2pc), (images_3D_small, images_3D_img2pc)]) @@ -139,11 +164,6 @@ def test_img2pc_transform(images, expected): expected)) -def compare_arrays_as_sets(a1, a2): - """ A helper function to compare two point_clouds. - They should have the same points, but not necessarily in the same order. 
- """ - def to_set_of_elements(a): - return set([tuple(p) for p in a]) - as1, as2 = [to_set_of_elements(a) for a in [a1, a2]] - return (as1 <= as2) and (as1 >= as2) +@pytest.mark.parametrize("images", [images_2D, images_3D]) +def test_img2pc_fit_transform_plot(images): + ImageToPointCloud().fit_transform_plot(images, sample=0) diff --git a/gtda/point_clouds/tests/test_rescaling.py b/gtda/point_clouds/tests/test_rescaling.py index d026fcfab..93a47fcb8 100644 --- a/gtda/point_clouds/tests/test_rescaling.py +++ b/gtda/point_clouds/tests/test_rescaling.py @@ -1,43 +1,54 @@ -"""Testing for rescaling transfomers.""" +"""Testing for rescaling transformers.""" # License: GNU AGPLv3 import numpy as np +import plotly.io as pio import pytest from numpy.testing import assert_almost_equal from sklearn.exceptions import NotFittedError from gtda.point_clouds import ConsistentRescaling, ConsecutiveRescaling -Xr = np.array([[[0, 0], [1, 2], [5, 6]]]) +pio.renderers.default = 'plotly_mimetype' + +X = np.array([[[0, 0], [1, 2], [5, 6]]]) def test_consistent_not_fitted(): cr = ConsistentRescaling() with pytest.raises(NotFittedError): - cr.transform(Xr) + cr.transform(X) def test_consistent_transform(): cr = ConsistentRescaling() - Xres = np.array([[[0., 1., 2.19601308], - [1., 0., 1.59054146], - [2.19601308, 1.59054146, 0.]]]) + X_res = np.array([[[0., 1., 2.19601308], + [1., 0., 1.59054146], + [2.19601308, 1.59054146, 0.]]]) + + assert_almost_equal(cr.fit_transform(X), X_res) + - assert_almost_equal(cr.fit_transform(Xr), Xres) +def test_consistent_fit_transform_plot(): + ConsistentRescaling().fit_transform_plot(X, sample=0) def test_consecutive_not_fitted(): cr = ConsecutiveRescaling() with pytest.raises(NotFittedError): - cr.transform(Xr) + cr.transform(X) def test_consecutive_transform(): cr = ConsecutiveRescaling() - Xres = np.array([[[0., 0., 7.81024968], - [2.23606798, 0., 0.], - [7.81024968, 5.65685425, 0.]]]) + X_res = np.array([[[0., 0., 7.81024968], + [2.23606798, 0., 0.], + 
[7.81024968, 5.65685425, 0.]]]) + + assert_almost_equal(cr.fit_transform(X), X_res) + - assert_almost_equal(cr.fit_transform(Xr), Xres) +def test_consecutive_fit_transform_plot(): + ConsecutiveRescaling().fit_transform_plot(X, sample=0) diff --git a/gtda/time_series/tests/test_embedding.py b/gtda/time_series/tests/test_embedding.py index 9c3c7499d..7b9adf0ed 100644 --- a/gtda/time_series/tests/test_embedding.py +++ b/gtda/time_series/tests/test_embedding.py @@ -2,12 +2,14 @@ # License: GNU AGPLv3 import numpy as np +import plotly.io as pio import pytest from numpy.testing import assert_almost_equal from sklearn.exceptions import NotFittedError -from gtda.time_series import TakensEmbedding -from gtda.time_series import SlidingWindow +from gtda.time_series import SlidingWindow, TakensEmbedding + +pio.renderers.default = 'plotly_mimetype' signal = np.asarray([np.sin(x / 2) + 2 for x in range(0, 20)]) @@ -90,8 +92,8 @@ def test_window_params(): def test_window_transform(): windows = SlidingWindow(width=3, stride=2) - x_windows = windows.fit_transform(signal_embedded_search) - assert (x_windows.shape == (8, 4, 2)) + X_windows = windows.fit_transform(signal_embedded_search) + assert (X_windows.shape == (8, 4, 2)) def test_window_resample(): @@ -99,3 +101,9 @@ def test_window_resample(): windows.fit(y) y_resampled = windows.resample(y) assert_almost_equal(y_resampled, y[np.arange(3, 20, 2)]) + + +def test_window_plot(): + windows = SlidingWindow(width=3, stride=2) + X_windows = windows.fit_transform(signal_embedded_search) + windows.plot(X_windows, sample=0) diff --git a/gtda/utils/testing.py b/gtda/utils/testing.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/gtda/utils/tests/test_validation.py b/gtda/utils/tests/test_validation.py index dcf8becf3..378b2e59a 100644 --- a/gtda/utils/tests/test_validation.py +++ b/gtda/utils/tests/test_validation.py @@ -3,8 +3,10 @@ import numpy as np import pytest +from sklearn.exceptions import 
DataDimensionalityWarning -from gtda.utils.validation import check_diagrams, validate_params +from gtda.utils.validation import check_diagrams, validate_params, \ + check_point_clouds # Testing for validate_params @@ -54,3 +56,206 @@ def test_inputs_arrayStruc_V(): with pytest.raises(ValueError): check_diagrams(X) + + +# Testing check_point_clouds +# Create several kinds of inputs +class CreateInputs: + def __init__( + self, n_samples, n_1, n_2, n_samples_extra, n_1_extra, n_2_extra + ): + N = n_samples * n_1 * n_2 + n_1_rectang = n_1 + 1 + n_2_rectang = n_2 - 1 + N_rectang = n_samples * n_1_rectang * n_2_rectang + + self.X = np.arange(N, dtype=float).reshape(n_samples, n_1, n_2) + self.X_rectang = np.arange(N_rectang, dtype=float). \ + reshape(n_samples, n_1_rectang, n_2_rectang) + + self.X_list = [] + self.X_list_rectang = [] + for i in range(n_samples): + self.X_list.append(self.X[i].copy()) + self.X_list_rectang.append(self.X_rectang[i].copy()) + + # List example where not all 2D arrays have the same no. of rows + self.X_list_rectang_diff_rows = \ + self.X_list_rectang[:-1] + [self.X_list_rectang[-1][:-1, :]] + + # List example where not all 2D arrays have the same no. of columns + self.X_list_rectang_diff_cols = \ + self.X_list_rectang[:-1] + [self.X_list_rectang[-1][:, :-1]] + + N_extra = n_samples_extra * n_1_extra * n_2_extra + X_extra = np.arange(N_extra, dtype=float). 
\ + reshape(n_samples_extra, n_1_extra, n_2_extra) + X_list_extra = [] + for i in range(n_samples_extra): + X_list_extra.append(X_extra[i].copy()) + self.X_list_tot = self.X_list + X_list_extra + + def insert_inf(self): + # Replace first entries with np.inf + self.X[0, 0, 0] = np.inf + self.X_rectang[0, 0, 0] = np.inf + self.X_list[0][0, 0] = np.inf + self.X_list_rectang[0][0, 0] = np.inf + return self + + def insert_nan(self): + # Replace first entries with np.nan + self.X[0, 0, 0] = np.nan + self.X_rectang[0, 0, 0] = np.nan + self.X_list[0][0, 0] = np.nan + self.X_list_rectang[0][0, 0] = np.nan + return self + + +n_samples = 2 +n_1 = 5 +n_2 = 5 +n_samples_extra = 1 +n_1_extra = 6 +n_2_extra = 6 + + +def test_check_point_clouds_regular_finite(): + """Cases in which the input is finite and no warnings or errors should be + thrown by check_point_clouds.""" + + ex = CreateInputs( + n_samples, n_1, n_2, n_samples_extra, n_1_extra, n_2_extra) + check_point_clouds(ex.X_rectang) + check_point_clouds(ex.X_list_rectang) + check_point_clouds(ex.X_list_rectang_diff_rows) + check_point_clouds(ex.X, distance_matrices=True) + check_point_clouds(ex.X_list, distance_matrices=True) + check_point_clouds(ex.X_list_tot, distance_matrices=True) + + +def test_check_point_clouds_value_err_finite(): + """Cases in which the input is finite but we throw a ValueError.""" + + ex = CreateInputs( + n_samples, n_1, n_2, n_samples_extra, n_1_extra, n_2_extra) + + # Check that we error on 1d array input + with pytest.raises(ValueError): + check_point_clouds(np.asarray(ex.X_list_tot)) + + # Check that we error on 2d array input + with pytest.raises(ValueError): + check_point_clouds(ex.X[0]) + + # Check that we throw errors when arrays are not square and + # distance_matrices is True. 
+ # 1) Array input + with pytest.raises(ValueError): + check_point_clouds(ex.X_rectang, distance_matrices=True) + # 2) List input + with pytest.raises(ValueError): + check_point_clouds(ex.X_list_rectang, distance_matrices=True) + + +def test_check_point_clouds_warn_finite(): + """Cases in which the input is finite but we throw warnings.""" + + ex = CreateInputs( + n_samples, n_1, n_2, n_samples_extra, n_1_extra, n_2_extra) + + # Check that we throw warnings when arrays are square and distance_matrices + # is False + # 1) Array input + with pytest.warns(DataDimensionalityWarning): + check_point_clouds(ex.X) + # 2) List input + with pytest.warns(DataDimensionalityWarning): + check_point_clouds(ex.X_list) + + # Check that we throw warnings on list input when arrays have different + # number of columns + with pytest.warns(DataDimensionalityWarning): + check_point_clouds(ex.X_list_rectang_diff_cols) + + +def test_check_point_clouds_regular_inf(): + """Cases in which part of the input is infinite and no warnings or errors + should be thrown by check_point_clouds.""" + + ex = CreateInputs( + n_samples, n_1, n_2, n_samples_extra, n_1_extra, n_2_extra).\ + insert_inf() + + check_point_clouds(ex.X, distance_matrices=True) + check_point_clouds(ex.X_list, distance_matrices=True) + check_point_clouds(ex.X_rectang, force_all_finite=False) + check_point_clouds(ex.X_list_rectang, force_all_finite=False) + + +def test_check_point_clouds_value_err_inf(): + """Cases in which part of the input is infinite and we throw a + ValueError.""" + + ex = CreateInputs( + n_samples, n_1, n_2, n_samples_extra, n_1_extra, n_2_extra).\ + insert_inf() + + # Check that, by default, np.inf is only accepted when distance_matrices + # is True. 
+ # 1) Array input + with pytest.raises(ValueError): + check_point_clouds(ex.X_rectang) + # 2) List input + with pytest.raises(ValueError): + check_point_clouds(ex.X_list_rectang) + + # Check that we error if we explicitly set force_all_finite to True + # 1) Array input + with pytest.raises(ValueError): + check_point_clouds(ex.X, distance_matrices=True, force_all_finite=True) + # 2) List input + with pytest.raises(ValueError): + check_point_clouds( + ex.X_list, distance_matrices=True, force_all_finite=True) + + +def test_check_point_clouds_regular_nan(): + """Cases in which part of the input is NaN and no warnings or errors + should be thrown by check_point_clouds.""" + + ex = CreateInputs( + n_samples, n_1, n_2, n_samples_extra, n_1_extra, n_2_extra).\ + insert_nan() + + check_point_clouds(ex.X, distance_matrices=True, + force_all_finite='allow-nan') + check_point_clouds( + ex.X_list, distance_matrices=True, force_all_finite='allow-nan') + check_point_clouds(ex.X_rectang, force_all_finite='allow-nan') + check_point_clouds(ex.X_list_rectang, force_all_finite='allow-nan') + + +@pytest.mark.parametrize("force_all_finite", [True, False]) +def test_check_point_clouds_value_err_nan(force_all_finite): + """Cases in which part of the input is nan and we throw a + ValueError.""" + + ex = CreateInputs( + n_samples, n_1, n_2, n_samples_extra, n_1_extra, n_2_extra).\ + insert_nan() + + # Check that we error when force_all_finite is True or False + # 1) Array input + with pytest.raises(ValueError): + check_point_clouds( + ex.X, distance_matrices=True, force_all_finite=force_all_finite) + with pytest.raises(ValueError): + check_point_clouds(ex.X_rectang, force_all_finite=force_all_finite) + # 2) List input + with pytest.raises(ValueError): + check_point_clouds(ex.X_list, distance_matrices=True, + force_all_finite=force_all_finite) + with pytest.raises(ValueError): + check_point_clouds( + ex.X_list_rectang, force_all_finite=force_all_finite) diff --git a/gtda/utils/validation.py 
b/gtda/utils/validation.py index a96ba460a..bc5df763e 100644 --- a/gtda/utils/validation.py +++ b/gtda/utils/validation.py @@ -6,7 +6,9 @@ from warnings import warn import numpy as np + from sklearn.utils.validation import check_array +from sklearn.exceptions import DataDimensionalityWarning def check_diagrams(X, copy=False): @@ -186,9 +188,23 @@ def validate_params(parameters, references, exclude=None): return _validate_params(parameters_, references) -def check_point_clouds(X, distance_matrix=False, **kwargs): - """Input validation on an array or list representing a collection of point - clouds or distance matrices. +def _check_array_mod(X, **kwargs): + """Modified version of :func:`~sklearn.utils.validation.check_array`. When + keyword parameter `force_all_finite` is set to False, NaNs are not + accepted but infinity is.""" + if not kwargs['force_all_finite']: + Xnew = check_array(X, **kwargs) + if np.isnan(Xnew).any(): + raise ValueError( + "Input contains NaN. Only finite values and infinity are " + "allowed when parameter `force_all_finite` is False.") + return Xnew + return check_array(X, **kwargs) + + +def check_point_clouds(X, distance_matrices=False, **kwargs): + """Input validation on arrays or lists representing collections of point + clouds or of distance/adjacency matrices. The input is checked to be either a single 3D array using a single call to :func:`~sklearn.utils.validation.check_array`, or a list of 2D arrays by @@ -204,14 +220,22 @@ def check_point_clouds(X, distance_matrix=False, **kwargs): X : object Input object to check / convert. - distance_matrix : bool, optional, default: ``False`` + distance_matrices : bool, optional, default: ``False`` Whether the input represents a collection of distance matrices or of concrete point clouds in Euclidean space. In the first case, entries are allowed to be infinite unless otherwise specified in `kwargs`. kwargs Keyword arguments accepted by - :func:`~gtda.utils.validation.check_list_of_arrays`.
+ :func:`~sklearn.utils.validation.check_array`, with the following + caveats: 1) `ensure_2d` and `allow_nd` are ignored; 2) if not passed + explicitly, `force_all_finite` is set to be the boolean negation of + `distance_matrices`; 3) when `force_all_finite` is set to ``False``, + NaN inputs are not allowed; 4) `accept_sparse` and + `accept_large_sparse` are only meaningful in the case of lists of 2D + arrays, in which case they are passed to individual instances of + :func:`~sklearn.utils.validation.check_array` validating each entry + in the list. Returns ------- @@ -219,30 +243,67 @@ def check_point_clouds(X, distance_matrix=False, **kwargs): The converted and validated object. """ - kwargs_ = {'force_all_finite': not distance_matrix} + kwargs_ = {'force_all_finite': not distance_matrices} kwargs_.update(kwargs) - if hasattr(X, 'shape'): + kwargs_.pop('allow_nd', None) + kwargs_.pop('ensure_2d', None) + if hasattr(X, 'shape') and hasattr(X, 'ndim'): if X.ndim != 3: - raise ValueError("ndarray input must be 3D.") - return check_array(X, allow_nd=True, **kwargs_) + if X.ndim == 2: + extra_2D = \ + "\nReshape your input X using X.reshape(1, *X.shape) or " \ + "X[None, :, :] if X is a single point cloud/distance " \ + "matrix/adjacency matrix of a weighted graph." + else: + extra_2D = "" + raise ValueError( + f"Input must be a single 3D array or a list of 2D arrays. " + f"Array of dimension {X.ndim} passed." + extra_2D) + if (X.shape[1] != X.shape[2]) and distance_matrices: + raise ValueError( + f"Input array X must have X.shape[1] == X.shape[2]: " + f"{X.shape[1]} != {X.shape[2]} passed.") + elif (X.shape[1] == X.shape[2]) and not distance_matrices: + warn( + "Input array X has X.shape[1] == X.shape[2]. 
This is " + "consistent with a collection of distance/adjacency " + "matrices, but the input is being treated as a collection " + "of vectors in Euclidean space.", + DataDimensionalityWarning, stacklevel=2) + Xnew = _check_array_mod(X, **kwargs_, allow_nd=True) else: - if not distance_matrix: - reference = X[0].shape[1] # Embedding dimension of first sample - if not reduce( - and_, (x.shape[1] == reference for x in X[1:]), True): - warn("Not all point clouds have the same embedding dimension.") - - has_check_failed = False - messages = [] - Xnew = [] - for i, x in enumerate(X): - try: - Xnew.append(check_array(x, **kwargs_)) - messages = [''] - except ValueError as e: - has_check_failed = True - messages.append(str(e)) - if has_check_failed: - raise ValueError("The following errors were raised by the inputs: \n" - "\n".join(messages)) + has_check_failed = False + messages = [] + Xnew = [] + for i, x in enumerate(X): + try: + xnew = _check_array_mod(x, **kwargs_, ensure_2d=True) + if distance_matrices: + if not x.shape[0] == x.shape[1]: + raise ValueError( + f"All arrays must be square: {x.shape[0]} rows " + f"and {x.shape[1]} columns found in this array.") + Xnew.append(xnew) + except ValueError as e: + has_check_failed = True + messages.append(f"Entry {i}:\n{e}") + if has_check_failed: + raise ValueError( + "The following errors were raised by the inputs:\n\n" + + "\n\n".join(messages)) + + if not distance_matrices: + if reduce(and_, (x.shape[0] == x.shape[1] for x in X), True): + warn( + "All arrays are square. 
This is consistent with a " + "collection of distance/adjacency matrices, but the input " + "is being treated as a collection of vectors in Euclidean " + "space.", DataDimensionalityWarning, stacklevel=2) + + ref_dim = X[0].shape[1] # Embedding dimension of first sample + if not reduce(and_, (x.shape[1] == ref_dim for x in X[1:]), True): + warn( + "Not all point clouds have the same embedding dimension.", + DataDimensionalityWarning, stacklevel=2) + return Xnew diff --git a/setup.py b/setup.py index 7d5e09e4f..a90c4784c 100755 --- a/setup.py +++ b/setup.py @@ -29,8 +29,8 @@ MAINTAINER_EMAIL = 'maintainers@giotto.ai' URL = 'https://github.com/giotto-ai/giotto-tda' LICENSE = 'GNU AGPLv3' -DOWNLOAD_URL = 'https://github.com/giotto-ai/giotto-tda/tarball/v0.2.0' -VERSION = __version__ # noqa +DOWNLOAD_URL = 'https://github.com/giotto-ai/giotto-tda/tarball/v0.2.1' +VERSION = __version__ # noqa CLASSIFIERS = ['Intended Audience :: Science/Research', 'Intended Audience :: Developers', 'License :: OSI Approved',