giotto-ai · ulupo · Aug 14, 2020 · Aug 10, 2020 · Aug 10, 2020 · Aug 10, 2020
diff --git a/gtda/diagrams/_metrics.py b/gtda/diagrams/_metrics.py
diff --git a/gtda/diagrams/_utils.py b/gtda/diagrams/_utils.py
@@ -4,14 +4,23 @@
 import numpy as np
 
 
+def _homology_dimensions_to_sorted_ints(homology_dimensions):
+    return tuple(
+        sorted([int(dim) if dim != np.inf else dim
+                for dim in homology_dimensions])
+        )
+
+
 def _subdiagrams(X, homology_dimensions, remove_dim=False):
     """For each diagram in a collection, extract the subdiagrams in a given
     list of homology dimensions. It is assumed that all diagrams in X contain
     the same number of points in each homology dimension."""
     n = len(X)
     if len(homology_dimensions) == 1:
+        # Reshape ensures copy
         Xs = X[X[:, :, 2] == homology_dimensions[0]].reshape(n, -1, 3)
     else:
+        # np.concatenate will also create a copy
         Xs = np.concatenate([X[X[:, :, 2] == dim].reshape(n, -1, 3)
                              for dim in homology_dimensions],
                             axis=1)
@@ -20,17 +29,10 @@ def _subdiagrams(X, homology_dimensions, remove_dim=False):
     return Xs
 
 
-def _pad(X, max_diagram_sizes):
-    X_padded = {dim: np.pad(
-        X[dim],
-        ((0, 0), (0, max_diagram_sizes[dim] - X[dim].shape[1]),
-         (0, 0)), 'constant') for dim in X.keys()}
-    return X_padded
-
-
-def _sample_image(image, sampled_diag):
-    # NOTE: Modifies `image` in-place
-    unique, counts = np.unique(sampled_diag, axis=0, return_counts=True)
+def _sample_image(image, diagram_pixel_coords):
+    # WARNING: Modifies `image` in-place
+    unique, counts = \
+        np.unique(diagram_pixel_coords, axis=0, return_counts=True)
     unique = tuple(tuple(row) for row in unique.astype(np.int).T)
     image[unique] = counts
 
@@ -54,7 +56,7 @@ def _multirange(counts):
 
 def _filter(X, filtered_homology_dimensions, cutoff):
     n = len(X)
-    homology_dimensions = sorted(list(set(X[0, :, 2])))
+    homology_dimensions = sorted(np.unique(X[0, :, 2]))
     unfiltered_homology_dimensions = [dim for dim in homology_dimensions if
                                       dim not in filtered_homology_dimensions]
 
@@ -97,8 +99,9 @@ def _filter(X, filtered_homology_dimensions, cutoff):
     return Xf
 
 
-def _bin(X, metric, n_bins=100, **kw_args):
-    homology_dimensions = sorted(list(set(X[0, :, 2])))
+def _bin(X, metric, n_bins=100, homology_dimensions=None, **kw_args):
+    if homology_dimensions is None:
+        homology_dimensions = sorted(np.unique(X[0, :, 2]))
     # For some vectorizations, we force the values to be the same + widest
     sub_diags = {dim: _subdiagrams(X, [dim], remove_dim=True)
                  for dim in homology_dimensions}
@@ -131,18 +134,30 @@ def _bin(X, metric, n_bins=100, **kw_args):
     samplings = {}
     step_sizes = {}
     for dim in homology_dimensions:
-        samplings[dim], step_sizes[dim] = np.linspace(min_vals[dim],
-                                                      max_vals[dim],
-                                                      retstep=True,
-                                                      num=n_bins)
+        samplings[dim], step_sizes[dim] = np.linspace(
+            min_vals[dim], max_vals[dim], retstep=True, num=n_bins
+            )
     if metric in ['landscape', 'betti', 'heat', 'silhouette']:
         for dim in homology_dimensions:
             samplings[dim] = samplings[dim][:, [0], None]
             step_sizes[dim] = step_sizes[dim][0]
     return samplings, step_sizes
 
 
-def _calculate_weights(X, weight_function, samplings, **kw_args):
-    weights = {dim: weight_function(samplings[dim][:, 1])
-               for dim in samplings.keys()}
-    return weights
+def _make_homology_dimensions_mapping(homology_dimensions,
+                                      homology_dimensions_ref):
+    """`homology_dimensions_ref` is assumed to be a sorted tuple as is e.g.
+    :attr:`homology_dimensions_` for several transformers."""
+    if homology_dimensions is None:
+        homology_dimensions_mapping = list(enumerate(homology_dimensions_ref))
+    else:
+        homology_dimensions_mapping = []
+        for dim in homology_dimensions:
+            if dim not in homology_dimensions_ref:
+                raise ValueError(f"All homology dimensions must be in "
+                                 f"{homology_dimensions_ref}; {dim} is not.")
+            else:
+                homology_dimensions_arr = np.array(homology_dimensions_ref)
+                inv_idx = np.flatnonzero(homology_dimensions_arr == dim)[0]
+                homology_dimensions_mapping.append((inv_idx, dim))
+    return homology_dimensions_mapping
diff --git a/gtda/diagrams/distance.py b/gtda/diagrams/distance.py
@@ -8,7 +8,7 @@
 from sklearn.utils.validation import check_is_fitted
 
 from ._metrics import _AVAILABLE_METRICS, _parallel_pairwise
-from ._utils import _bin, _calculate_weights
+from ._utils import _bin, _homology_dimensions_to_sorted_ints
 from ..utils._docs import adapt_fit_transform_docs
 from ..utils.intervals import Interval
 from ..utils.validation import check_diagrams, validate_params
@@ -24,9 +24,6 @@ class PairwiseDistance(BaseEstimator, TransformerMixin):
     matrices or a single distance matrix between pairs of diagrams is
     calculated according to the following steps:
 
-    Input collections of persistence diagrams for this transformer must satisfy
-    certain requirements, see e.g. :meth:`fit`.
-
         1. All diagrams are partitioned into subdiagrams corresponding to
            distinct homology dimensions.
         2. Pairwise distances between subdiagrams of equal homology
@@ -37,22 +34,29 @@ class PairwiseDistance(BaseEstimator, TransformerMixin):
            three-dimensional array, or a single distance matrix constructed
            by taking norms of the vectors of distances between diagram pairs.
 
+    **Important notes**:
+
+        - Input collections of persistence diagrams for this transformer must
+          satisfy certain requirements, see e.g. :meth:`fit`.
+        - The shape of outputs of :meth:`transform` depends on the value of the
+          `order` parameter.
+
     Parameters
     ----------
-    metric : ``'bottleneck'`` | ``'wasserstein'`` | ``'landscape'`` | \
-        ``'betti'`` | ``'heat'`` | ``'persistence_image'``, | \
-        ``'silhouette'``, optional, default: ``'landscape'``
+    metric : ``'bottleneck'`` | ``'wasserstein'`` | ``'betti'`` | \
+        ``'landscape'`` | ``'silhouette'`` | ``'heat'`` | \
+        ``'persistence_image'``, optional, default: ``'landscape'``
         Distance or dissimilarity function between subdiagrams:
 
         - ``'bottleneck'`` and ``'wasserstein'`` refer to the identically named
           perfect-matching--based notions of distance.
+        - ``'betti'`` refers to the :math:`L^p` distance between Betti curves.
         - ``'landscape'`` refers to the :math:`L^p` distance between
           persistence landscapes.
-        - ``'betti'`` refers to the :math:`L^p` distance between Betti curves.
-        - ``'heat'`` refers to the :math:`L^p` distance between
-          Gaussian-smoothed diagrams.
         - ``'silhouette'`` refers to the :math:`L^p` distance between
           silhouettes.
+        - ``'heat'`` refers to the :math:`L^p` distance between
+          Gaussian-smoothed diagrams.
         - ``'persistence_image'`` refers to the :math:`L^p` distance between
           Gaussian-smoothed diagrams represented on birth-persistence axes.
 
@@ -61,27 +65,27 @@ class PairwiseDistance(BaseEstimator, TransformerMixin):
         ``None`` is equivalent to passing the defaults described below):
 
         - If ``metric == 'bottleneck'`` the only argument is `delta` (float,
-          default: ``0.01``). When equal to ``0.``, an exact algorithm is
-          used; otherwise, a faster approximate algorithm is used.
+          default: ``0.01``). When equal to ``0.``, an exact algorithm is used;
+          otherwise, a faster approximate algorithm is used.
         - If ``metric == 'wasserstein'`` the available arguments are `p`
           (float, default: ``2.``) and `delta` (float, default: ``0.01``).
-          Unlike the case of ``'bottleneck'``, `delta` cannot be set to
-          ``0.`` and an exact algorithm is not available.
+          Unlike the case of ``'bottleneck'``, `delta` cannot be set to ``0.``
+          and an exact algorithm is not available.
         - If ``metric == 'betti'`` the available arguments are `p` (float,
           default: ``2.``) and `n_bins` (int, default: ``100``).
-        - If ``metric == 'landscape'`` the available arguments are `p`
-          (float, default: ``2.``), `n_bins` (int, default: ``100``) and
-          `n_layers` (int, default: ``1``).
-        - If ``metric == 'heat'`` the available arguments are `p`
-          (float, default: ``2.``), `sigma` (float, default: ``1.``) and
-          `n_bins` (int, default: ``100``).
-        - If ``metric == 'silhouette'`` the available arguments are `p`
-          (float, default: ``2.``), `order` (float, default: ``1.``) and
-          `n_bins` (int, default: ``100``).
+        - If ``metric == 'landscape'`` the available arguments are `p` (float,
+          default: ``2.``), `n_bins` (int, default: ``100``) and `n_layers`
+          (int, default: ``1``).
+        - If ``metric == 'silhouette'`` the available arguments are `p` (float,
+          default: ``2.``), `power` (float, default: ``1.``) and `n_bins` (int,
+          default: ``100``).
+        - If ``metric == 'heat'`` the available arguments are `p` (float,
+          default: ``2.``), `sigma` (float, default: ``0.1``) and `n_bins`
+          (int, default: ``100``).
         - If ``metric == 'persistence_image'`` the available arguments are `p`
-          (float, default: ``2.``), `sigma` (float, default: ``1.``),
-          `n_bins` (int, default: ``100``) and `weight_function`
-          (callable or None, default: ``None``).
+          (float, default: ``2.``), `sigma` (float, default: ``0.1``), `n_bins`
+          (int, default: ``100``) and `weight_function` (callable or None,
+          default: ``None``).
 
     order : float or None, optional, default: ``2.``
         If ``None``, :meth:`transform` returns for each pair of diagrams a
@@ -98,9 +102,9 @@ class PairwiseDistance(BaseEstimator, TransformerMixin):
     ----------
     effective_metric_params_ : dict
         Dictionary containing all information present in `metric_params` as
-        well as on any relevant quantities computed in :meth:`fit`.
+        well as relevant quantities computed in :meth:`fit`.
 
-    homology_dimensions_ : list
+    homology_dimensions_ : tuple
         Homology dimensions seen in :meth:`fit`, sorted in ascending order.
 
     See also
@@ -174,15 +178,23 @@ def fit(self, X, y=None):
         validate_params(
             self.effective_metric_params_, _AVAILABLE_METRICS[self.metric])
 
-        self.homology_dimensions_ = sorted(set(X[0, :, 2]))
+        # Find the unique homology dimensions in the 3D array X passed to `fit`
+        # assuming that they can all be found in its zero-th entry
+        homology_dimensions_fit = np.unique(X[0, :, 2])
+        self.homology_dimensions_ = \
+            _homology_dimensions_to_sorted_ints(homology_dimensions_fit)
 
         self.effective_metric_params_['samplings'], \
             self.effective_metric_params_['step_sizes'] = \
-            _bin(X, metric=self.metric, **self.effective_metric_params_)
+            _bin(X, self.metric, **self.effective_metric_params_)
 
         if self.metric == 'persistence_image':
-            self.effective_metric_params_['weights'] = \
-                _calculate_weights(X, **self.effective_metric_params_)
+            weight_function = self.effective_metric_params_.get(
+                'weight_function', None
+                )
+            weight_function = \
+                np.ones_like if weight_function is None else weight_function
+            self.effective_metric_params_['weight_function'] = weight_function
 
         self._X = X
         return self