Skip to content

Commit

Permalink
Refactor gtda/diagrams/_metrics.py to fix several bugs
Browse files Browse the repository at this point in the history
- Change computation of heat/persistence image distances and amplitudes to yield the continuum limit when `n_bins` tends to infinity.
- Make `sigma` in persistence-image-- and heat-kernel--based representations/distances/amplitudes measured in the same units as the filtration parameter (not in pixels), thus decoupling its effect from the effect of `n_bins`. Also change the default value in PairwiseDistance, Amplitude, HeatKernel and PersistenceImage from 1. to 0.1.
- Remove trivial points from diagrams before creating a sampled image in `heats` and `persistence_images`. This ensures in particular that the trivial points really give no contribution regardless of the weight function used for persistence images.
- Finish fixing giotto-ai#438 and similar issues with PairwiseDistance when the metric is persistence-image--based.
- Ensure `silhouettes` does not create NaNs when a subdiagram is trivial.
- Change default value of `power` in `silhouette_amplitudes` and `silhouette_distances` to 1., to agree with Amplitude and PairwiseDistance docs.
- Fix use of np.array_equal in `persistence_image_distances` and `heat_distances` -- so far the boolean check for equal inputs always failed due to the in-place modifications of `diagrams_1`.
- No longer store weights in `effective_metric_params_` attribute of PairwiseDistance and Amplitude when the metric is persistence-image--based.
- Remove _heat from gtda.diagrams._metrics.
- Remove identity from gtda.diagrams._utils and make default `weight_function` kwargs equal to np.ones_like everywhere to agree with default in `PersistenceImage`.
- Other style improvements (variable names, linting)
  • Loading branch information
ulupo committed Aug 12, 2020
1 parent ce6fef0 commit 433ec6c
Show file tree
Hide file tree
Showing 5 changed files with 150 additions and 103 deletions.
197 changes: 124 additions & 73 deletions gtda/diagrams/_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,55 +82,76 @@ def landscapes(diagrams, sampling, n_layers):
return fibers


def _heat(image, sampled_diagram, sigma):
_sample_image(image, sampled_diagram)
gaussian_filter(image, sigma, mode="constant", output=image)


def heats(diagrams, sampling, step_size, sigma):
# WARNING: modifies `diagrams` in place
heats_ = \
np.zeros((len(diagrams), len(sampling), len(sampling)), dtype=float)

diagrams[diagrams < sampling[0, 0]] = sampling[0, 0]
diagrams[diagrams > sampling[-1, 0]] = sampling[-1, 0]
diagrams[...] = (diagrams - sampling[0, 0]) / step_size
diagrams = diagrams.astype(int)

[_heat(heats_[i], sampled_diagram, sigma)
for i, sampled_diagram in enumerate(diagrams)]

heats_ = heats_ - np.transpose(heats_, (0, 2, 1))
# Set the values outside of the sampling range
first_sampling, last_sampling = sampling[0, 0, 0], sampling[-1, 0, 0]
diagrams[diagrams < first_sampling] = first_sampling
diagrams[diagrams > last_sampling] = last_sampling

# Calculate the value of `sigma` in pixel units
sigma_pixel = sigma / step_size

for i, diagram in enumerate(diagrams):
nontrivial_points_idx = np.flatnonzero(diagram[:, 1] != diagram[:, 0])
diagram_nontrivial_pixel_coords = np.array(
(diagram - first_sampling) / step_size, dtype=int
)[nontrivial_points_idx]
image = heats_[i]
_sample_image(image, diagram_nontrivial_pixel_coords)
gaussian_filter(image, sigma_pixel, mode="constant", output=image)

heats_ -= np.transpose(heats_, (0, 2, 1))
heats_ /= (step_size ** 2)
heats_ = np.rot90(heats_, k=1, axes=(1, 2))
return heats_


def persistence_images(diagrams, sampling, step_size, weights, sigma):
def persistence_images(diagrams, sampling, step_size, sigma, weights):
# For persistence images, `sampling` is a tall matrix with two columns
# (the first for birth and the second for persistence), and `step_size` is
# a 2d array
# WARNING: modifies `diagrams` in place
persistence_images_ = \
np.zeros((len(diagrams), len(sampling), len(sampling)), dtype=float)
# Transform diagrams from (birth, death, dim) to (birth, persistence, dim)
diagrams[:, :, 1] -= diagrams[:, :, 0]

sigma_pixel = []
first_samplings = sampling[0]
last_samplings = sampling[-1]
for ax in [0, 1]:
diagrams_ax = diagrams[:, :, ax]
# Set the values outside of the sampling range
diagrams_ax[diagrams_ax < sampling[0, ax]] = sampling[0, ax]
diagrams_ax[diagrams_ax > sampling[-1, ax]] = sampling[-1, ax]
# Convert into pixel
diagrams_ax[...] = (diagrams_ax - sampling[0, ax]) / step_size[ax]
diagrams = diagrams.astype(int)

# Sample the image
[_sample_image(persistence_images_[i], sampled_diagram)
for i, sampled_diagram in enumerate(diagrams)]

# Apply the weights
persistence_images_ *= weights
diagrams_ax[diagrams_ax < first_samplings[ax]] = first_samplings[ax]
diagrams_ax[diagrams_ax > last_samplings[ax]] = last_samplings[ax]
# Calculate the value of the component of `sigma` in pixel units
sigma_pixel.append(sigma / step_size[ax])

# diagrams_ax[...] = (diagrams_ax - first_sampling) / step_size[ax]
#
# diagrams = diagrams.astype(int)

# Sample the image, apply the weights, smoothen
for i, diagram in enumerate(diagrams):
nontrivial_points_idx = np.flatnonzero(diagram[:, 1])
diagram_nontrivial_pixel_coords = np.array(
(diagram - first_samplings) / step_size, dtype=int
)[nontrivial_points_idx]
image = persistence_images_[i]
_sample_image(image, diagram_nontrivial_pixel_coords)
image *= weights
gaussian_filter(image, sigma_pixel, mode="constant", output=image)

# Smoothen the weighted image
for image in persistence_images_:
gaussian_filter(image, sigma, mode="constant", output=image)
gaussian_filter(image, sigma_pixel, mode="constant", output=image)

persistence_images_ = np.rot90(persistence_images_, k=1, axes=(1, 2))
persistence_images_ /= np.product(step_size)
return persistence_images_


Expand All @@ -139,15 +160,17 @@ def silhouettes(diagrams, sampling, power, **kwargs):
returned by _bin) of a one-dimensional range.
"""
sampling = np.transpose(sampling, axes=(1, 2, 0))
weights = np.diff(diagrams, axis=2)[:, :, [0]]
weights = np.diff(diagrams, axis=2)
if power > 8.:
weights = weights/np.max(weights, axis=1, keepdims=True)
weights = weights**power
weights = weights / np.max(weights, axis=1, keepdims=True)
weights = weights ** power
total_weights = np.sum(weights, axis=1)
# Next line is a trick to avoid NaNs when computing `fibers_weighted_sum`
total_weights[total_weights == 0.] = np.inf
midpoints = (diagrams[:, :, [1]] + diagrams[:, :, [0]]) / 2.
heights = (diagrams[:, :, [1]] - diagrams[:, :, [0]]) / 2.
fibers = np.maximum(-np.abs(sampling - midpoints) + heights, 0)
fibers_weighted_sum = np.sum(weights*fibers, axis=1)/total_weights
fibers_weighted_sum = np.sum(weights * fibers, axis=1) / total_weights
return fibers_weighted_sum


Expand All @@ -168,8 +191,9 @@ def wasserstein_distances(diagrams_1, diagrams_2, p=2, delta=0.01, **kwargs):
def betti_distances(
diagrams_1, diagrams_2, sampling, step_size, p=2., **kwargs
):
are_arrays_equal = np.array_equal(diagrams_1, diagrams_2)
betti_curves_1 = betti_curves(diagrams_1, sampling)
if np.array_equal(diagrams_1, diagrams_2):
if are_arrays_equal:
unnorm_dist = squareform(pdist(betti_curves_1, "minkowski", p=p))
return (step_size ** (1 / p)) * unnorm_dist
betti_curves_2 = betti_curves(diagrams_2, sampling)
Expand Down Expand Up @@ -200,48 +224,62 @@ def landscape_distances(


def heat_distances(
diagrams_1, diagrams_2, sampling, step_size, sigma=1., p=2., **kwargs
diagrams_1, diagrams_2, sampling, step_size, sigma=0.1, p=2., **kwargs
):
heat_1 = heats(diagrams_1, sampling, step_size, sigma).\
# WARNING: `heats` modifies `diagrams` in place
step_size_factor = step_size ** (2 / p)
are_arrays_equal = np.array_equal(diagrams_1, diagrams_2)
heats_1 = heats(diagrams_1, sampling, step_size, sigma).\
reshape(len(diagrams_1), -1)
if np.array_equal(diagrams_1, diagrams_2):
unnorm_dist = squareform(pdist(heat_1, "minkowski", p=p))
return (step_size ** (1 / p)) * unnorm_dist
heat_2 = heats(diagrams_2, sampling, step_size, sigma).\
if are_arrays_equal:
distances = pdist(heats_1, "minkowski", p=p)
distances *= step_size_factor
return squareform(distances)
heats_2 = heats(diagrams_2, sampling, step_size, sigma).\
reshape(len(diagrams_2), -1)
unnorm_dist = cdist(heat_1, heat_2, "minkowski", p=p)
return (step_size ** (1 / p)) * unnorm_dist
distances = cdist(heats_1, heats_2, "minkowski", p=p)
distances *= step_size_factor
return distances


def persistence_image_distances(
diagrams_1, diagrams_2, sampling, step_size, weight_function=identity,
sigma=1., p=2., **kwargs
diagrams_1, diagrams_2, sampling, step_size, sigma=0.1,
weight_function=np.ones_like, p=2., **kwargs
):
weights = weight_function(sampling)
persistence_image_1 = \
persistence_images(diagrams_1, sampling, step_size, weights, sigma).\
# For persistence images, `sampling` is a tall matrix with two columns
# (the first for birth and the second for persistence), and `step_size` is
# a 2d array
weights = weight_function(sampling[:, 1])
step_sizes_factor = np.product(step_size) ** (1 / p)
# WARNING: `persistence_images` modifies `diagrams` in place
are_arrays_equal = np.array_equal(diagrams_1, diagrams_2)
persistence_images_1 = \
persistence_images(diagrams_1, sampling, step_size, sigma, weights).\
reshape(len(diagrams_1), -1)
if np.array_equal(diagrams_1, diagrams_2):
unnorm_dist = squareform(pdist(persistence_image_1, "minkowski", p=p))
return (step_size ** (1 / p)) * unnorm_dist
persistence_image_2 = persistence_images(
diagrams_2, sampling, step_size, weights, sigma
if are_arrays_equal:
distances = pdist(persistence_images_1, "minkowski", p=p)
distances *= step_sizes_factor
return squareform(distances)
persistence_images_2 = persistence_images(
diagrams_2, sampling, step_size, sigma, weights
).reshape(len(diagrams_2), -1)
unnorm_dist = cdist(
persistence_image_1, persistence_image_2, "minkowski", p=p
distances = cdist(
persistence_images_1, persistence_images_2, "minkowski", p=p
)
return (step_size ** (1 / p)) * unnorm_dist
distances *= step_sizes_factor
return distances


def silhouette_distances(
diagrams_1, diagrams_2, sampling, step_size, power=2., p=2., **kwargs
diagrams_1, diagrams_2, sampling, step_size, power=1., p=2., **kwargs
):
silhouette_1 = silhouettes(diagrams_1, sampling, power)
if np.array_equal(diagrams_1, diagrams_2):
unnorm_dist = squareform(pdist(silhouette_1, 'minkowski', p=p))
are_arrays_equal = np.array_equal(diagrams_1, diagrams_2)
silhouettes_1 = silhouettes(diagrams_1, sampling, power)
if are_arrays_equal:
unnorm_dist = squareform(pdist(silhouettes_1, 'minkowski', p=p))
else:
silhouette_2 = silhouettes(diagrams_2, sampling, power)
unnorm_dist = cdist(silhouette_1, silhouette_2, 'minkowski', p=p)
silhouettes_2 = silhouettes(diagrams_2, sampling, power)
unnorm_dist = cdist(silhouettes_1, silhouettes_2, 'minkowski', p=p)
return (step_size ** (1 / p)) * unnorm_dist


Expand Down Expand Up @@ -315,27 +353,39 @@ def landscape_amplitudes(
return (step_size ** (1 / p)) * np.linalg.norm(ls, axis=1, ord=p)


def heat_amplitudes(diagrams, sampling, step_size, sigma=1., p=2., **kwargs):
heat = heats(diagrams, sampling, step_size, sigma)
return np.linalg.norm(heat, axis=(1, 2), ord=p)
def heat_amplitudes(diagrams, sampling, step_size, sigma=0.1, p=2., **kwargs):
# WARNING: `heats` modifies `diagrams` in place
step_size_factor = step_size ** (2 / p)
heats_ = heats(diagrams, sampling, step_size, sigma).\
reshape(len(diagrams), -1)
amplitudes = np.linalg.norm(heats_, axis=1, ord=p)
amplitudes *= step_size_factor
return amplitudes


def persistence_image_amplitudes(
diagrams, sampling, step_size, weight_function=identity, sigma=1.,
diagrams, sampling, step_size, sigma=0.1, weight_function=np.ones_like,
p=2., **kwargs
):
weights = weight_function(sampling)
persistence_image = persistence_images(
diagrams, sampling, step_size, weights, sigma
)
return np.linalg.norm(persistence_image, axis=(1, 2), ord=p)
# For persistence images, `sampling` is a tall matrix with two columns
# (the first for birth and the second for persistence), and `step_size` is
# a 2d array
weights = weight_function(sampling[:, 1])
step_sizes_factor = np.product(step_size) ** (1 / p)
# WARNING: `persistence_images` modifies `diagrams` in place
persistence_images_ = persistence_images(
diagrams, sampling, step_size, sigma, weights
).reshape(len(diagrams), -1)
amplitudes = np.linalg.norm(persistence_images_, axis=1, ord=p)
amplitudes *= step_sizes_factor
return amplitudes


def silhouette_amplitudes(
diagrams, sampling, step_size, power=2., p=2., **kwargs
diagrams, sampling, step_size, power=1., p=2., **kwargs
):
sht = silhouettes(diagrams, sampling, power)
return (step_size ** (1 / p)) * np.linalg.norm(sht, axis=1, ord=p)
silhouettes_ = silhouettes(diagrams, sampling, power)
return (step_size ** (1 / p)) * np.linalg.norm(silhouettes_, axis=1, ord=p)


implemented_amplitude_recipes = {
Expand Down Expand Up @@ -365,7 +415,8 @@ def _parallel_amplitude(X, metric, metric_params, homology_dimensions, n_jobs):
amplitude_arrays = Parallel(n_jobs=n_jobs)(
delayed(amplitude_func)(
_subdiagrams(X[s], [dim], remove_dim=True),
sampling=samplings[dim], step_size=step_sizes[dim],
sampling=samplings[dim],
step_size=step_sizes[dim],
**effective_metric_params
)
for dim in homology_dimensions
Expand Down
12 changes: 4 additions & 8 deletions gtda/diagrams/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,6 @@
import numpy as np


def identity(x):
"""The identity function."""
return x


def _subdiagrams(X, homology_dimensions, remove_dim=False):
"""For each diagram in a collection, extract the subdiagrams in a given
list of homology dimensions. It is assumed that all diagrams in X contain
Expand Down Expand Up @@ -37,9 +32,10 @@ def _pad(X, max_diagram_sizes):
return X_padded


def _sample_image(image, sampled_diag):
# NOTE: Modifies `image` in-place
unique, counts = np.unique(sampled_diag, axis=0, return_counts=True)
def _sample_image(image, diagram_pixel_coords):
# WARNING: Modifies `image` in-place
unique, counts = \
np.unique(diagram_pixel_coords, axis=0, return_counts=True)
unique = tuple(tuple(row) for row in unique.astype(np.int).T)
image[unique] = counts

Expand Down
16 changes: 8 additions & 8 deletions gtda/diagrams/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,10 @@ class PairwiseDistance(BaseEstimator, TransformerMixin):
default: ``2.``), `power` (float, default: ``1.``) and `n_bins` (int,
default: ``100``).
- If ``metric == 'heat'`` the available arguments are `p` (float,
default: ``2.``), `sigma` (float, default: ``1.``) and `n_bins` (int,
default: ``100``).
default: ``2.``), `sigma` (float, default: ``0.1``) and `n_bins`
(int, default: ``100``).
- If ``metric == 'persistence_image'`` the available arguments are `p`
(float, default: ``2.``), `sigma` (float, default: ``1.``), `n_bins`
(float, default: ``2.``), `sigma` (float, default: ``0.1``), `n_bins`
(int, default: ``100``) and `weight_function` (callable or None,
default: ``None``).
Expand Down Expand Up @@ -185,11 +185,11 @@ def fit(self, X, y=None):
_bin(X, self.metric, **self.effective_metric_params_)

if self.metric == 'persistence_image':
weight_function = self.effective_metric_params_['weight_function']
samplings = self.effective_metric_params_['samplings']
weights = {dim: weight_function(samplings_dim[:, 1])
for dim, samplings_dim in samplings.items()}
self.effective_metric_params_['weights'] = weights
weight_function = self.effective_metric_params_.get(
'weight_function', None
)
if weight_function is None:
self.effective_metric_params_['weight_function'] = np.ones_like

self._X = X
return self
Expand Down
16 changes: 8 additions & 8 deletions gtda/diagrams/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,10 +233,10 @@ class Amplitude(BaseEstimator, TransformerMixin):
default: ``2.``), `power` (float, default: ``1.``) and `n_bins` (int,
default: ``100``).
- If ``metric == 'heat'`` the available arguments are `p` (float,
default: ``2.``), `sigma` (float, default: ``1.``) and `n_bins` (int,
default: ``100``).
default: ``2.``), `sigma` (float, default: ``0.1``) and `n_bins`
(int, default: ``100``).
- If ``metric == 'persistence_image'`` the available arguments are `p`
(float, default: ``2.``), `sigma` (float, default: ``1.``), `n_bins`
(float, default: ``2.``), `sigma` (float, default: ``0.1``), `n_bins`
(int, default: ``100``) and `weight_function` (callable or None,
default: ``None``).
Expand Down Expand Up @@ -334,11 +334,11 @@ def fit(self, X, y=None):
_bin(X, self.metric, **self.effective_metric_params_)

if self.metric == 'persistence_image':
weight_function = self.effective_metric_params_['weight_function']
samplings = self.effective_metric_params_['samplings']
weights = {dim: weight_function(samplings_dim[:, 1])
for dim, samplings_dim in samplings.items()}
self.effective_metric_params_['weights'] = weights
weight_function = self.effective_metric_params_.get(
'weight_function', None
)
if weight_function is None:
self.effective_metric_params_['weight_function'] = np.ones_like

return self

Expand Down
Loading

0 comments on commit 433ec6c

Please sign in to comment.