From c26cdac9104d0258df89f557f531da3b111ad677 Mon Sep 17 00:00:00 2001 From: Umberto Lupo Date: Thu, 5 Nov 2020 09:44:26 +0100 Subject: [PATCH 01/17] Refactor ParallelClustering to return array of shape the length of the data --- gtda/mapper/cluster.py | 81 ++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/gtda/mapper/cluster.py b/gtda/mapper/cluster.py index 2e24ccb7b..431d68619 100644 --- a/gtda/mapper/cluster.py +++ b/gtda/mapper/cluster.py @@ -51,8 +51,8 @@ class ParallelClustering(BaseEstimator): clusters_ : list of list of tuple Labels and indices of each cluster found in :meth:`fit`. The i-th - entry corresponds to the i-th portion of the data; it is a list - of triples of the form ``(i, label, indices)``, where ``label`` is a + entry corresponds to the i-th portion of the data; it is a list of + triples of the form ``(i, label, indices)``, where ``label`` is a cluster label and ``indices`` is the array of indices of points belonging to cluster ``(i, label)``. 
@@ -138,39 +138,53 @@ def fit(self, X, y=None, sample_weight=None): sample_weights = [None] * masks.shape[1] if self._precomputed: - single_fitter = self._fit_single_abs_labels_precomputed + single_labels_idx = self._single_labels_idx_precomputed else: - single_fitter = self._fit_single_abs_labels + single_labels_idx = self._single_labels_idx - self.clusterers_ = Parallel( + labels_idx = Parallel( n_jobs=self.n_jobs, prefer=self.parallel_backend_prefer - )(delayed(single_fitter)(X_tot, - np.flatnonzero(mask), - mask_num, - sample_weight=sample_weights[mask_num]) + )(delayed(single_labels_idx)( + X_tot, + np.flatnonzero(mask), + mask_num, + sample_weight=sample_weights[mask_num] + ) for mask_num, mask in enumerate(masks.T)) - self.clusters_ = [clusterer.abs_labels_ for clusterer in - self.clusterers_] - return self - def _fit_single_abs_labels(self, X, relative_indices, mask_num, - sample_weight=None): - cloned_clusterer, unique_labels, unique_labels_inverse = \ - self._fit_single(X, relative_indices, sample_weight) - self._create_abs_labels(cloned_clusterer, relative_indices, mask_num, - unique_labels, unique_labels_inverse) - return cloned_clusterer + self.labels_ = np.empty(len(X_tot), dtype=object) + self.labels_[:] = [[]] * len(X_tot) + for relative_indices, mask_num_rel_labels in labels_idx: + self.labels_[relative_indices] += mask_num_rel_labels + + return self - def _fit_single_abs_labels_precomputed(self, X, relative_indices, mask_num, - sample_weight=None): + def _single_labels_idx_precomputed(self, X, relative_indices, mask_num, + sample_weight=None): relative_2d_indices = np.ix_(relative_indices, relative_indices) - cloned_clusterer, unique_labels, unique_labels_inverse = \ - self._fit_single(X, relative_2d_indices, sample_weight) - self._create_abs_labels(cloned_clusterer, relative_indices, mask_num, - unique_labels, unique_labels_inverse) - return cloned_clusterer - def _fit_single(self, X, relative_indices, sample_weight): + mask_num_rel_labels = 
np.empty(len(relative_indices), dtype=object) + mask_num_rel_labels[:] = [ + [(mask_num, label)] + for label in self._single_labels(X, relative_2d_indices, + sample_weight) + ] + + return relative_indices, mask_num_rel_labels + + def _single_labels_idx(self, X, relative_indices, mask_num, + sample_weight=None): + + mask_num_rel_labels = np.empty(len(relative_indices), dtype=object) + mask_num_rel_labels[:] = [ + [(mask_num, label)] + for label in self._single_labels(X, relative_indices, + sample_weight) + ] + + return relative_indices, mask_num_rel_labels + + def _single_labels(self, X, relative_indices, sample_weight): cloned_clusterer = clone(self.clusterer) X_sub = X[relative_indices] @@ -180,16 +194,7 @@ def _fit_single(self, X, relative_indices, sample_weight): else: cloned_clusterer.fit(X_sub) - unique_labels, unique_labels_inverse = np.unique( - cloned_clusterer.labels_, return_inverse=True) - return cloned_clusterer, unique_labels, unique_labels_inverse - - @staticmethod - def _create_abs_labels(cloned_clusterer, relative_indices, mask_num, - unique_labels, inv): - cloned_clusterer.abs_labels_ = [ - (mask_num, label, relative_indices[inv == i]) - for i, label in enumerate(unique_labels)] + return cloned_clusterer.labels_ def fit_predict(self, X, y=None, sample_weight=None): """Fit to the data, and return the found clusters. @@ -220,7 +225,7 @@ def fit_predict(self, X, y=None, sample_weight=None): """ self.fit(X, sample_weight=sample_weight) - return self.clusters_ + return self.labels_ def transform(self, X, y=None): """Not implemented. 
From 715f9cdf6545b1375428ca969182569aa6e7faba Mon Sep 17 00:00:00 2001 From: Umberto Lupo Date: Sun, 15 Nov 2020 16:14:31 +0100 Subject: [PATCH 02/17] Avoid transpositions in ParallelClustering --- gtda/mapper/cluster.py | 16 ++++++---------- gtda/mapper/pipeline.py | 5 +++-- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/gtda/mapper/cluster.py b/gtda/mapper/cluster.py index 431d68619..abece7568 100644 --- a/gtda/mapper/cluster.py +++ b/gtda/mapper/cluster.py @@ -144,16 +144,12 @@ def fit(self, X, y=None, sample_weight=None): labels_idx = Parallel( n_jobs=self.n_jobs, prefer=self.parallel_backend_prefer - )(delayed(single_labels_idx)( - X_tot, - np.flatnonzero(mask), - mask_num, - sample_weight=sample_weights[mask_num] - ) - for mask_num, mask in enumerate(masks.T)) + )(delayed(single_labels_idx)(X_tot, np.flatnonzero(masks[:, i]), i, + sample_weight=sample_weights[i]) + for i in range(masks.shape[1])) self.labels_ = np.empty(len(X_tot), dtype=object) - self.labels_[:] = [[]] * len(X_tot) + self.labels_[:] = [tuple([])] * len(X_tot) for relative_indices, mask_num_rel_labels in labels_idx: self.labels_[relative_indices] += mask_num_rel_labels @@ -165,7 +161,7 @@ def _single_labels_idx_precomputed(self, X, relative_indices, mask_num, mask_num_rel_labels = np.empty(len(relative_indices), dtype=object) mask_num_rel_labels[:] = [ - [(mask_num, label)] + ((mask_num, label),) for label in self._single_labels(X, relative_2d_indices, sample_weight) ] @@ -177,7 +173,7 @@ def _single_labels_idx(self, X, relative_indices, mask_num, mask_num_rel_labels = np.empty(len(relative_indices), dtype=object) mask_num_rel_labels[:] = [ - [(mask_num, label)] + ((mask_num, label),) for label in self._single_labels(X, relative_indices, sample_weight) ] diff --git a/gtda/mapper/pipeline.py b/gtda/mapper/pipeline.py index 8dc48f89a..421e2f556 100644 --- a/gtda/mapper/pipeline.py +++ b/gtda/mapper/pipeline.py @@ -372,8 +372,8 @@ def make_mapper_pipeline(scaler=None, else: 
_scaler = scaler - # If filter_func is not a scikit-learn transformer, hope it as a - # callable to be applied on each row separately. Then attempt to create a + # If filter_func is not a scikit-learn transformer, hope it is a callable + # to be applied on each row separately. Then attempt to create a # FunctionTransformer object to implement this behaviour. if filter_func is None: from sklearn.decomposition import PCA @@ -425,4 +425,5 @@ def make_mapper_pipeline(scaler=None, mapper_pipeline = MapperPipeline( steps=all_steps, memory=memory, verbose=verbose) + return mapper_pipeline From 407947d47dad9f95be1c97cc31b06db06993f205 Mon Sep 17 00:00:00 2001 From: Umberto Lupo Date: Mon, 16 Nov 2020 10:30:18 +0100 Subject: [PATCH 03/17] Refactor Nerve following change in ParallelClustering output --- gtda/mapper/nerve.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/gtda/mapper/nerve.py b/gtda/mapper/nerve.py index 6c0289070..87cf5a471 100644 --- a/gtda/mapper/nerve.py +++ b/gtda/mapper/nerve.py @@ -130,22 +130,28 @@ def fit_transform(self, X, y=None): """ # TODO: Include a validation step for X # Graph construction -- vertices with their metadata - nodes = reduce(iconcat, X, []) - graph = ig.Graph(len(nodes)) - - # Since `nodes` is a list, say of length N, of triples of the form - # (pullback_set_label, partial_cluster_label, node_elements), - # zip(*nodes) generates three tuples of length N, each corresponding to - # a type of node attribute. 
- node_attributes = zip(*nodes) - attribute_names = ["pullback_set_label", "partial_cluster_label", - "node_elements"] - for i, node_attribute in enumerate(node_attributes): - graph.vs[attribute_names[i]] = node_attribute + nodes_dict = {} + for i, sample in enumerate(X): + for node_id_pair in sample: + nodes_dict.setdefault(node_id_pair, []).append(i) + nodes_dict = {key: np.array(value, dtype=np.int32) + for key, value in nodes_dict.items()} + + graph = ig.Graph(len(nodes_dict)) + + # `nodes_dict` is a dictionary of, say, N key-value pairs of the form + # (pullback_set_label, partial_cluster_label): node_elements. Hence, + # zip(*nodes_dict) generates two tuples of length N, each corresponding + # to a type of node attribute in the final graph. + node_attributes = zip(*nodes_dict) + graph.vs["pullback_set_label"] = next(node_attributes) + graph.vs["partial_cluster_label"] = next(node_attributes) + graph.vs["node_elements"] = [*nodes_dict.values()] # Graph construction -- edges with weights given by intersection sizes. # In general, we need all information in `nodes` to narrow down the set - # of combinations to check when `contract_nodes` is True + # of combinations to check, especially when `contract_nodes` is True. 
+ nodes = zip(*zip(*nodes_dict), nodes_dict.values()) node_index_pairs, weights, intersections, mapping = \ self._generate_edge_data(nodes) graph.es["weight"] = 1 From a378999c36c6ed55f4d97a1038449eec29634efa Mon Sep 17 00:00:00 2001 From: Umberto Lupo Date: Sat, 21 Nov 2020 22:01:51 +0100 Subject: [PATCH 04/17] Simplify structure --- gtda/mapper/cluster.py | 76 ++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 43 deletions(-) diff --git a/gtda/mapper/cluster.py b/gtda/mapper/cluster.py index abece7568..a9d3a4551 100644 --- a/gtda/mapper/cluster.py +++ b/gtda/mapper/cluster.py @@ -131,66 +131,56 @@ def fit(self, X, y=None, sample_weight=None): "of rows.") self._validate_clusterer() - if sample_weight is not None: - sample_weights = [sample_weight[masks[:, i]] - for i in range(masks.shape[1])] + fit_params = signature(self.clusterer.fit).parameters + if sample_weight is not None and "sample_weight" in fit_params: + sample_weight_computer = self._sample_weight_computer else: - sample_weights = [None] * masks.shape[1] + sample_weight_computer = lambda *args: {} if self._precomputed: - single_labels_idx = self._single_labels_idx_precomputed + idx_computer = self._idx_computer_precomputed else: - single_labels_idx = self._single_labels_idx - - labels_idx = Parallel( - n_jobs=self.n_jobs, prefer=self.parallel_backend_prefer - )(delayed(single_labels_idx)(X_tot, np.flatnonzero(masks[:, i]), i, - sample_weight=sample_weights[i]) - for i in range(masks.shape[1])) + idx_computer = self._idx_computer + + labels_idx = Parallel(n_jobs=self.n_jobs, + prefer=self.parallel_backend_prefer)( + delayed(self._single_labels_idx)( + X_tot[idx_computer(i, masks)], + i, + sample_weight_computer(i, masks, sample_weight) + ) + for i in range(masks.shape[1]) + ) self.labels_ = np.empty(len(X_tot), dtype=object) self.labels_[:] = [tuple([])] * len(X_tot) - for relative_indices, mask_num_rel_labels in labels_idx: - self.labels_[relative_indices] += 
mask_num_rel_labels + for i, mask_num_rel_labels in enumerate(labels_idx): + self.labels_[idx_computer(i, masks)] += mask_num_rel_labels return self - def _single_labels_idx_precomputed(self, X, relative_indices, mask_num, - sample_weight=None): - relative_2d_indices = np.ix_(relative_indices, relative_indices) - - mask_num_rel_labels = np.empty(len(relative_indices), dtype=object) - mask_num_rel_labels[:] = [ - ((mask_num, label),) - for label in self._single_labels(X, relative_2d_indices, - sample_weight) - ] + @staticmethod + def _idx_computer(i, masks): + return np.flatnonzero(masks[:, i]) - return relative_indices, mask_num_rel_labels + @staticmethod + def _idx_computer_precomputed(i, masks): + idx_one = np.flatnonzero(masks[:, i]) + return np.ix_(idx_one, idx_one) - def _single_labels_idx(self, X, relative_indices, mask_num, - sample_weight=None): + @staticmethod + def _sample_weight_computer(i, masks, sample_weight): + return {"sample_weight": sample_weight[masks[:, i]]} - mask_num_rel_labels = np.empty(len(relative_indices), dtype=object) + def _single_labels_idx(self, X, mask_num, kwargs): + cloned_clusterer = clone(self.clusterer) + mask_num_rel_labels = np.empty(len(X), dtype=object) mask_num_rel_labels[:] = [ ((mask_num, label),) - for label in self._single_labels(X, relative_indices, - sample_weight) + for label in cloned_clusterer.fit(X, **kwargs).labels_ ] - return relative_indices, mask_num_rel_labels - - def _single_labels(self, X, relative_indices, sample_weight): - cloned_clusterer = clone(self.clusterer) - X_sub = X[relative_indices] - - fit_params = signature(cloned_clusterer.fit).parameters - if 'sample_weight' in fit_params: - cloned_clusterer.fit(X_sub, sample_weight=sample_weight) - else: - cloned_clusterer.fit(X_sub) - - return cloned_clusterer.labels_ + return mask_num_rel_labels def fit_predict(self, X, y=None, sample_weight=None): """Fit to the data, and return the found clusters. 
From 03a9f5fe3a21e1c45b7cf6b18b6fc54c53f4a70c Mon Sep 17 00:00:00 2001 From: Umberto Lupo Date: Sun, 22 Nov 2020 12:57:00 +0100 Subject: [PATCH 05/17] Simplify further --- gtda/mapper/cluster.py | 55 ++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/gtda/mapper/cluster.py b/gtda/mapper/cluster.py index a9d3a4551..ad6aa1530 100644 --- a/gtda/mapper/cluster.py +++ b/gtda/mapper/cluster.py @@ -133,54 +133,41 @@ def fit(self, X, y=None, sample_weight=None): fit_params = signature(self.clusterer.fit).parameters if sample_weight is not None and "sample_weight" in fit_params: - sample_weight_computer = self._sample_weight_computer + self._sample_weight_computer = lambda rel_indices, sample_weight: \ + {"sample_weight": sample_weight[rel_indices]} else: - sample_weight_computer = lambda *args: {} + self._sample_weight_computer = lambda *args: {} if self._precomputed: - idx_computer = self._idx_computer_precomputed + self._indices_computer = lambda rel_indices: \ + np.ix_(rel_indices, rel_indices) else: - idx_computer = self._idx_computer - - labels_idx = Parallel(n_jobs=self.n_jobs, - prefer=self.parallel_backend_prefer)( - delayed(self._single_labels_idx)( - X_tot[idx_computer(i, masks)], - i, - sample_weight_computer(i, masks, sample_weight) - ) + self._indices_computer = lambda rel_indices: rel_indices + + labels_single = Parallel(n_jobs=self.n_jobs, + prefer=self.parallel_backend_prefer)( + delayed(self._labels_single)(X_tot, np.flatnonzero(masks[:, i]), + sample_weight) for i in range(masks.shape[1]) ) self.labels_ = np.empty(len(X_tot), dtype=object) self.labels_[:] = [tuple([])] * len(X_tot) - for i, mask_num_rel_labels in enumerate(labels_idx): - self.labels_[idx_computer(i, masks)] += mask_num_rel_labels + for i, (rel_indices, rel_labels) in enumerate(labels_single): + n_labels = len(rel_labels) + labels_i = np.empty(n_labels, dtype=object) + labels_i[:] = [((i, rel_label),) for rel_label in rel_labels] + 
self.labels_[rel_indices] += labels_i return self - @staticmethod - def _idx_computer(i, masks): - return np.flatnonzero(masks[:, i]) - - @staticmethod - def _idx_computer_precomputed(i, masks): - idx_one = np.flatnonzero(masks[:, i]) - return np.ix_(idx_one, idx_one) - - @staticmethod - def _sample_weight_computer(i, masks, sample_weight): - return {"sample_weight": sample_weight[masks[:, i]]} - - def _single_labels_idx(self, X, mask_num, kwargs): + def _labels_single(self, X, rel_indices, sample_weight): cloned_clusterer = clone(self.clusterer) - mask_num_rel_labels = np.empty(len(X), dtype=object) - mask_num_rel_labels[:] = [ - ((mask_num, label),) - for label in cloned_clusterer.fit(X, **kwargs).labels_ - ] + X_sub = X[self._indices_computer(rel_indices)] + kwargs = self._sample_weight_computer(rel_indices, sample_weight) - return mask_num_rel_labels + return (rel_indices, + cloned_clusterer.fit(X_sub, **kwargs).labels_) def fit_predict(self, X, y=None, sample_weight=None): """Fit to the data, and return the found clusters. From 8111ce04bb8bffffa852f1e22de5647d6627ee41 Mon Sep 17 00:00:00 2001 From: Umberto Lupo Date: Mon, 23 Nov 2020 10:24:47 +0100 Subject: [PATCH 06/17] Solve performance problem with joblib and refitting --- gtda/mapper/cluster.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/gtda/mapper/cluster.py b/gtda/mapper/cluster.py index ad6aa1530..cbb108fff 100644 --- a/gtda/mapper/cluster.py +++ b/gtda/mapper/cluster.py @@ -144,11 +144,16 @@ def fit(self, X, y=None, sample_weight=None): else: self._indices_computer = lambda rel_indices: rel_indices + # This seems necessary to avoid large overheads when running fit a + # second time. Probably due to refcounts. NOTE: Only works if done + # before assigning labels_single. 
TODO: Investigate + self.labels_ = None + labels_single = Parallel(n_jobs=self.n_jobs, prefer=self.parallel_backend_prefer)( - delayed(self._labels_single)(X_tot, np.flatnonzero(masks[:, i]), + delayed(self._labels_single)(X_tot[rel_indices], rel_indices, sample_weight) - for i in range(masks.shape[1]) + for rel_indices in map(np.flatnonzero, masks.T) ) self.labels_ = np.empty(len(X_tot), dtype=object) @@ -163,11 +168,9 @@ def fit(self, X, y=None, sample_weight=None): def _labels_single(self, X, rel_indices, sample_weight): cloned_clusterer = clone(self.clusterer) - X_sub = X[self._indices_computer(rel_indices)] kwargs = self._sample_weight_computer(rel_indices, sample_weight) - return (rel_indices, - cloned_clusterer.fit(X_sub, **kwargs).labels_) + return rel_indices, cloned_clusterer.fit(X, **kwargs).labels_ def fit_predict(self, X, y=None, sample_weight=None): """Fit to the data, and return the found clusters. From fb6918d7dfc17a3e855983e83fb0f330415bc74e Mon Sep 17 00:00:00 2001 From: Umberto Lupo Date: Mon, 23 Nov 2020 12:20:25 +0100 Subject: [PATCH 07/17] Improve variable names following @lewtun's review --- gtda/mapper/nerve.py | 12 ++++++------ gtda/mapper/pipeline.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/gtda/mapper/nerve.py b/gtda/mapper/nerve.py index 87cf5a471..d6686781a 100644 --- a/gtda/mapper/nerve.py +++ b/gtda/mapper/nerve.py @@ -134,24 +134,24 @@ def fit_transform(self, X, y=None): for i, sample in enumerate(X): for node_id_pair in sample: nodes_dict.setdefault(node_id_pair, []).append(i) - nodes_dict = {key: np.array(value, dtype=np.int32) - for key, value in nodes_dict.items()} + labels_to_indices = {key: np.array(value, dtype=np.int32) + for key, value in nodes_dict.items()} - graph = ig.Graph(len(nodes_dict)) + graph = ig.Graph(len(labels_to_indices)) # `nodes_dict` is a dictionary of, say, N key-value pairs of the form # (pullback_set_label, partial_cluster_label): node_elements. 
Hence, # zip(*nodes_dict) generates two tuples of length N, each corresponding # to a type of node attribute in the final graph. - node_attributes = zip(*nodes_dict) + node_attributes = zip(*labels_to_indices) graph.vs["pullback_set_label"] = next(node_attributes) graph.vs["partial_cluster_label"] = next(node_attributes) - graph.vs["node_elements"] = [*nodes_dict.values()] + graph.vs["node_elements"] = [*labels_to_indices.values()] # Graph construction -- edges with weights given by intersection sizes. # In general, we need all information in `nodes` to narrow down the set # of combinations to check, especially when `contract_nodes` is True. - nodes = zip(*zip(*nodes_dict), nodes_dict.values()) + nodes = zip(*zip(*labels_to_indices), labels_to_indices.values()) node_index_pairs, weights, intersections, mapping = \ self._generate_edge_data(nodes) graph.es["weight"] = 1 diff --git a/gtda/mapper/pipeline.py b/gtda/mapper/pipeline.py index 421e2f556..ed98d2aca 100644 --- a/gtda/mapper/pipeline.py +++ b/gtda/mapper/pipeline.py @@ -372,7 +372,7 @@ def make_mapper_pipeline(scaler=None, else: _scaler = scaler - # If filter_func is not a scikit-learn transformer, hope it is a callable + # If filter_func is not a scikit-learn transformer, assume it is a callable # to be applied on each row separately. Then attempt to create a # FunctionTransformer object to implement this behaviour. 
if filter_func is None: From 5f63025124f3000718a4cdc51884e9f47311aafb Mon Sep 17 00:00:00 2001 From: Umberto Lupo Date: Mon, 23 Nov 2020 16:32:30 +0100 Subject: [PATCH 08/17] Fix linting --- gtda/mapper/nerve.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gtda/mapper/nerve.py b/gtda/mapper/nerve.py index d6686781a..91b9a8f0d 100644 --- a/gtda/mapper/nerve.py +++ b/gtda/mapper/nerve.py @@ -1,9 +1,7 @@ """Construct the nerve of a refined Mapper cover.""" # License: GNU AGPLv3 -from functools import reduce from itertools import combinations, filterfalse -from operator import iconcat import igraph as ig import numpy as np From a163bf3265f26112485e0b91cfca85232205d216 Mon Sep 17 00:00:00 2001 From: Umberto Lupo Date: Mon, 23 Nov 2020 18:01:14 +0100 Subject: [PATCH 09/17] Add explicit n_nodes arg to be able to pass node as zip object --- gtda/mapper/nerve.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gtda/mapper/nerve.py b/gtda/mapper/nerve.py index 91b9a8f0d..a1759cc51 100644 --- a/gtda/mapper/nerve.py +++ b/gtda/mapper/nerve.py @@ -134,8 +134,8 @@ def fit_transform(self, X, y=None): nodes_dict.setdefault(node_id_pair, []).append(i) labels_to_indices = {key: np.array(value, dtype=np.int32) for key, value in nodes_dict.items()} - - graph = ig.Graph(len(labels_to_indices)) + n_nodes = len(labels_to_indices) + graph = ig.Graph(n_nodes) # `nodes_dict` is a dictionary of, say, N key-value pairs of the form # (pullback_set_label, partial_cluster_label): node_elements. Hence, @@ -151,7 +151,7 @@ def fit_transform(self, X, y=None): # of combinations to check, especially when `contract_nodes` is True. 
nodes = zip(*zip(*labels_to_indices), labels_to_indices.values()) node_index_pairs, weights, intersections, mapping = \ - self._generate_edge_data(nodes) + self._generate_edge_data(nodes, n_nodes) graph.es["weight"] = 1 graph.add_edges(node_index_pairs) graph.es["weight"] = weights @@ -174,7 +174,7 @@ def fit_transform(self, X, y=None): return graph - def _generate_edge_data(self, nodes): + def _generate_edge_data(self, nodes, n_nodes): def _in_same_pullback_set(_node_tuple): return _node_tuple[0][1][0] == _node_tuple[1][1][0] @@ -226,7 +226,7 @@ def _subset_check_metadata_append( intersection_behavior = _do_nothing if self.contract_nodes: - mapping = np.arange(len(nodes)) + mapping = np.arange(n_nodes) behavior = _subset_check_metadata_append else: mapping = None From 6c5d8283dfbb7ae624ca94a6d25f5f964bef3550 Mon Sep 17 00:00:00 2001 From: Umberto Lupo Date: Mon, 23 Nov 2020 18:01:44 +0100 Subject: [PATCH 10/17] Fix precomputed behaviour and fix/enhance ParallelClustering tests --- gtda/mapper/cluster.py | 7 +++-- gtda/mapper/tests/test_cluster.py | 28 +++++++++---------- .../tests/test_collection_transformer.py | 2 +- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/gtda/mapper/cluster.py b/gtda/mapper/cluster.py index cbb108fff..7410c948a 100644 --- a/gtda/mapper/cluster.py +++ b/gtda/mapper/cluster.py @@ -151,8 +151,11 @@ def fit(self, X, y=None, sample_weight=None): labels_single = Parallel(n_jobs=self.n_jobs, prefer=self.parallel_backend_prefer)( - delayed(self._labels_single)(X_tot[rel_indices], rel_indices, - sample_weight) + delayed(self._labels_single)( + X_tot[self._indices_computer(rel_indices)], + rel_indices, + sample_weight + ) for rel_indices in map(np.flatnonzero, masks.T) ) diff --git a/gtda/mapper/tests/test_cluster.py b/gtda/mapper/tests/test_cluster.py index b9c1297db..2cf5f16f2 100644 --- a/gtda/mapper/tests/test_cluster.py +++ b/gtda/mapper/tests/test_cluster.py @@ -43,22 +43,24 @@ def 
test_parallel_clustering_transform_not_implemented(): pc.transform(X) +@pytest.mark.parametrize("n_jobs", [1, 2, -1]) @pytest.mark.parametrize("sample_weight", [None, np.random.random(5)]) -def test_parallel_clustering_kmeans(sample_weight): +def test_parallel_clustering_kmeans(n_jobs, sample_weight): kmeans = sk.cluster.KMeans(n_clusters=2, random_state=0) pc = ParallelClustering(kmeans) X = [np.random.random((5, 4)), np.ones((5, 4), dtype=bool)] single_labels = kmeans.fit_predict(X[0], sample_weight=sample_weight) - unique_labels, inverse = np.unique(single_labels, return_inverse=True) + _, inverse = np.unique(single_labels, return_inverse=True) res = pc.fit_predict(X, sample_weight=sample_weight) - res = [[(i, label, list(indices)) for [i, label, indices] in sublist] - for sublist in res] - exp = [[(i, label, list(np.flatnonzero(inverse == label))) - for label in unique_labels] - for i in range(X[1].shape[1])] + exp = np.empty(5, dtype=object) + exp[:] = [tuple([])] * 5 + for i in range(4): + labels_i = np.empty(len(single_labels), dtype=object) + labels_i[:] = [((i, rel_label),) for rel_label in inverse] + exp[:] += labels_i - assert res == exp + assert np.array_equal(res, exp) def test_parallel_clustering_metric_affinity_precomputed_not_implemented(): @@ -75,7 +77,8 @@ def __init__(self, metric="precomputed", affinity="precomputed"): pc.fit(X) -def test_parallel_clustering_precomputed(): +@pytest.mark.parametrize("n_jobs", [1, 2, -1]) +def test_parallel_clustering_precomputed(n_jobs): pc = ParallelClustering(sk.cluster.DBSCAN()) masks = np.random.choice([True, False], size=20).reshape((10, 2)) X = [np.random.random((10, 4)), masks] @@ -84,13 +87,8 @@ def test_parallel_clustering_precomputed(): res = pc.fit_predict(X) res_precomp = pc_precomp.fit_predict(X_precomp) - res = [[(i, label, list(indices)) for [i, label, indices] in sublist] - for sublist in res] - res_precomp = [[(i, label, list(indices)) - for [i, label, indices] in sublist] - for sublist in 
res_precomp] - assert res == res_precomp + assert np.array_equal(res, res_precomp) @composite diff --git a/gtda/metaestimators/tests/test_collection_transformer.py b/gtda/metaestimators/tests/test_collection_transformer.py index a6de70728..0aad972f0 100644 --- a/gtda/metaestimators/tests/test_collection_transformer.py +++ b/gtda/metaestimators/tests/test_collection_transformer.py @@ -51,7 +51,7 @@ def fit_transform(self): @pytest.mark.parametrize("X", [X_arr, X_list]) -@pytest.mark.parametrize("n_jobs", [None, 2, -1]) +@pytest.mark.parametrize("n_jobs", [1, 2, -1]) def test_collection_transformer_fit_transform(X, n_jobs): n_components = 3 pca = PCA(n_components=n_components) From 439cbc64132a76b5f548977ffd522371a1234576 Mon Sep 17 00:00:00 2001 From: Umberto Lupo Date: Tue, 24 Nov 2020 10:32:16 +0100 Subject: [PATCH 11/17] Update ParallelClustering and Nerve docs to new API --- gtda/mapper/cluster.py | 39 ++++++++++++++++++--------------------- gtda/mapper/nerve.py | 26 ++++++++++---------------- 2 files changed, 28 insertions(+), 37 deletions(-) diff --git a/gtda/mapper/cluster.py b/gtda/mapper/cluster.py index 7410c948a..3c4cac35f 100644 --- a/gtda/mapper/cluster.py +++ b/gtda/mapper/cluster.py @@ -22,10 +22,10 @@ class ParallelClustering(BaseEstimator): An arbitrary clustering class which stores a ``labels_`` attribute in ``fit`` can be passed to the constructor. Examples are most classes in ``sklearn.cluster``. The input of :meth:`fit` is of the form ``[X_tot, - masks]`` where ``X_tot`` is the full dataset, and ``masks`` is a - two-dimensional boolean array, each column of which indicates the - location of a portion of ``X_tot`` to cluster separately. Parallelism is - achieved over the columns of ``masks``. + masks]`` where ``X_tot`` is the full dataset, and ``masks`` is a 2D boolean + array, each column of which indicates the location of a portion of + ``X_tot`` to cluster separately. Parallelism is achieved over the columns + of ``masks``. 
Parameters ---------- @@ -45,16 +45,12 @@ class ParallelClustering(BaseEstimator): Attributes ---------- - clusterers_ : tuple of object - Clones of `clusterer` fitted to the portions of the full data array - specified in :meth:`fit`. - - clusters_ : list of list of tuple - Labels and indices of each cluster found in :meth:`fit`. The i-th - entry corresponds to the i-th portion of the data; it is a list of - triples of the form ``(i, label, indices)``, where ``label`` is a - cluster label and ``indices`` is the array of indices of points - belonging to cluster ``(i, label)``. + labels_ : ndarray of shape (n_samples,) + For each point in the dataset passed to :meth:`fit`, a tuple of pairs + of the form ``(i, partial_label)`` where ``i`` is the index of a boolean + mask which selects that point and ``partial_label`` is the cluster label + assigned to the point when clustering the subset of the data selected by + mask ``i``. References ---------- @@ -161,10 +157,11 @@ def fit(self, X, y=None, sample_weight=None): self.labels_ = np.empty(len(X_tot), dtype=object) self.labels_[:] = [tuple([])] * len(X_tot) - for i, (rel_indices, rel_labels) in enumerate(labels_single): - n_labels = len(rel_labels) + for i, (rel_indices, partial_labels) in enumerate(labels_single): + n_labels = len(partial_labels) labels_i = np.empty(n_labels, dtype=object) - labels_i[:] = [((i, rel_label),) for rel_label in rel_labels] + labels_i[:] = [((i, partial_label),) + for partial_label in partial_labels] self.labels_[rel_indices] += labels_i return self @@ -199,8 +196,8 @@ def fit_predict(self, X, y=None, sample_weight=None): Returns ------- - clusters : list of list of tuple - See :attr:`clusters_`. + labels : ndarray of shape (n_samples,) + See :attr:`labels_`. """ self.fit(X, sample_weight=sample_weight) @@ -249,8 +246,8 @@ def fit_transform(self, X, y=None, **fit_params): Returns ------- - Xt : list of list of tuple - See :attr:`clusters_`. 
+ Xt : ndarray of shape (n_samples,) + See :attr:`labels_`. """ Xt = self.fit_predict(X, y, **fit_params) diff --git a/gtda/mapper/nerve.py b/gtda/mapper/nerve.py index a1759cc51..c9c40a334 100644 --- a/gtda/mapper/nerve.py +++ b/gtda/mapper/nerve.py @@ -34,8 +34,6 @@ class Nerve(BaseEstimator, TransformerMixin): by :func:`gtda.mapper.make_mapper_pipeline`. It corresponds the last two arrows in `this diagram <../../../../_images/mapper_pipeline.svg>`_. - This transformer is not intended for direct use. - Parameters ---------- min_intersection : int, optional, default: ``1`` @@ -94,20 +92,16 @@ def fit_transform(self, X, y=None): Parameters ---------- - X : list of list of tuple - Data structure describing a cover of a dataset (e.g. as depicted in - `this diagram <../../../../_images/mapper_pipeline.svg>`_) produced - by the clustering step of a :class:`gtda.mapper.MapperPipeline`. - Each sublist corresponds to a (non-empty) pullback cover set -- - equivalently, to a cover set in the filter range which has - non-empty preimage. It contains triples of the form - ``(pullback_set_label, partial_cluster_label, node_elements)`` - where ``partial_cluster_label`` is a cluster label within the - pullback cover set identified by ``pullback_set_label``, and - ``node_elements`` is an array of integer indices. To each pair - ``(pullback_set_label, partial_cluster_label)`` there corresponds - a unique node in the output Mapper graph. This node represents - the data subset defined by the indices in ``node_elements``. + X : ndarray of shape (n_samples,) + Cluster labels describing a refined cover of a dataset produced by + the clustering step of a :class:`gtda.mapper.MapperPipeline`, + as depicted in + `this diagram <../../../../_images/mapper_pipeline.svg>`_. Each + entry is a tuple of pairs of the form + ``(pullback_cluster_label, partial_label)`` where + ``partial_cluster_label`` is a cluster label within the pullback + cover set identified by ``pullback_set_label``. 
Unique such pairs + correspond to nodes in the output graph. y : None There is no need for a target in a transformer, yet the pipeline From 9749dab0fc41644e823be8deed6322676fa8bf7d Mon Sep 17 00:00:00 2001 From: Umberto Lupo Date: Fri, 4 Dec 2020 21:09:46 +0100 Subject: [PATCH 12/17] Linting in __init__ --- gtda/diagrams/__init__.py | 4 ++-- gtda/homology/__init__.py | 5 ++--- gtda/images/__init__.py | 5 ++--- gtda/plotting/__init__.py | 4 ++-- gtda/point_clouds/__init__.py | 5 ++--- 5 files changed, 10 insertions(+), 13 deletions(-) diff --git a/gtda/diagrams/__init__.py b/gtda/diagrams/__init__.py index c62e6fa6e..43a0b17e7 100644 --- a/gtda/diagrams/__init__.py +++ b/gtda/diagrams/__init__.py @@ -1,6 +1,6 @@ """The module :mod:`gtda.diagrams` implements transformers to preprocess -persistence diagrams, extract features from them, or compute pairwise -distances between diagrams.""" +persistence diagrams, extract features from them, or compute pairwise distances +between diagrams.""" from .preprocessing import ForgetDimension, Scaler, Filtering from .distance import PairwiseDistance diff --git a/gtda/homology/__init__.py b/gtda/homology/__init__.py index 1918833e1..7fcc6a45b 100644 --- a/gtda/homology/__init__.py +++ b/gtda/homology/__init__.py @@ -1,6 +1,5 @@ -"""The module :mod:`gtda.homology` implements transformers -to generate persistence diagrams. -""" +"""The module :mod:`gtda.homology` implements transformers to generate +persistence diagrams.""" # License: GNU AGPLv3 from .simplicial import VietorisRipsPersistence, SparseRipsPersistence, \ diff --git a/gtda/images/__init__.py b/gtda/images/__init__.py index f4b700e91..f7733feba 100644 --- a/gtda/images/__init__.py +++ b/gtda/images/__init__.py @@ -1,6 +1,5 @@ -"""The module :mod:`gtda.images` implements techniques -that can be used to apply Topological Data Analysis to images. 
-""" +"""The module :mod:`gtda.images` implements techniques that can be used to +apply Topological Data Analysis to images.""" # License: GNU AGPLv3 from .preprocessing import Binarizer, Inverter, Padder, ImageToPointCloud diff --git a/gtda/plotting/__init__.py b/gtda/plotting/__init__.py index 421f2e05e..c98e17011 100644 --- a/gtda/plotting/__init__.py +++ b/gtda/plotting/__init__.py @@ -1,5 +1,5 @@ -"""The module :mod:`gtda.plotting` implements function to plot the -outputs of giotto-tda transformers.""" +"""The module :mod:`gtda.plotting` implements functions to plot the outputs of +giotto-tda transformers.""" from .point_clouds import plot_point_cloud from .persistence_diagrams import plot_diagram diff --git a/gtda/point_clouds/__init__.py b/gtda/point_clouds/__init__.py index 7fe81f718..1850056f1 100644 --- a/gtda/point_clouds/__init__.py +++ b/gtda/point_clouds/__init__.py @@ -1,6 +1,5 @@ -"""The module :mod:`gtda.homology` implements transformers -to process point clouds and modify metric spaces. -""" +"""The module :mod:`gtda.point_clouds` implements transformers to process point +clouds and modify metric spaces.""" # License: GNU AGPLv3 from .rescaling import ConsistentRescaling, ConsecutiveRescaling From 72ed8dff73227a9228cca798b8fd7494e5393e23 Mon Sep 17 00:00:00 2001 From: Umberto Lupo Date: Fri, 4 Dec 2020 21:27:15 +0100 Subject: [PATCH 13/17] Fix docstring typos in Nerve --- gtda/mapper/nerve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gtda/mapper/nerve.py b/gtda/mapper/nerve.py index c9c40a334..e6ac9c007 100644 --- a/gtda/mapper/nerve.py +++ b/gtda/mapper/nerve.py @@ -97,8 +97,8 @@ def fit_transform(self, X, y=None): the clustering step of a :class:`gtda.mapper.MapperPipeline`, as depicted in `this diagram <../../../../_images/mapper_pipeline.svg>`_. 
Each - entry is a tuple of pairs of the form - ``(pullback_cluster_label, partial_label)`` where + entry in `X` is a tuple of pairs of the form + ``(pullback_set_label, partial_cluster_label)`` where ``partial_cluster_label`` is a cluster label within the pullback cover set identified by ``pullback_set_label``. Unique such pairs correspond to nodes in the output graph. From d878722d0b8806fe23d1bef05991710315cc9f47 Mon Sep 17 00:00:00 2001 From: Umberto Lupo Date: Fri, 4 Dec 2020 21:40:49 +0100 Subject: [PATCH 14/17] Improve variable name following @lewtun's comment --- gtda/mapper/nerve.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/gtda/mapper/nerve.py b/gtda/mapper/nerve.py index e6ac9c007..2032f04d6 100644 --- a/gtda/mapper/nerve.py +++ b/gtda/mapper/nerve.py @@ -122,19 +122,19 @@ def fit_transform(self, X, y=None): """ # TODO: Include a validation step for X # Graph construction -- vertices with their metadata - nodes_dict = {} + labels_to_indices = {} for i, sample in enumerate(X): for node_id_pair in sample: - nodes_dict.setdefault(node_id_pair, []).append(i) - labels_to_indices = {key: np.array(value, dtype=np.int32) - for key, value in nodes_dict.items()} + labels_to_indices.setdefault(node_id_pair, []).append(i) + labels_to_indices = {key: np.array(value) + for key, value in labels_to_indices.items()} n_nodes = len(labels_to_indices) graph = ig.Graph(n_nodes) - # `nodes_dict` is a dictionary of, say, N key-value pairs of the form - # (pullback_set_label, partial_cluster_label): node_elements. Hence, - # zip(*nodes_dict) generates two tuples of length N, each corresponding - # to a type of node attribute in the final graph. + # labels_to_indices is a dictionary of, say, N key-value pairs of the + # form (pullback_set_label, partial_cluster_label): node_elements. + # Hence, zip(*labels_to_indices) generates two tuples of length N, each + # corresponding to a type of node attribute in the final graph. 
node_attributes = zip(*labels_to_indices) graph.vs["pullback_set_label"] = next(node_attributes) graph.vs["partial_cluster_label"] = next(node_attributes) From b84cb81e2297e71fec631f5483e6c1b2ec918eda Mon Sep 17 00:00:00 2001 From: Umberto Lupo Date: Thu, 10 Dec 2020 15:31:53 +0100 Subject: [PATCH 15/17] Fix linting --- gtda/mapper/cover.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gtda/mapper/cover.py b/gtda/mapper/cover.py index 9db055cfb..478471853 100644 --- a/gtda/mapper/cover.py +++ b/gtda/mapper/cover.py @@ -213,7 +213,8 @@ def _fit_transform_balanced(self, X): X_rank, self.n_intervals, self.overlap_frac, is_uniform=False) X_rank = np.broadcast_to(X_rank[:, None], (X.shape[0], self.n_intervals)) - Xt = np.logical_and(X_rank > self._left_limits, X_rank < self._right_limits) + Xt = np.logical_and(X_rank > self._left_limits, + X_rank < self._right_limits) return Xt def _fit_transform(self, X): From 24e8ead496719a110cab6d13a4387ccef29d0dbd Mon Sep 17 00:00:00 2001 From: Umberto Lupo <46537483+ulupo@users.noreply.github.com> Date: Fri, 11 Dec 2020 12:01:57 +0100 Subject: [PATCH 16/17] Fix wording following @wreise's review --- gtda/mapper/nerve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gtda/mapper/nerve.py b/gtda/mapper/nerve.py index 2032f04d6..b22a5ef13 100644 --- a/gtda/mapper/nerve.py +++ b/gtda/mapper/nerve.py @@ -100,7 +100,7 @@ def fit_transform(self, X, y=None): entry in `X` is a tuple of pairs of the form ``(pullback_set_label, partial_cluster_label)`` where ``partial_cluster_label`` is a cluster label within the pullback - cover set identified by ``pullback_set_label``. Unique such pairs + cover set identified by ``pullback_set_label``. Then unique pairs correspond to nodes in the output graph. 
y : None From 43eb7d75b856b55ba750f9d88ab335042ae05b53 Mon Sep 17 00:00:00 2001 From: Umberto Lupo <46537483+ulupo@users.noreply.github.com> Date: Fri, 11 Dec 2020 12:03:31 +0100 Subject: [PATCH 17/17] Fix "then" -> "the" --- gtda/mapper/nerve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gtda/mapper/nerve.py b/gtda/mapper/nerve.py index b22a5ef13..5eae5144f 100644 --- a/gtda/mapper/nerve.py +++ b/gtda/mapper/nerve.py @@ -100,7 +100,7 @@ def fit_transform(self, X, y=None): entry in `X` is a tuple of pairs of the form ``(pullback_set_label, partial_cluster_label)`` where ``partial_cluster_label`` is a cluster label within the pullback - cover set identified by ``pullback_set_label``. Then unique pairs + cover set identified by ``pullback_set_label``. The unique pairs correspond to nodes in the output graph. y : None