helmholtz-analytics · Markus-Goetz · Jul 22, 2020 · Jul 7, 2020 · Jul 8, 2020 · Jul 8, 2020
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -36,6 +36,7 @@
 - [#620](https://github.com/helmholtz-analytics/heat/pull/620) New feature: KNN
 - [#624](https://github.com/helmholtz-analytics/heat/pull/624) Bugfix: distributed median() indexing and casting
 - [#629](https://github.com/helmholtz-analytics/heat/pull/629) New features: `asin`, `acos`, `atan`, `atan2`
+- [#634](https://github.com/helmholtz-analytics/heat/pull/634) New features: `kmedians`, `kmedoids`, `manhattan`
 - [#633](https://github.com/helmholtz-analytics/heat/pull/633) Documentation: updated contributing.md
 - [#635](https://github.com/helmholtz-analytics/heat/pull/635) `DNDarray.__getitem__` balances and resplits the given key to None if the key is a DNDarray
 

diff --git a/examples/cluster/demo_kClustering.py b/examples/cluster/demo_kClustering.py
@@ -0,0 +1,106 @@
+import torch
+import heat as ht
+
+
+def create_spherical_dataset(
+    num_samples_cluster, radius=1.0, offset=4.0, dtype=ht.float32, random_state=1
+):
+    """
+    Creates k=4 sperical clusters in 3D space along the space-diagonal
+
+    Parameters
+    ----------
+    num_samples_cluster: int
+        Number of samples per cluster. Each process will create n // MPI_WORLD.size elements for each cluster
+    radius: float
+        Radius of the sphere
+    offset: float
+        Shift of the clusters along the axes. The 4 clusters will be positioned centered around c1=(offset, offset,offset),
+        c2=(2*offset,2*offset,2*offset), c3=(-offset, -offset, -offset) and c4=(2*offset, -2*offset, -2*offset)
+    dtype: ht.datatype
+    random_state: int
+        seed of the torch random number generator
+    """
+    # create k sperical clusters with each n elements per cluster. Each process creates k * n/p elements
+    ht.random.seed(random_state)
+    # radius between 0 and 1
+    r = ht.random.rand(num_samples_cluster, split=0) * radius
+    # theta between 0 and pi
+    theta = ht.random.rand(num_samples_cluster, split=0) * ht.constants.PI
+    # phi between 0 and 2pi
+    phi = ht.random.rand(num_samples_cluster, split=0) * 2 * ht.constants.PI
+    # Cartesian coordinates
+    x = r * ht.sin(theta) * ht.cos(phi)
+    x.astype(dtype, copy=False)
+    y = r * ht.sin(theta) * ht.sin(phi)
+    y.astype(dtype, copy=False)
+    z = r * ht.cos(theta)
+    z.astype(dtype, copy=False)
+
+    cluster1 = ht.stack((x + offset, y + offset, z + offset), axis=1)
+    cluster2 = ht.stack((x + 2 * offset, y + 2 * offset, z + 2 * offset), axis=1)
+    cluster3 = ht.stack((x - offset, y - offset, z - offset), axis=1)
+    cluster4 = ht.stack((x - 2 * offset, y - 2 * offset, z - 2 * offset), axis=1)
+
+    data = ht.concatenate((cluster1, cluster2, cluster3, cluster4), axis=0)
+    return data
+
+
+def main():
+    seed = 1
+    n = 20 * ht.MPI_WORLD.size
+
+    data = create_spherical_dataset(
+        num_samples_cluster=n, radius=1.0, offset=4.0, dtype=ht.float32, random_state=seed
+    )
+    reference = ht.array([[-8, -8, -8], [-4, -4, -4], [4, 4, 4], [8, 8, 8]], dtype=ht.float32)
+
+    clusterer = {
+        "kmeans": ht.cluster.KMeans(n_clusters=4, init="kmeans++"),
+        "kmedians": ht.cluster.KMedians(n_clusters=4, init="kmedians++"),
+        "kmedoids": ht.cluster.KMedoids(n_clusters=4, init="kmedoids++"),
+    }
+
+    print(f"4 Spherical clusters with radius 1.0, each {n} samples (dtype = ht.float32)")
+    for name, c in clusterer.items():
+        c.fit(data)
+        print(
+            f"### Fitting with {name} ### \n"
+            f"Original sphere centers = {reference}\n"
+            f"Fitted cluster centers = {c.cluster_centers_}"
+        )
+
+    # More Samples
+    n = 100 * ht.MPI_WORLD.size
+    data = create_spherical_dataset(
+        num_samples_cluster=n, radius=1.0, offset=4.0, dtype=ht.float32, random_state=seed
+    )
+    print("4 Spherical  with radius 1.0, each {} samples (dtype = ht.float32) ".format(n))
+    for name, c in clusterer.items():
+        c.fit(data)
+        print(
+            f"### Fitting with {name} ### \n"
+            f"Original sphere centers = {reference}\n"
+            f"Fitted cluster centers = {c.cluster_centers_}"
+        )
+
+    # On integers (different radius, offset and datatype)
+    n = 20 * ht.MPI_WORLD.size
+    data = create_spherical_dataset(
+        num_samples_cluster=n, radius=10.0, offset=40.0, dtype=ht.int32, random_state=seed
+    )
+    reference = ht.array(
+        [[-80, -80, -80], [-40, -40, -40], [40, 40, 40], [80, 80, 80]], dtype=ht.float32
+    )
+    print("4 Spherical clusters with radius 10, each {} samples (dtype = ht.int32) ".format(n))
+    for name, c in clusterer.items():
+        c.fit(data)
+        print(
+            f"### Fitting with {name} ### \n"
+            f"Original sphere centers = {reference}\n"
+            f"Fitted cluster centers = {c.cluster_centers_}"
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/heat/cluster/__init__.py b/heat/cluster/__init__.py
@@ -1,2 +1,5 @@
+from ._kcluster import *
 from .kmeans import *
+from .kmedians import *
+from .kmedoids import *
 from .spectral import *
diff --git a/heat/cluster/_kcluster.py b/heat/cluster/_kcluster.py
@@ -0,0 +1,249 @@
+import heat as ht
+
+
+class _KCluster(ht.ClusteringMixin, ht.BaseEstimator):
+    """
+    Base class for k-statistics clustering algorithms (kmeans, kmedians, kmedoids).
+    The clusters are represented by centroids ci (we use the term from kmeans for simplicity)
+
+    Parameters
+    ----------
+    metric : function
+        One of the distance metrics in ht.spatial.distance. Needs to be passed as lambda function to take only two arrays as input
+    n_clusters : int
+        The number of clusters to form as well as the number of centroids to generate.
+    init : str or DNDarray
+        Method for initialization, defaults to ‘random’:
+            - ‘probability_based’ : selects initial cluster centers for the clustering in a smart way to speed up convergence (k-means++) \n
+            - ‘random’: choose k observations (rows) at random from data for the initial centroids. \n
+            - ``DNDarray``: gives the initial centers, should be of Shape = (n_clusters, n_features)
+    max_iter : int
+        Maximum number of iterations for a single run.
+    tol : float, default: 1e-4
+        Relative tolerance with regards to inertia to declare convergence.
+    random_state : int
+        Determines random number generation for centroid initialization.
+    """
+
+    def __init__(self, metric, n_clusters, init, max_iter, tol, random_state):
+        self.n_clusters = n_clusters
+        self.init = init
+        self.max_iter = max_iter
+        self.tol = tol
+        self.random_state = random_state
+
+        # in-place properties
+        self._metric = metric
+        self._cluster_centers = None
+        self._labels = None
+        self._inertia = None
+        self._n_iter = None
+
+    @property
+    def cluster_centers_(self):
+        """
+        Returns the coordinates of cluster centers.
+        """
+        return self._cluster_centers
+
+    @property
+    def labels_(self):
+        """
+        Returns the labels of each point.
+        """
+        return self._labels
+
+    @property
+    def inertia_(self):
+        """
+        Returns sum of squared distances of samples to their closest cluster center.
+        """
+        return self._inertia
+
+    @property
+    def n_iter_(self):
+        """
+        Returns the number of iterations run.
+        """
+        return self._n_iter
+
+    def _initialize_cluster_centers(self, X):
+        """
+        Initializes the K-Means centroids.
+
+        Parameters
+        ----------
+        X : DNDarray,
+            The data to initialize the clusters for. Shape = (n_point, n_features)
+        """
+        # always initialize the random state
+        if self.random_state is not None:
+            ht.random.seed(self.random_state)
+
+        # initialize the centroids by randomly picking some of the points
+        if self.init == "random":
+            # Samples will be equally distributed drawn from all involved processes
+            _, displ, _ = X.comm.counts_displs_shape(shape=X.shape, axis=0)
+            # At least float32 precision, if X has higher precision datatype, use that
+            datatype = ht.promote_types(X.dtype, ht.float32)
+            centroids = ht.empty(
+                (self.n_clusters, X.shape[1]),
+                split=None,
+                dtype=datatype,
+                device=X.device,
+                comm=X.comm,
+            )
+            if X.split is None or X.split == 0:
+                for i in range(self.n_clusters):
+                    samplerange = (
+                        X.gshape[0] // self.n_clusters * i,
+                        X.gshape[0] // self.n_clusters * (i + 1),
+                    )
+                    sample = ht.random.randint(samplerange[0], samplerange[1]).item()
+                    proc = 0
+                    for p in range(X.comm.size):
+                        if displ[p] > sample:
+                            break
+                        proc = p
+                    xi = ht.zeros(X.shape[1], dtype=X.dtype)
+                    if X.comm.rank == proc:
+                        idx = sample - displ[proc]
+                        xi = ht.array(X.lloc[idx, :], device=X.device, comm=X.comm)
+                    xi.comm.Bcast(xi, root=proc)
+                    centroids[i, :] = xi
+
+            else:
+                raise NotImplementedError("Not implemented for other splitting-axes")
+
+            self._cluster_centers = centroids
+
+        # directly passed centroids
+        elif isinstance(self.init, ht.DNDarray):
+            if len(self.init.shape) != 2:
+                raise ValueError(
+                    "passed centroids need to be two-dimensional, but are {}".format(len(self.init))
+                )
+            if self.init.shape[0] != self.n_clusters or self.init.shape[1] != X.shape[1]:
+                raise ValueError("passed centroids do not match cluster count or data shape")
+            self._cluster_centers = self.init.resplit(None)
+
+        # smart centroid guessing, random sampling with probability weight proportional to distance to existing centroids
+        elif self.init == "probability_based":
+            datatype = ht.promote_types(X.dtype, ht.float32)
+            if X.split is None or X.split == 0:
+                centroids = ht.zeros(
+                    (self.n_clusters, X.shape[1]),
+                    split=None,
+                    dtype=datatype,
+                    device=X.device,
+                    comm=X.comm,
+                )
+                sample = ht.random.randint(0, X.shape[0] - 1).item()
+                _, displ, _ = X.comm.counts_displs_shape(shape=X.shape, axis=0)
+                proc = 0
+                for p in range(X.comm.size):
+                    if displ[p] > sample:
+                        break
+                    proc = p
+                x0 = ht.zeros(X.shape[1], dtype=X.dtype, device=X.device, comm=X.comm)
+                if X.comm.rank == proc:
+                    idx = sample - displ[proc]
+                    x0 = ht.array(X.lloc[idx, :], device=X.device, comm=X.comm)
+                x0.comm.Bcast(x0, root=proc)
+                centroids[0, :] = x0
+                for i in range(1, self.n_clusters):
+                    distances = self._metric(X, centroids)
+                    D2 = distances.min(axis=1)
+                    D2.resplit_(axis=None)
+                    prob = D2 / D2.sum()
+                    x = ht.random.rand().item()
+                    sample = 0
+                    sum = 0
+                    for j in range(len(prob)):
+                        if sum > x:
+                            break
+                        sum += prob[j].item()
+                        sample = j
+                    proc = 0
+                    for p in range(X.comm.size):
+                        if displ[p] > sample:
+                            break
+                        proc = p
+                    xi = ht.zeros(X.shape[1], dtype=X.dtype)
+                    if X.comm.rank == proc:
+                        idx = sample - displ[proc]
+                        xi = ht.array(X.lloc[idx, :], device=X.device, comm=X.comm)
+                    xi.comm.Bcast(xi, root=proc)
+                    centroids[i, :] = xi
+
+            else:
+                raise NotImplementedError("Not implemented for other splitting-axes")
+
+            self._cluster_centers = centroids
+
+        else:
+            raise ValueError(
+                'init needs to be one of "random", ht.DNDarray or "kmeans++", but was {}'.format(
+                    self.init
+                )
+            )
+
+    def _assign_to_cluster(self, X):
+        """
+        Assigns the passed data points to the centroids based on the respective metric
+
+        Parameters
+        ----------
+        X : DNDarray
+            Data points, Shape = (n_samples, n_features)
+        """
+        # calculate the distance matrix and determine the closest centroid
+        distances = self._metric(X, self._cluster_centers)
+        matching_centroids = distances.argmin(axis=1, keepdim=True)
+
+        return matching_centroids
+
+    def _update_centroids(self, X, matching_centroids):
+        """
+        The Update strategy is algorithm specific (e.g. calculate mean of assigned points for kmeans, median for kmedians, etc.)
+
+        Parameters
+        ----------
+        X : DNDarray
+            Input Data
+        matching_centroids: DNDarray
+            Index array of assigned centroids
+
+        """
+        raise NotImplementedError()
+
+    def fit(self, X):
+        """
+       Computes the centroid of the clustering algorithm to fit the data X. The full pipeline is algorithm specific.
+
+        Parameters
+        ----------
+        X : DNDarray
+            Training instances to cluster. Shape = (n_samples, n_features)
+
+        """
+        raise NotImplementedError()
+
+    def predict(self, X):
+        """
+        Predict the closest cluster each sample in X belongs to.
+
+        In the vector quantization literature, cluster_centers_ is called the code book and each value returned by
+        predict is the index of the closest code in the code book.
+
+        Parameters
+        ----------
+        X : DNDarray
+            New data to predict. Shape = (n_samples, n_features)
+       """
+        # input sanitation
+        if not isinstance(X, ht.DNDarray):
+            raise ValueError("input needs to be a ht.DNDarray, but was {}".format(type(X)))
+
+        # determine the centroids
+        return self._assign_to_cluster(X)