Skip to content

Commit

Permalink
expose SPENC kwargs (#312)
Browse files Browse the repository at this point in the history
* expose SPENC kwargs

* resolves #310
  • Loading branch information
jGaboardi authored Dec 11, 2022
1 parent f3066c8 commit ecfe2cc
Show file tree
Hide file tree
Showing 8 changed files with 268 additions and 99 deletions.
1 change: 0 additions & 1 deletion .ci/310-DEV.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ dependencies:
- pip
- scikit-learn>=0.22
- scipy>=1.0
- spenc
- spaghetti
# testing
- codecov
Expand Down
1 change: 0 additions & 1 deletion .ci/310-DEV_shapely_dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ dependencies:
- scikit-learn>=0.22
- scipy>=1.0
- shapely>=2.0b1
- spenc
- spaghetti
# testing
- codecov
Expand Down
1 change: 0 additions & 1 deletion .ci/310.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ dependencies:
- scikit-learn>=0.22
- scipy>=1.0
- shapely
- spenc
- spaghetti
# testing
- codecov
Expand Down
1 change: 0 additions & 1 deletion .ci/38.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ dependencies:
- scikit-learn>=0.22
- scipy>=1.0
- shapely
- spenc
- spaghetti
# testing
- codecov
Expand Down
1 change: 0 additions & 1 deletion .ci/39.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ dependencies:
- scikit-learn>=0.22
- scipy>=1.0
- shapely
- spenc
- spaghetti
# testing
- codecov
Expand Down
37 changes: 36 additions & 1 deletion docs/_static/references.bib
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,20 @@ @article{openshaw1995algorithms
}


@article{shi_malik_2000,
author={Jianbo Shi and Malik, J.},
journal={{IEEE Transactions on Pattern Analysis and Machine Intelligence}},
title={Normalized cuts and image segmentation},
year={2000},
volume={22},
number={8},
pages={888--905},
doi={10.1109/34.868688}
}


@article{toregas_swain_revelle_bergman_1971,
  author={Toregas, Constantine and Swain, Ralph and Revelle, Charles and Bergman, Lawrence},
title={The Location of Emergency Service Facilities},
volume={19},
journal={Operations Research},
Expand All @@ -161,6 +173,19 @@ @article{toregas_swain_revelle_bergman_1971
}


@article{von2007tutorial,
title={A tutorial on spectral clustering},
author={{von Luxburg}, Ulrike},
journal={{Statistics and Computing}},
volume={17},
number={4},
pages={395--416},
year={2007},
publisher={Springer},
doi={10.1007/s11222-007-9033-z},
}


@article{wei2020efficient,
title={Efficient Regionalization for Spatially Explicit Neighborhood Delineation},
author={Wei, R and Rey, SJ and Knaap, E},
Expand Down Expand Up @@ -192,4 +217,14 @@ @article{wolf2021
}


@inproceedings{yu_shi_2003,
author={Yu, Stella X. and Shi, Jianbo},
booktitle={Proceedings Ninth IEEE International Conference on Computer Vision},
title={Multiclass spectral clustering},
year={2003},
volume={1},
  pages={313--319},
  doi={10.1109/ICCV.2003.1238361}
}
%======
192 changes: 173 additions & 19 deletions spopt/region/spenc.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,50 +4,204 @@

class Spenc(BaseSpOptHeuristicSolver):
    """Spatially encouraged spectral clustering found in :cite:`wolf2018`.

    Apply clustering to a projection of the normalized laplacian, using
    spatial information to constrain the clustering. In practice Spectral
    Clustering is very useful when the structure of the individual clusters
    is highly non-convex or more generally when a measure of the center and
    spread of the cluster is not a suitable description of the complete
    cluster. For instance when clusters are nested circles on the 2D plane.

    Spatially-Encouraged Spectral Clustering (*SPENC*) is useful for when
    there may be highly non-convex clusters or clusters with irregular
    topology in a geographic context. If a binary weights matrix is provided
    during fit, this method can be used to find weighted normalized graph
    cuts.

    When calling ``fit``, an affinity matrix is constructed using either a
    kernel function such as the Gaussian (aka RBF) kernel of the euclidean
    distance :math:`d(X, X)`::

        numpy.exp(-gamma * d(X,X) ** 2)

    or a :math:`k`-nearest neighbors connectivity matrix. Alternatively,
    using ``precomputed``, a user-provided affinity matrix can be used.
    Read more in the ``scikit-learn`` user guide on spectral clustering.
    """

    def __init__(
        self,
        gdf,
        w,
        attrs_name,
        n_clusters=5,
        random_state=None,
        gamma=1,
        eigen_solver=None,
        n_init=10,
        affinity="rbf",
        n_neighbors=10,
        eigen_tol=1e-9,
        assign_labels="discretize",
        degree=3,
        coef0=1,
        kernel_params=None,
        n_jobs=1,
    ):
        """
        Parameters
        ----------

        gdf : geopandas.GeoDataFrame
            Input data.
        w : libpysal.weights.W
            Spatial weights matrix.
        attrs_name : list
            Strings for attribute names from columns in ``gdf``.
        n_clusters : int (default 5)
            The number of clusters to form.
        random_state : int or numpy.random.RandomState (default None)
            A pseudo random number generator used for the initialization of
            the lobpcg eigen vectors decomposition when
            ``eigen_solver='amg'`` and by the :math:`k`-Means initialization.
            If ``int``, ``random_state`` is the seed used by the random
            number generator; If ``numpy.random.RandomState``,
            ``random_state`` is the random number generator; If ``None``,
            the random number generator is the ``numpy.random.RandomState``
            instance used by ``numpy.random``.
        gamma : int, float (default 1)
            Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2
            kernels. Ignored for ``affinity='nearest_neighbors'``.
        eigen_solver : str (default None)
            The eigenvalue decomposition strategy to use. Valid values
            include ``{'arpack', 'lobpcg', 'amg'}``. AMG requires ``pyamg``
            to be installed, which may be faster on very large, sparse
            problems, but may also lead to instabilities. *Note* –
            ``eigen_solver`` is ignored unless fitting using the ``breakme``
            flag in the ``.fit()`` method (so do not use then).
        n_init : int (default 10)
            The number of times the :math:`k`-means algorithm will be run
            with different centroid seeds. The final results will be the
            best output of ``n_init`` consecutive runs in terms of inertia.
        affinity : str, array-like, callable (default 'rbf')
            If a ``str``, valid values include
            ``{'nearest_neighbors', 'precomputed', 'rbf'}`` or one of the
            kernels supported by ``sklearn.metrics.pairwise_kernels``. Only
            kernels that produce similarity scores (non-negative values that
            increase with similarity) should be used. *This property is not
            checked by the clustering algorithm*.
        n_neighbors : int (default 10)
            The number of neighbors to use when constructing the affinity
            matrix using the nearest neighbors method. Ignored for
            ``affinity='rbf'``.
        eigen_tol : float (default 1e-9)
            Stopping criterion for eigen decomposition of the Laplacian
            matrix when using ``'arpack'`` as the ``eigen_solver``.
        assign_labels : str (default 'discretize')
            The strategy to use to assign labels in the embedding space.
            There are three ways to assign labels after the laplacian
            embedding: ``{'kmeans', 'discretize', 'hierarchical'}``:

            * ``'kmeans'`` can be applied and is a popular choice. But it
              can also be sensitive to initialization.
            * ``'discretize'`` is another approach which is less sensitive
              to random initialization, and which usually finds better
              clusters.
            * ``'hierarchical'`` decomposition repeatedly bi-partitions the
              graph, instead of finding the decomposition all at once, as
              suggested in :cite:`shi_malik_2000`.

        degree : float (default 3)
            Degree of the polynomial affinity kernel. Ignored by other
            kernels.
        coef0 : float (default 1)
            Zero coefficient for polynomial and sigmoid affinity kernels.
            Ignored by other kernels.
        kernel_params : dict (default None)
            Parameters (keyword arguments) and values for affinity kernel
            passed as callable object. Ignored by other affinity kernels.
        n_jobs : int (default 1)
            The number of parallel jobs to run for the nearest-neighbors
            affinity kernel, if used. If ``-1``, then the number of jobs
            is set to the number of CPU cores.

        Attributes
        ----------

        affinity_matrix_ : array-like
            Affinity matrix used for clustering in the shape of
            ``(n_samples, n_samples)``. Available only after calling
            ``fit``.
        labels_ : list
            Cluster labels of each point or area.

        Notes
        -----

        If you have an affinity matrix, such as a distance matrix,
        for which ``0`` means identical elements, and high values mean
        very dissimilar elements, it can be transformed into a
        similarity matrix that is well suited for the algorithm by
        applying the Gaussian (RBF, heat) kernel::

            numpy.exp(-dist_matrix ** 2 / (2. * delta ** 2))

        Where ``delta`` is a free parameter representing the width of the
        Gaussian kernel.

        Another alternative is to take a symmetric version of the
        :math:`k`-nearest neighbors connectivity matrix of the points/areas.

        References
        ----------

        - :cite:`shi_malik_2000` Normalized cuts and image segmentation,
          2000. Jianbo Shi, Jitendra Malik –
          https://doi.org/10.1109/34.868688
        - :cite:`von2007tutorial` A Tutorial on Spectral Clustering, 2007.
          Ulrike von Luxburg – https://doi.org/10.1007/s11222-007-9033-z
        - :cite:`yu_shi_2003` Multiclass spectral clustering, 2003.
          Stella X. Yu, Jianbo Shi –
          https://doi.org/10.1109/ICCV.2003.1238361

        """  # noqa E402

        self.gdf = gdf
        self.w = w
        self.attrs_name = attrs_name
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state
        self.eigen_solver = eigen_solver
        self.n_init = n_init
        self.affinity = affinity
        self.n_neighbors = n_neighbors
        self.eigen_tol = eigen_tol
        self.assign_labels = assign_labels
        self.degree = degree
        self.coef0 = coef0
        self.kernel_params = kernel_params
        self.n_jobs = n_jobs

    def solve(self, fit_kwargs=None):
        """Solve the spenc.

        Parameters
        ----------

        fit_kwargs : dict (default None)
            Keyword arguments passed into
            ``spenclib.abstracts.SPENC.fit()``.

        """
        # ``None`` sentinel instead of a mutable ``dict()`` default, which
        # would be shared across all calls/instances.
        if fit_kwargs is None:
            fit_kwargs = {}
        data = self.gdf
        X = data[self.attrs_name].values
        model = SPENC(
            n_clusters=self.n_clusters,
            random_state=self.random_state,
            gamma=self.gamma,
            eigen_solver=self.eigen_solver,
            n_init=self.n_init,
            affinity=self.affinity,
            n_neighbors=self.n_neighbors,
            eigen_tol=self.eigen_tol,
            assign_labels=self.assign_labels,
            degree=self.degree,
            coef0=self.coef0,
            kernel_params=self.kernel_params,
            n_jobs=self.n_jobs,
        )
        model.fit(X, self.w.sparse, **fit_kwargs)
        self.labels_ = model.labels_
Loading

0 comments on commit ecfe2cc

Please sign in to comment.