diff --git a/.ci/310-DEV.yaml b/.ci/310-DEV.yaml index 9e2167aa..e50f040a 100644 --- a/.ci/310-DEV.yaml +++ b/.ci/310-DEV.yaml @@ -11,7 +11,6 @@ dependencies: - pip - scikit-learn>=0.22 - scipy>=1.0 - - spenc - spaghetti # testing - codecov diff --git a/.ci/310-DEV_shapely_dev.yaml b/.ci/310-DEV_shapely_dev.yaml index 6806cd0e..f9444dd3 100644 --- a/.ci/310-DEV_shapely_dev.yaml +++ b/.ci/310-DEV_shapely_dev.yaml @@ -13,7 +13,6 @@ dependencies: - scikit-learn>=0.22 - scipy>=1.0 - shapely>=2.0b1 - - spenc - spaghetti # testing - codecov diff --git a/.ci/310.yaml b/.ci/310.yaml index 7f71ece6..e4b66641 100644 --- a/.ci/310.yaml +++ b/.ci/310.yaml @@ -12,7 +12,6 @@ dependencies: - scikit-learn>=0.22 - scipy>=1.0 - shapely - - spenc - spaghetti # testing - codecov diff --git a/.ci/38.yaml b/.ci/38.yaml index 82110f45..db9c90c4 100644 --- a/.ci/38.yaml +++ b/.ci/38.yaml @@ -12,7 +12,6 @@ dependencies: - scikit-learn>=0.22 - scipy>=1.0 - shapely - - spenc - spaghetti # testing - codecov diff --git a/.ci/39.yaml b/.ci/39.yaml index 8eae52b7..1ff70a20 100644 --- a/.ci/39.yaml +++ b/.ci/39.yaml @@ -12,7 +12,6 @@ dependencies: - scikit-learn>=0.22 - scipy>=1.0 - shapely - - spenc - spaghetti # testing - codecov diff --git a/docs/_static/references.bib b/docs/_static/references.bib index b67db295..2074cd74 100644 --- a/docs/_static/references.bib +++ b/docs/_static/references.bib @@ -149,8 +149,20 @@ @article{openshaw1995algorithms } +@article{shi_malik_2000, + author={Jianbo Shi and Malik, J.}, + journal={{IEEE Transactions on Pattern Analysis and Machine Intelligence}}, + title={Normalized cuts and image segmentation}, + year={2000}, + volume={22}, + number={8}, + pages={888--905}, + doi={10.1109/34.868688} +} + + @article{toregas_swain_revelle_bergman_1971, - author={Toregas, Constantine and Swain, Ralph and Revelle, Charles and Bergman, Lawrence}, + author={Toregas, Constantine and Swain, Ralph and Revelle, Charles and Bergman, Lawrence}, title={The Location of Emergency Service Facilities}, volume={19}, journal={Operations Research}, @@ -161,6 +173,19 @@ @article{toregas_swain_revelle_bergman_1971 } +@article{von2007tutorial, + title={A tutorial on spectral clustering}, + author={{von Luxburg}, Ulrike}, + journal={{Statistics and Computing}}, + volume={17}, + number={4}, + pages={395--416}, + year={2007}, + publisher={Springer}, + doi={10.1007/s11222-007-9033-z}, +} + + @article{wei2020efficient, title={Efficient Regionalization for Spatially Explicit Neighborhood Delineation}, author={Wei, R and Rey, SJ and Knaap, E}, @@ -192,4 +217,14 @@ @article{wolf2021 } +@inproceedings{yu_shi_2003, + author={Yu, Stella X. and Shi, Jianbo}, + booktitle={Proceedings Ninth IEEE International Conference on Computer Vision}, + title={Multiclass spectral clustering}, + year={2003}, + volume={1}, + pages={313-319}, + doi={10.1109/ICCV.2003.1238361}} + + %====== diff --git a/spopt/region/spenc.py b/spopt/region/spenc.py index 44ab80ab..c92f7805 100644 --- a/spopt/region/spenc.py +++ b/spopt/region/spenc.py @@ -4,11 +4,51 @@ class Spenc(BaseSpOptHeuristicSolver): """ - Spatially encouraged spectral clustering. - :cite:`wolf2018` + Spatially encouraged spectral clustering found in :cite:`wolf2018`. + + Apply clustering to a projection of the normalized laplacian, using + spatial information to constrain the clustering. In practice Spectral + Clustering is very useful when the structure of the individual clusters + is highly non-convex or more generally when a measure of the center and + spread of the cluster is not a suitable description of the complete cluster. + For instance when clusters are nested circles on the 2D plan. + + Spatially-Encouraged Spectral Clustering (*SPENC*) is useful for when + there may be highly non-convex clusters or clusters with irregular + topology in a geographic context. If a binary weights matrix is provided + during fit, this method can be used to find weighted normalized graph cuts. + + When calling ``fit``, an affinity matrix is constructed using either + kernel function such the Gaussian (aka RBF) kernel of the euclidean + distanced :math:`d(X, X)`:: + + numpy.exp(-gamma * d(X,X) ** 2) + + or a :math:`k`-nearest neighbors connectivity matrix. Alternatively, + using ``precomputed``, a user-provided affinity matrix can be used. + Read more in the ``scikit-learn`` user guide on spectral clustering. + """ - def __init__(self, gdf, w, attrs_name, n_clusters=5, random_state=None, gamma=1): + def __init__( + self, + gdf, + w, + attrs_name, + n_clusters=5, + random_state=None, + gamma=1, + eigen_solver=None, + n_init=10, + affinity="rbf", + n_neighbors=10, + eigen_tol=1e-9, + assign_labels="discretize", + degree=3, + coef0=1, + kernel_params=None, + n_jobs=1, + ): """ Parameters @@ -16,38 +56,152 @@ def __init__(self, gdf, w, attrs_name, n_clusters=5, random_state=None, gamma=1) gdf : geopandas.GeoDataFrame Input data. - w : libpywal.weights.W Spatial weights matrix. - attrs_name : list - Strings for attribute names - (columns of ``geopandas.GeoDataFrame``). + Strings for attribute names from columns in ``gdf``. + n_clusters : int (default 5) + The number of clusters to form. + random_state : int or numpy.random.RandomState (default None) + A pseudo random number generator used for the initialization of the lobpcg + eigen vectors decomposition when ``eigen_solver='amg'`` and by the + :math:`k`-Means initialization. If ``int``, ``random_state`` is the seed + used by the random number generator; If ``numpy.random.RandomState``, + ``random_state`` is the random number generator; If ``None``, + the random number generator is the numpy.random.RandomState + instance used by ``numpy.random``. + gamma : int, float (default 1) + Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels. + Ignored for ``affinity='nearest_neighbors'``. + eigen_solver : str (default None) + The eigenvalue decomposition strategy to use. Valid values include + ``{'arpack', 'lobpcg', 'amg'}``. AMG requires ``pyamg`` to be installed, + which may be faster on very large, sparse problems, but may also lead to + instabilities. *Note* – ``eigen_solver`` is ignored unless fitting using + the ``breakme`` flag in the ``.fit()`` method (so do not use then). + n_init : int (default 10) + The number of times the :math:`k`-means algorithm will be run with + different centroid seeds. The final results will be the best output of + ``n_init`` consecutive runs in terms of inertia. + affinity : str, array-like, callable (default 'rbf') + If a ``str``, valid values include + ``{'nearest_neighbors', 'precomputed', 'rbf'}`` or one of the kernels + supported by ``sklearn.metrics.pairwise_kernels``. Only kernels that + produce similarity scores (non-negative values that increase with + similarity) should be used. *This property is not checked + by the clustering algorithm*. + n_neighbors : int (default 10) + The number of neighbors to use when constructing the affinity matrix using + the nearest neighbors method. Ignored for ``affinity='rbf'``. + eigen_tol : float (default 1e-7) + Stopping criterion for eigen decomposition of the Laplacian matrix + when using ``'arpack'`` as the ``eigen_solver``. + assign_labels : str (default 'discretize') + The strategy to use to assign labels in the embedding + space. There are three ways to assign labels after the laplacian + embedding: ``{'kmeans', 'discretize', 'hierarchical'}``: - n_clusters : int - The number of clusters to form. Default is ``5``. + * ``'kmeans'`` can be applied and is a popular choice. But it can also be sensitive to initialization. + * ``'discretize'`` is another approach which is less sensitive to random initialization, and which usually finds better clusters. + * ``'hierarchical'`` decomposition repeatedly bi-partitions the graph, instead of finding the decomposition all at once, as suggested in :cite:`shi_malik_2000`. - random_state : int - Random seed to set for reproducible results. - Default is ``None``. + degree : float (default 3) + Degree of the polynomial affinity kernel. Ignored by other kernels. + coef0 : float (default 1) + Zero coefficient for polynomial and sigmoid affinity kernels. + Ignored by other kernels. + kernel_params : dict (default None) + Parameters (keyword arguments) and values for affinity kernel passed as + callable object. Ignored by other affinity kernels. + n_jobs : int (default 1) + The number of parallel jobs to run for the nearest-neighbors + affinity kernel, if used. If ``-1``, then the number of jobs + is set to the number of CPU cores. - gamma: int - Default is ``1``. + Attributes + ---------- + + affinity_matrix_ : array-like + Affinity matrix used for clustering in the shape of + ``(n_samples, n_samples)``. Available only if after calling ``fit``. + labels_ : list + Cluster labels of each point or area. + + Notes + ----- + + If you have an affinity matrix, such as a distance matrix, + for which ``0`` means identical elements, and high values mean + very dissimilar elements, it can be transformed in a + similarity matrix that is well suited for the algorithm by + applying the Gaussian (RBF, heat) kernel:: + + numpy.exp(-dist_matrix ** 2 / (2. * delta ** 2)) + + Where ``delta`` is a free parameter representing the width of the Gaussian + kernel. + + Another alternative is to take a symmetric version of the + :math:`k`-nearest neighbors connectivity matrix of the points/areas. + + References + ---------- + + - :cite:`shi_malik_2000` Normalized cuts and image segmentation, 2000 + Jianbo Shi, Jitendra Malik – https://doi.org/10.1109/34.868688 + + - :cite:`von2007tutorial` A Tutorial on Spectral Clustering, 2007 + Ulrike von Luxburg – https://doi.org/10.1007/s11222-007-9033-z + + - :cite:`yu_shi_2003` Multiclass spectral clustering, 2003 + Stella X. Yu, Jianbo Shi – https://doi.org/10.1109/ICCV.2003.1238361 + + """ # noqa E402 - """ self.gdf = gdf self.w = w self.attrs_name = attrs_name self.n_clusters = n_clusters self.gamma = gamma self.random_state = random_state + self.eigen_solver = eigen_solver + self.n_init = n_init + self.affinity = affinity + self.n_neighbors = n_neighbors + self.eigen_tol = eigen_tol + self.assign_labels = assign_labels + self.degree = degree + self.coef0 = coef0 + self.kernel_params = kernel_params + self.n_jobs = n_jobs + + def solve(self, fit_kwargs=dict()): + """Solve the spenc. + + Parameters + ---------- + + fit_kwargs : dict + Keyword arguments passed into ``spenclib.abstracts.SPENC.fit()``. + + """ - def solve(self): - """Solve the spenc""" data = self.gdf X = data[self.attrs_name].values model = SPENC( - n_clusters=self.n_clusters, random_state=self.random_state, gamma=self.gamma + n_clusters=self.n_clusters, + random_state=self.random_state, + gamma=self.gamma, + eigen_solver=self.eigen_solver, + n_init=self.n_init, + affinity=self.affinity, + n_neighbors=self.n_neighbors, + eigen_tol=self.eigen_tol, + assign_labels=self.assign_labels, + degree=self.degree, + coef0=self.coef0, + kernel_params=self.kernel_params, + n_jobs=self.n_jobs, ) - model.fit(X, self.w.sparse) + model.fit(X, self.w.sparse, **fit_kwargs) self.labels_ = model.labels_ diff --git a/spopt/region/spenclib/abstracts.py b/spopt/region/spenclib/abstracts.py index 3361619a..e614c266 100644 --- a/spopt/region/spenclib/abstracts.py +++ b/spopt/region/spenclib/abstracts.py @@ -53,115 +53,100 @@ def __init__( kernel function such the Gaussian (aka RBF) kernel of the euclidean distanced ``d(X, X)``:: - np.exp(-gamma * d(X,X) ** 2) + numpy.exp(-gamma * d(X,X) ** 2) - or a k-nearest neighbors connectivity matrix. + or a :math:`k`-nearest neighbors connectivity matrix. Alternatively, using ``precomputed``, a user-provided affinity - matrix can be used. - - Read more in the scikit-learn user guide on spectral clustering + matrix can be used. Read more in the ``scikit-learn`` user guide + on spectral clustering. Parameters ----------- - n_clusters : integer, optional - The number of clusters to search for. - - eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'} - NOTE: ignored unless fitting using the `breakme` flag. So, do not use. - The eigenvalue decomposition strategy to use. AMG requires pyamg - to be installed. It can be faster on very large, sparse problems, - but may also lead to instabilities - - random_state : int, RandomState instance or None, optional, default: None - A pseudo random number generator used for the initialization of the - lobpcg eigen vectors decomposition when eigen_solver == 'amg' and by - the K-Means initialization. If int, random_state is the seed used by - the random number generator; If RandomState instance, random_state is - the random number generator; If None, the random number generator is - the RandomState instance used by `np.random`. - - n_init : int, optional, default: 10 - Number of time the k-means algorithm will be run with different - centroid seeds. The final results will be the best output of - n_init consecutive runs in terms of inertia. - + n_clusters : int (default 5) + The number of clusters to form. + eigen_solver : str (default None) + The eigenvalue decomposition strategy to use. Valid values include + ``{'arpack', 'lobpcg', 'amg'}``. AMG requires ``pyamg`` to be installed, + which may be faster on very large, sparse problems, but may also lead to + instabilities. *Note* – ``eigen_solver`` is ignored unless fitting using + the ``breakme`` flag in the ``.fit()`` method (so do not use then). + random_state : int or numpy.random.RandomState (default None) + A pseudo random number generator used for the initialization of the lobpcg + eigen vectors decomposition when ``eigen_solver='amg'`` and by the + :math:`k`-Means initialization. If ``int``, ``random_state`` is the seed + used by the random number generator; If ``numpy.random.RandomState``, + ``random_state`` is the random number generator; If ``None``, + the random number generator is the numpy.random.RandomState + instance used by ``numpy.random``. + n_init : int (default 10) + The number of times the :math:`k`-means algorithm will be run with + different centroid seeds. The final results will be the best output of + ``n_init`` consecutive runs in terms of inertia. gamma : float, default=1.0 Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels. Ignored for ``affinity='nearest_neighbors'``. - - affinity : string, array-like or callable, default 'rbf' - If a string, this may be one of 'nearest_neighbors', 'precomputed', - 'rbf' or one of the kernels supported by - `sklearn.metrics.pairwise_kernels`. - - Only kernels that produce similarity scores (non-negative values that - increase with similarity) should be used. This property is not checked - by the clustering algorithm. - - n_neighbors : integer - Number of neighbors to use when constructing the affinity matrix using + affinity : str, array-like, callable (default 'rbf') + If a ``str``, valid values include + ``{'nearest_neighbors', 'precomputed', 'rbf'}`` or one of the kernels + supported by ``sklearn.metrics.pairwise_kernels``. Only kernels that + produce similarity scores (non-negative values that increase with + similarity) should be used. *This property is not checked + by the clustering algorithm*. + n_neighbors : int (default 10) + The number of neighbors to use when constructing the affinity matrix using the nearest neighbors method. Ignored for ``affinity='rbf'``. - - eigen_tol : float, optional, default: 1e-7 - Stopping criterion for eigendecomposition of the Laplacian matrix - when using arpack eigen_solver. - - assign_labels : {'kmeans', 'discretize', 'hierarchical'}, default: 'discretize' + eigen_tol : float (default 1e-7) + Stopping criterion for eigen decomposition of the Laplacian matrix + when using ``'arpack'`` as the ``eigen_solver``. + assign_labels : str (default 'discretize') The strategy to use to assign labels in the embedding space. There are three ways to assign labels after the laplacian - embedding. - 1. k-means can be applied and is a popular choice. But it can - also be sensitive to initialization. - 2. Discretization is another approach which is less sensitive to - random initialization, and which usually finds better clusters. - 3. Hierarchical decomposition repeatedly bi-partitions the graph, - instead of finding the decomposition all at once, as suggested in - Shi & Malik (2000). - - degree : float, default=3 - Degree of the polynomial affinity kernel. Ignored by other kernels. + embedding: ``{'kmeans', 'discretize', 'hierarchical'}``: + + * ``'kmeans'`` can be applied and is a popular choice. But it can also be sensitive to initialization. + * ``'discretize'`` is another approach which is less sensitive to random initialization, and which usually finds better clusters. + * ``'hierarchical'`` decomposition repeatedly bi-partitions the graph, instead of finding the decomposition all at once, as suggested in :cite:`shi_malik_2000`. - coef0 : float, default=1 + degree : float (default 3) + Degree of the polynomial affinity kernel. Ignored by other kernels. + coef0 : float (default 1) Zero coefficient for polynomial and sigmoid affinity kernels. Ignored by other kernels. - - kernel_params : dictionary of string to any, optional + kernel_params : dict (default None) Parameters (keyword arguments) and values for affinity kernel passed as callable object. Ignored by other affinity kernels. - - n_jobs : int, optional (default = 1) + n_jobs : int (default 1) The number of parallel jobs to run for the nearest-neighbors - affinity kernel, if used. - If ``-1``, then the number of jobs is set to the number of CPU cores. + affinity kernel, if used. If ``-1``, then the number of jobs + is set to the number of CPU cores. Attributes ---------- - affinity_matrix_ : array-like, shape (n_samples, n_samples) - Affinity matrix used for clustering. Available only if after calling - ``fit``. - - labels_ : - Labels of each point + affinity_matrix_ : array-like + Affinity matrix used for clustering in the shape of + ``(n_samples, n_samples)``. Available only if after calling ``fit``. + labels_ : list + Cluster labels of each point or area. Notes ----- If you have an affinity matrix, such as a distance matrix, - for which 0 means identical elements, and high values means + for which ``0`` means identical elements, and high values mean very dissimilar elements, it can be transformed in a similarity matrix that is well suited for the algorithm by applying the Gaussian (RBF, heat) kernel:: - np.exp(- dist_matrix ** 2 / (2. * delta ** 2)) + numpy.exp(-dist_matrix ** 2 / (2. * delta ** 2)) Where ``delta`` is a free parameter representing the width of the Gaussian kernel. - Another alternative is to take a symmetric version of the k - nearest neighbors connectivity matrix of the points. + Another alternative is to take a symmetric version of the + :math:`k`-nearest neighbors connectivity matrix of the points/areas. References ---------- @@ -179,6 +164,7 @@ def __init__( http://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf """ + self.n_clusters = n_clusters self.eigen_solver = eigen_solver self.random_state = random_state @@ -223,7 +209,6 @@ def fit( shift_invert : bool, default True boolean governing whether or not to use shift-invert trick to finding sparse eigenvectors - breakme : bool, default False Whether or not to simply pipe down to the sklearn spectral clustering class. Will likely break the formal guarantees