Skip to content

Commit

Permalink
expose SPENC kwargs (#312)
Browse files Browse the repository at this point in the history
* expose SPENC kwargs

* resolves #310
  • Loading branch information
jGaboardi authored Dec 11, 2022
1 parent f3066c8 commit ecfe2cc
Show file tree
Hide file tree
Showing 8 changed files with 268 additions and 99 deletions.
1 change: 0 additions & 1 deletion .ci/310-DEV.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ dependencies:
- pip
- scikit-learn>=0.22
- scipy>=1.0
- spenc
- spaghetti
# testing
- codecov
Expand Down
1 change: 0 additions & 1 deletion .ci/310-DEV_shapely_dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ dependencies:
- scikit-learn>=0.22
- scipy>=1.0
- shapely>=2.0b1
- spenc
- spaghetti
# testing
- codecov
Expand Down
1 change: 0 additions & 1 deletion .ci/310.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ dependencies:
- scikit-learn>=0.22
- scipy>=1.0
- shapely
- spenc
- spaghetti
# testing
- codecov
Expand Down
1 change: 0 additions & 1 deletion .ci/38.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ dependencies:
- scikit-learn>=0.22
- scipy>=1.0
- shapely
- spenc
- spaghetti
# testing
- codecov
Expand Down
1 change: 0 additions & 1 deletion .ci/39.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ dependencies:
- scikit-learn>=0.22
- scipy>=1.0
- shapely
- spenc
- spaghetti
# testing
- codecov
Expand Down
37 changes: 36 additions & 1 deletion docs/_static/references.bib
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,20 @@ @article{openshaw1995algorithms
}


@article{shi_malik_2000,
author={Jianbo Shi and Malik, J.},
journal={{IEEE Transactions on Pattern Analysis and Machine Intelligence}},
title={Normalized cuts and image segmentation},
year={2000},
volume={22},
number={8},
pages={888--905},
doi={10.1109/34.868688}
}


@article{toregas_swain_revelle_bergman_1971,
  author={Toregas, Constantine and Swain, Ralph and Revelle, Charles and Bergman, Lawrence},
title={The Location of Emergency Service Facilities},
volume={19},
journal={Operations Research},
Expand All @@ -161,6 +173,19 @@ @article{toregas_swain_revelle_bergman_1971
}


@article{von2007tutorial,
title={A tutorial on spectral clustering},
author={{von Luxburg}, Ulrike},
journal={{Statistics and Computing}},
volume={17},
number={4},
pages={395--416},
year={2007},
publisher={Springer},
doi={10.1007/s11222-007-9033-z},
}


@article{wei2020efficient,
title={Efficient Regionalization for Spatially Explicit Neighborhood Delineation},
author={Wei, R and Rey, SJ and Knaap, E},
Expand Down Expand Up @@ -192,4 +217,14 @@ @article{wolf2021
}


@inproceedings{yu_shi_2003,
author={Yu, Stella X. and Shi, Jianbo},
booktitle={Proceedings Ninth IEEE International Conference on Computer Vision},
title={Multiclass spectral clustering},
year={2003},
volume={1},
  pages={313--319},
  doi={10.1109/ICCV.2003.1238361}
}
%======
192 changes: 173 additions & 19 deletions spopt/region/spenc.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,50 +4,204 @@

class Spenc(BaseSpOptHeuristicSolver):
    """Spatially encouraged spectral clustering found in :cite:`wolf2018`.

    Apply clustering to a projection of the normalized laplacian, using
    spatial information to constrain the clustering. In practice Spectral
    Clustering is very useful when the structure of the individual clusters
    is highly non-convex or more generally when a measure of the center and
    spread of the cluster is not a suitable description of the complete
    cluster. For instance when clusters are nested circles on the 2D plane.

    Spatially-Encouraged Spectral Clustering (*SPENC*) is useful for when
    there may be highly non-convex clusters or clusters with irregular
    topology in a geographic context. If a binary weights matrix is provided
    during fit, this method can be used to find weighted normalized graph
    cuts.

    When calling ``fit``, an affinity matrix is constructed using either a
    kernel function such as the Gaussian (aka RBF) kernel of the euclidean
    distance :math:`d(X, X)`::

        numpy.exp(-gamma * d(X,X) ** 2)

    or a :math:`k`-nearest neighbors connectivity matrix. Alternatively,
    using ``precomputed``, a user-provided affinity matrix can be used.
    Read more in the ``scikit-learn`` user guide on spectral clustering.
    """

    def __init__(
        self,
        gdf,
        w,
        attrs_name,
        n_clusters=5,
        random_state=None,
        gamma=1,
        eigen_solver=None,
        n_init=10,
        affinity="rbf",
        n_neighbors=10,
        eigen_tol=1e-9,
        assign_labels="discretize",
        degree=3,
        coef0=1,
        kernel_params=None,
        n_jobs=1,
    ):
        """
        Parameters
        ----------

        gdf : geopandas.GeoDataFrame
            Input data.
        w : libpysal.weights.W
            Spatial weights matrix.
        attrs_name : list
            Strings for attribute names from columns in ``gdf``.
        n_clusters : int (default 5)
            The number of clusters to form.
        random_state : int or numpy.random.RandomState (default None)
            A pseudo random number generator used for the initialization of
            the lobpcg eigen vectors decomposition when
            ``eigen_solver='amg'`` and by the :math:`k`-Means initialization.
            If ``int``, ``random_state`` is the seed used by the random
            number generator; If ``numpy.random.RandomState``,
            ``random_state`` is the random number generator; If ``None``,
            the random number generator is the ``numpy.random.RandomState``
            instance used by ``numpy.random``.
        gamma : int, float (default 1)
            Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2
            kernels. Ignored for ``affinity='nearest_neighbors'``.
        eigen_solver : str (default None)
            The eigenvalue decomposition strategy to use. Valid values
            include ``{'arpack', 'lobpcg', 'amg'}``. AMG requires ``pyamg``
            to be installed, which may be faster on very large, sparse
            problems, but may also lead to instabilities. *Note* –
            ``eigen_solver`` is ignored unless fitting using the ``breakme``
            flag in the ``.fit()`` method (so do not use then).
        n_init : int (default 10)
            The number of times the :math:`k`-means algorithm will be run
            with different centroid seeds. The final results will be the
            best output of ``n_init`` consecutive runs in terms of inertia.
        affinity : str, array-like, callable (default 'rbf')
            If a ``str``, valid values include
            ``{'nearest_neighbors', 'precomputed', 'rbf'}`` or one of the
            kernels supported by ``sklearn.metrics.pairwise_kernels``. Only
            kernels that produce similarity scores (non-negative values that
            increase with similarity) should be used. *This property is not
            checked by the clustering algorithm*.
        n_neighbors : int (default 10)
            The number of neighbors to use when constructing the affinity
            matrix using the nearest neighbors method. Ignored for
            ``affinity='rbf'``.
        eigen_tol : float (default 1e-9)
            Stopping criterion for eigen decomposition of the Laplacian
            matrix when using ``'arpack'`` as the ``eigen_solver``.
        assign_labels : str (default 'discretize')
            The strategy to use to assign labels in the embedding space.
            There are three ways to assign labels after the laplacian
            embedding: ``{'kmeans', 'discretize', 'hierarchical'}``:

            * ``'kmeans'`` can be applied and is a popular choice. But it
              can also be sensitive to initialization.
            * ``'discretize'`` is another approach which is less sensitive
              to random initialization, and which usually finds better
              clusters.
            * ``'hierarchical'`` decomposition repeatedly bi-partitions the
              graph, instead of finding the decomposition all at once, as
              suggested in :cite:`shi_malik_2000`.

        degree : float (default 3)
            Degree of the polynomial affinity kernel. Ignored by other
            kernels.
        coef0 : float (default 1)
            Zero coefficient for polynomial and sigmoid affinity kernels.
            Ignored by other kernels.
        kernel_params : dict (default None)
            Parameters (keyword arguments) and values for affinity kernel
            passed as callable object. Ignored by other affinity kernels.
        n_jobs : int (default 1)
            The number of parallel jobs to run for the nearest-neighbors
            affinity kernel, if used. If ``-1``, then the number of jobs
            is set to the number of CPU cores.

        Attributes
        ----------

        affinity_matrix_ : array-like
            Affinity matrix used for clustering in the shape of
            ``(n_samples, n_samples)``. Available only after calling
            ``fit``.
        labels_ : list
            Cluster labels of each point or area.

        Notes
        -----

        If you have an affinity matrix, such as a distance matrix,
        for which ``0`` means identical elements, and high values mean
        very dissimilar elements, it can be transformed into a
        similarity matrix that is well suited for the algorithm by
        applying the Gaussian (RBF, heat) kernel::

            numpy.exp(-dist_matrix ** 2 / (2. * delta ** 2))

        Where ``delta`` is a free parameter representing the width of the
        Gaussian kernel.

        Another alternative is to take a symmetric version of the
        :math:`k`-nearest neighbors connectivity matrix of the points/areas.

        References
        ----------

        - :cite:`shi_malik_2000` Normalized cuts and image segmentation,
          2000. Jianbo Shi, Jitendra Malik –
          https://doi.org/10.1109/34.868688
        - :cite:`von2007tutorial` A Tutorial on Spectral Clustering, 2007.
          Ulrike von Luxburg – https://doi.org/10.1007/s11222-007-9033-z
        - :cite:`yu_shi_2003` Multiclass spectral clustering, 2003.
          Stella X. Yu, Jianbo Shi –
          https://doi.org/10.1109/ICCV.2003.1238361

        """  # noqa E402

        self.gdf = gdf
        self.w = w
        self.attrs_name = attrs_name
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state
        self.eigen_solver = eigen_solver
        self.n_init = n_init
        self.affinity = affinity
        self.n_neighbors = n_neighbors
        self.eigen_tol = eigen_tol
        self.assign_labels = assign_labels
        self.degree = degree
        self.coef0 = coef0
        self.kernel_params = kernel_params
        self.n_jobs = n_jobs

    def solve(self, fit_kwargs=None):
        """Solve the spenc.

        Parameters
        ----------

        fit_kwargs : dict (default None)
            Keyword arguments passed into
            ``spenclib.abstracts.SPENC.fit()``.

        """
        # ``None`` sentinel instead of a mutable ``dict()`` default, which
        # would be shared across all calls/instances.
        if fit_kwargs is None:
            fit_kwargs = {}
        data = self.gdf
        X = data[self.attrs_name].values
        model = SPENC(
            n_clusters=self.n_clusters,
            random_state=self.random_state,
            gamma=self.gamma,
            eigen_solver=self.eigen_solver,
            n_init=self.n_init,
            affinity=self.affinity,
            n_neighbors=self.n_neighbors,
            eigen_tol=self.eigen_tol,
            assign_labels=self.assign_labels,
            degree=self.degree,
            coef0=self.coef0,
            kernel_params=self.kernel_params,
            n_jobs=self.n_jobs,
        )
        model.fit(X, self.w.sparse, **fit_kwargs)
        self.labels_ = model.labels_
Loading

0 comments on commit ecfe2cc

Please sign in to comment.