Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add inertia and silhouette score progress bars for neighborhood analysis notebook #758

Merged
merged 3 commits into from
Oct 11, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ark/spLDA/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ def compute_topic_eda(features, featurization, topics, silhouette=False, num_boo
stat_names = ['inertia', 'silhouette', 'gap_stat', 'gap_sds', 'percent_var_exp', "cell_counts"]
stats = dict(zip(stat_names, [{} for name in stat_names]))

# iterative over topic number candidates
# iterate over topic number candidates
pb_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
for k in tqdm(topics, bar_format=pb_format):
# cluster with KMeans
Expand Down
17 changes: 11 additions & 6 deletions ark/utils/spatial_analysis_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from statsmodels.stats.multitest import multipletests
from tqdm.notebook import tqdm

import ark.settings as settings
from ark.utils import io_utils, misc_utils
Expand Down Expand Up @@ -481,8 +482,8 @@ def compute_kmeans_inertia(neighbor_mat_data, min_k=2, max_k=10):

Returns:
xarray.DataArray:
contains a single dimension, cluster_num, which indicates the inertia
when cluster_num was set as k for k-means clustering
contains a single dimension, `cluster_num`, which indicates the inertia
when `cluster_num` was set as k for k-means clustering
"""

# create an array in which to store the results of each k for clustering
Expand All @@ -491,7 +492,9 @@ def compute_kmeans_inertia(neighbor_mat_data, min_k=2, max_k=10):
stats_raw_data = np.zeros(max_k - 1)
cluster_stats = xr.DataArray(stats_raw_data, coords=coords, dims=dims)

for n in range(min_k, max_k + 1):
# iterate over each k value
pb_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
for n in tqdm(range(min_k, max_k + 1), bar_format=pb_format):
cluster_fit = KMeans(n_clusters=n).fit(neighbor_mat_data)
cluster_stats.loc[n] = cluster_fit.inertia_

Expand All @@ -516,8 +519,8 @@ def compute_kmeans_silhouette(neighbor_mat_data, min_k=2, max_k=10, subsample=No

Returns:
xarray.DataArray:
contains a single dimension, cluster_num, which indicates the Silhouette score
when cluster_num was set as k for k-means clustering
contains a single dimension, `cluster_num`, which indicates the Silhouette score
when `cluster_num` was set as k for k-means clustering
"""

# create an array in which to store the results of each k for clustering
Expand All @@ -526,7 +529,9 @@ def compute_kmeans_silhouette(neighbor_mat_data, min_k=2, max_k=10, subsample=No
stats_raw_data = np.zeros(max_k - 1)
cluster_stats = xr.DataArray(stats_raw_data, coords=coords, dims=dims)

for n in range(min_k, max_k + 1):
# iterate over each k value
pb_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
for n in tqdm(range(min_k, max_k + 1), bar_format=pb_format):
cluster_fit = KMeans(n_clusters=n).fit(neighbor_mat_data)
cluster_labels = cluster_fit.labels_

Expand Down