-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclustering_algs.py
384 lines (329 loc) · 14 KB
/
clustering_algs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
# Michael Tripp
# 5/1/2023 | CS614
# Lab 3: Unsupervised Learning
#
# Program to implement two types of unsupervised learning
# algorithms, agglomerative clustering and DBSCAN. See
# README.md for more information on how these are implemented
# outside of the code comments. See also results_discussion.pdf
# for analysis and discussion of the results.
import pprint
import math
import copy
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
'''
Function that calculates the euclidian distance
between two points p1 and p2
'''
def euclidian_distance(p1, p2):
if len(p1) != len(p2):
print("Error: p1 and p2 are not the same dimension")
summation = 0
n = len(p1)
for i in range(0, n):
summation += (float(p1[i]) - float(p2[i]))**2
return math.sqrt(summation)
'''
Method used to calculate the distance between all points in data
using a distance function d and return these distances in a list.
'''
def calc_distances(data, d):
distances = []
n = len(data)
for i in range(n):
row = []
for j in range(n):
if i == j:
dist = 0
else:
dist = d(data[i][0], data[j][0])
row.append(dist)
distances.append(row)
return distances
'''
Method used to plot 4D data in a 2D plane given x_index and y_index
as the indices of which features to plot. For higher dimensional data lists.
'''
def plot_data(data, x_index, y_index, show_plot):
x = []
y = []
for i in range(len(data)):
x_row = []
y_row = []
for j in range(len(data[i])):
x_row.append(data[i][j][x_index])
y_row.append(data[i][j][y_index])
x.append(x_row)
y.append(y_row)
plt.xlabel('x - axis')
plt.ylabel('y - axis')
for i in range(len(data)):
plt.scatter(x[i], y[i])
if show_plot:
plt.show()
'''
Method used to plot 4D data in a 2D plane given x_index and y_index
as the indices of which features to plot. For 1D data lists.
'''
def plot_data_1D(data, x_index, y_index):
x = []
y = []
for i in range(len(data)):
x.append(data[i][x_index])
y.append(data[i][y_index])
plt.xlabel('x - axis')
plt.ylabel('y - axis')
plt.scatter(x, y, color='grey')
plt.show()
'''
Single Link Agglomerative Clustering
- cluster_map : data to be clustered
- d : distance function
- growth_rate : rate at which radius around each data point increases
- x_axis/y_axis : indicate which features to plot as the axes
'''
def agglomerative_clustering(cluster_map, d, growth_rate, x_axis, y_axis):
radius = 0.0
num_clusters = len(cluster_map) # Used to partition later at 3 clusters
distances = calc_distances(cluster_map, d) # Store distance data between all points once at start for use later
n = len(cluster_map)
partition_at_3_clusters = []
partition_with_longest_time = []
greatest_time = 0 # Greatest amount of time spent at a given number of clusters
time = 0 # Time spent at current number of clusters
while num_clusters > 1:
radius += growth_rate # Increase radius at each time step
time += 1 # Increment time to track current time step
# For each point cluster_map[i], cluster points that are within radius
for i in range(n):
for j in range(i+1, n):
if distances[i][j] <= radius:
q = i
r = j
# Find what cluster each point is actually in by following the map
# (Since points already clustered are replaced by the cluster index they were merged into)
while type(cluster_map[q]) == int:
q = cluster_map[q]
while type(cluster_map[r]) == int:
r = cluster_map[r]
if q != r:
'''
Combine two clusters into each other
'''
# Start by saving partitions according to cluster number/time spent
if num_clusters == 3 or time > greatest_time:
clusters = []
# Save clusters in dictionary in a list to be stored as partitions
for k in range(len(cluster_map)):
if type(cluster_map[k]) != int:
clusters.append(cluster_map[k])
# Save partitions accordingly
if num_clusters == 3:
partition_at_3_clusters = copy.deepcopy(clusters)
if time > greatest_time:
greatest_time = time
partition_with_longest_time = copy.deepcopy(clusters)
time = 0
# Combine clusters
for k in range(len(cluster_map[r])):
cluster_map[q].append(cluster_map[r][k])
cluster_map[r] = q
# Finally, decrement number of clusters
num_clusters -= 1
plt.figure(num_clusters)
#plot_data(partition_at_3_clusters, x_axis, y_axis)
plot_data(partition_with_longest_time, x_axis, y_axis, True)
'''
Method used to convert a list of indices to a list containing the actual
data entries corresponding to those indices. For higher dimensional lists.
'''
def indicies_to_data(data, data_indices):
l = []
for i in range(len(data_indices)):
row = []
for j in range(len(data_indices[i])):
row.append(data[data_indices[i][j]])
l.append(row)
return l
'''
Method used to convert a list of indices to a list containing the actual
data entries corresponding to those indices. For 1D lists.
'''
def indicies_to_data2(data, data_indices):
l = []
for i in data_indices:
l.append(data[i])
return l
'''
Function to implement DBSCAN
- data : data to be clustered
- d : distance function
- eps : search distance epsilon
- minPts : minimum features per cluster
- x_axis/y_axis : indicate which features to plot as the axes
'''
def DBSCAN(data, d, eps, minPts, x_axis, y_axis):
n = len(data)
all_Nx_indices = []
core_pt_indices = []
noise_indices = []
clusters = []
''' 1. Find all core points '''
# For all points x, find and store its corresponding Nx list
for i in range(n):
x = data[i]
Nx_indices = []
for j in range(i+1, n):
y = data[j]
if d(x, y) <= eps:
Nx_indices.append(j) # If distance x and y are within the search distance, add y to Nx
if len(Nx_indices) > minPts:
core_pt_indices.append(i) # Mark x as a core point if there are enough points in Nx
all_Nx_indices.append(Nx_indices)
''' 2. Find all core points reachable from other core points
and merge accordingly '''
m = len(core_pt_indices)
# For each core point core_pt1, find which core points are reachable and merge as necessary
for i in range(m):
core_pt1_index = core_pt_indices[i]
core_pt1 = data[core_pt1_index]
for j in range(i+1, m):
core_pt2_index = core_pt_indices[j]
core_pt2 = data[core_pt2_index]
# Join both core points into the same cluster if within distance
if d(core_pt1, core_pt2) <= eps:
cluster_pt1 = -1 # Stores which cluster pt1 is in (-1 if not in a cluster)
cluster_pt2 = -1 # Same for pt2
# Find if either are already in a cluster and store which cluster they are in if so
for k in range(len(clusters)):
if core_pt1_index in clusters[k]:
cluster_pt1 = k
if core_pt2_index in clusters[k]:
cluster_pt2 = k
if cluster_pt1 != -1 and cluster_pt2 != -1:
break
if cluster_pt1 != -1 and cluster_pt1 == cluster_pt2:
# Both core points are already in the same cluster
break
elif cluster_pt1 == -1 and cluster_pt2 == -1:
# Neither core point is already in a cluster, so add them both to a new cluster
clusters.append([core_pt1_index, core_pt2_index])
elif cluster_pt1 != -1 and cluster_pt2 != -1:
# Both core points are in clusters already, so merge the smaller cluster into the bigger one
if len(clusters[cluster_pt1]) < len(clusters[cluster_pt2]):
clusters[cluster_pt2].extend(clusters[cluster_pt1])
clusters.pop(cluster_pt1)
else:
clusters[cluster_pt1].extend(clusters[cluster_pt2])
clusters.pop(cluster_pt2)
elif cluster_pt1 != -1:
# Only core point 1 is in a cluster, so merge core point 2 into core point 1's cluster
clusters[cluster_pt1].append(core_pt2_index)
else:
# Only core point 2 is in a cluster, so merge core point 1 into core point 2's cluster
clusters[cluster_pt2].append(core_pt1_index)
# Add in remaining core points as clusters of their own
for i in range(m):
clustered = False
for j in range(len(clusters)):
# Search through all clusters and check if core point is in a cluster already
if core_pt_indices[i] in clusters[j]:
clustered = True
break
if not clustered:
# Add core point as its own individual cluster if not in any cluster already
clusters.append([core_pt_indices[i]])
''' 3. For each non-core point z, join into cluster or assign as noise '''
for z in range(n):
if z in core_pt_indices:
continue
is_noise = True
for j in range(len(core_pt_indices)):
# Check if z is in neighborhood Nx of any core point x
x_index = core_pt_indices[j]
Nx = all_Nx_indices[x_index]
if z in Nx:
# Add z to x's cluster (loop through to find what cluster is x's)
for k in range(len(clusters)):
if x_index in clusters[k]:
clusters[k].append(z)
break
is_noise = False
if is_noise:
noise_indices.append(z)
# Plot each cluster in a different color and noise in grey
clusters_data = indicies_to_data(data, clusters)
plot_data(clusters_data, x_axis, y_axis, False)
noise_data = indicies_to_data2(data, noise_indices)
plot_data_1D(noise_data, x_axis, y_axis)
def plot_labeled_data(labeled_data):
x = []
y = []
for i in range(len(labeled_data)):
x_row = []
y_row = []
for j in range(len(labeled_data[i])):
x_row.append(labeled_data[i][j][x_axis])
y_row.append(labeled_data[i][j][y_axis])
x.append(x_row)
y.append(y_row)
plt.xlabel('x - axis')
plt.ylabel('y - axis')
fig, ax = plt.subplots()
plt.scatter(x[0], y[0], label = 'Iris-setosa')
plt.scatter(x[1], y[1], label = 'Iris-versicolor')
plt.scatter(x[2], y[2], label = 'Iris-virginica')
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles=handles, loc = 'lower right')
plt.show()
if __name__ == '__main__':
'''
FILE NAME: Set file name path for data here
'''
fname = "./data/iris.data"
fin = open(fname)
data_map = {}
data_list = []
labeled_data = [[], [], []]
clusternum = 0
AGGLOM_ALG = 0
DBSCAN_ALG = 1
PLOT_LABELS = 2
# Store data from file
for line in fin:
if line != "" and line != "\n":
line = ((line.strip()).split(","))
label = line[4]
line = line[:4]
line = list(map(float, line))
# Collect labeled data for later comparison
if label == "Iris-setosa":
labeled_data[0].append(line)
elif label == "Iris-versicolor":
labeled_data[1].append(line)
else:
labeled_data[2].append(line)
# Store data in a list for DBSCAN and in a dictionary for agglomerative clustering
data_list.append(line)
data_map[clusternum] = [line]
clusternum += 1
'''
PARAMETERS: Set specific parameters here!
'''
# Agglomerative clustering
growth_rate = 0.01
# DBSCAN
search_distance = 0.6
minPts = 10
# indicate which features to plot as the axes
x_axis = 0
y_axis = 3
ALG = DBSCAN_ALG # <-- Choose what algorithm to run
if ALG == DBSCAN_ALG:
DBSCAN(data_list, euclidian_distance, search_distance, minPts, x_axis, y_axis)
elif ALG == AGGLOM_ALG:
agglomerative_clustering(data_map, euclidian_distance, growth_rate, x_axis, y_axis)
elif ALG == PLOT_LABELS:
plot_labeled_data(labeled_data)