-
Notifications
You must be signed in to change notification settings - Fork 0
/
kmeans.py
91 lines (72 loc) · 2.58 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import csv
import random
import pdb
import numpy as np
def getRandomPoint(data):
dimension = data.shape[1]
point = np.zeros((1, dimension))
for d in range(dimension):
minVal = np.amin(data[:, d])
maxVal = np.amax(data[:, d])
point[0, d] = random.random() * (maxVal - minVal) + minVal
return point
def getInitialCenters(dimension, numClusters, data):
initialCenters = np.zeros((numClusters, dimension))
# # Generate points randomly in range
# for d in range(dimension):
# minVal = np.amin(data[:, d])
# maxVal = np.amax(data[:, d])
# for c in range(numClusters):
# initialCenters[c, d] = random.random() * (maxVal - minVal) + minVal
# Choose one of the data points randomly
for c in range(numClusters):
initialCenters[c, :] = data[random.randint(0, data.shape[0]-1), :]
return initialCenters
def assignPointsToNearestCluster(data, centers):
dimension = data.shape[1]
numPoints = data.shape[0]
numClusters = centers.shape[0]
reshapedDataPoints = np.tile(data.reshape(numPoints, dimension, 1), (1, 1, numClusters))
reshapedCenters = np.tile(np.transpose(centers).reshape(1, dimension, numClusters), (numPoints, 1, 1))
calNearest = reshapedDataPoints - reshapedCenters
calNearest = np.square(calNearest)
calNearest = np.sum(calNearest, axis=1)
calNearest = np.sqrt(calNearest).reshape(numPoints, numClusters)
currentAssignment = np.argmin(calNearest, axis=1).reshape(numPoints) + 1
# So, no cluster is assigned an id 0
return currentAssignment
def runKMeans(numClusters, data):
threshold = 1e-10
dimension = data.shape[1]
numPoints = data.shape[0]
initialCenters = getInitialCenters(dimension, numClusters, data)
currentCenters = initialCenters
currentAssignment = np.zeros((numPoints, 1))
while True:
# Assign each point to its nearest center
currentAssignment = assignPointsToNearestCluster(data, currentCenters)
# Calculate the new mean of each cluster
newCenters = np.zeros((numClusters, dimension))
flag = -1
for c in range(numClusters):
numAssigned = np.count_nonzero(currentAssignment == c)
if numAssigned == 0:
flag = c
break
newCenters[c, :] = np.sum(data[currentAssignment == c, :], axis=0)/(1.0 * numAssigned)
if flag != -1:
currentCenters[flag] = data[random.randint(0, numPoints-1), :]
print 'reassigning',
continue
# Test convergence
change = (newCenters - currentCenters)
change = np.square(change)
change = np.sum(change, axis=1)
change = np.sqrt(change)
change = np.sum(change, axis=0)
if change < threshold:
break
# Else repeat
currentCenters = newCenters
return currentAssignment, newCenters