-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathkplus.py
163 lines (116 loc) · 3.85 KB
/
kplus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import math; #For pow and sqrt
import sys;
from random import shuffle, uniform;
###_Pre-Processing_###
def ReadData(fileName):
#Read the file, splitting by lines
f = open(fileName,'r');
lines = f.read().splitlines();
f.close();
items = [];
for i in range(1,len(lines)):
line = lines[i].split(',');
itemFeatures = [];
for j in range(len(line)-1):
v = float(line[j]); #Convert feature value to float
itemFeatures.append(v); #Add feature value to dict
items.append(itemFeatures);
shuffle(items);
return items;
###_Auxiliary Function_###
def FindColMinMax(items):
n = len(items[0]);
minima = [sys.maxint for i in range(n)];
maxima = [-sys.maxint -1 for i in range(n)];
for item in items:
for f in range(len(item)):
if(item[f] < minima[f]):
minima[f] = item[f];
if(item[f] > maxima[f]):
maxima[f] = item[f];
return minima,maxima;
def EuclideanDistance(x,y):
S = 0; #The sum of the squared differences of the elements
for i in range(len(x)):
S += math.pow(x[i]-y[i],2);
return math.sqrt(S); #The square root of the sum
def InitializeMeans(items,k,cMin,cMax):
#Initialize means to random numbers between
#the min and max of each column/feature
f = len(items[0]); #number of features
means = [[0 for i in range(f)] for j in range(k)];
for mean in means:
for i in range(len(mean)):
#Set value to a random float
#(adding +-1 to avoid a wide placement of a mean)
mean[i] = uniform(cMin[i]+1,cMax[i]-1);
return means;
def UpdateMean(n,mean,item):
for i in range(len(mean)):
m = mean[i];
m = (m*(n-1)+item[i])/float(n);
mean[i] = round(m,3);
return mean;
def FindClusters(means,items):
clusters = [[] for i in range(len(means))]; #Init clusters
for item in items:
#Classify item into a cluster
index = Classify(means,item);
#Add item to cluster
clusters[index].append(item);
return clusters;
###_Core Functions_###
def Classify(means,item):
#Classify item to the mean with minimum distance
minimum = sys.maxint;
index = -1;
for i in range(len(means)):
#Find distance from item to mean
dis = EuclideanDistance(item,means[i]);
if(dis < minimum):
minimum = dis;
index = i;
return index;
def CalculateMeans(k,items,maxIterations=100000):
#Find the minima and maxima for columns
cMin, cMax = FindColMinMax(items);
#Initialize means at random points
means = InitializeMeans(items,k,cMin,cMax);
#Initialize clusters, the array to hold
#the number of items in a class
clusterSizes = [0 for i in range(len(means))];
#An array to hold the cluster an item is in
belongsTo = [0 for i in range(len(items))];
#Calculate means
for e in range(maxIterations):
#If no change of cluster occurs, halt
noChange = True;
for i in range(len(items)):
item = items[i];
#Classify item into a cluster and update the
#corresponding means.
index = Classify(means,item);
clusterSizes[index] += 1;
means[index] = UpdateMean(clusterSizes[index],means[index],item);
#Item changed cluster
if(index != belongsTo[i]):
noChange = False;
belongsTo[i] = index;
#Nothing changed, return
if(noChange):
break;
return means;
###_Main_###
'''
def main():
items = ReadData('data.txt');
k = 3;
means = CalculateMeans(k,items);
clusters = FindClusters(means,items);
print means;
print clusters;
#newItem = [5.4,3.7,1.5,0.2];
#print Classify(means,newItem);
if __name__ == "__main__":
main();
'''