-
Notifications
You must be signed in to change notification settings - Fork 0
/
demo5.py
154 lines (136 loc) · 4.75 KB
/
demo5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# -*- coding: UTF-8 -*-
from collections import defaultdict
from random import uniform
from math import sqrt
def read_points(path=r'D:\debt.csv'):
    """Load a comma-separated file of numbers as a list of float rows.

    Args:
        path: Location of the CSV file. Defaults to the location the script
            has always read from, so existing zero-argument calls still work.

    Returns:
        A list holding one list of floats per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    dataset = []
    # The ``with`` statement closes the file; no explicit close() is needed.
    with open(path, 'r', encoding='utf8') as file:
        for line in file:
            stripped = line.strip()
            # Skip empty and whitespace-only lines instead of failing on
            # float('').
            if not stripped:
                continue
            dataset.append([float(field) for field in stripped.split(',')])
    return dataset
def write_results(listResult, dataset, k, path=r'D:\demo1.csv'):
    """Append the row indices of every cluster to a text report.

    For each cluster ``0 .. k-1`` a ``CLASSINFO:<n>`` header (1-based) is
    written, followed by one row index per line and a blank line. Two more
    newlines terminate the report so that successive runs stay separated.

    Args:
        listResult: Sequence whose ``i``-th item holds the integer row
            indices assigned to cluster ``i``; needs at least ``k`` items.
        dataset: Unused; kept so existing callers keep working.
        k: Number of clusters to write.
        path: Output file, opened in append mode. Defaults to the location
            the script has always written to.

    Raises:
        OSError: If the file cannot be opened for appending.
        IndexError: If ``listResult`` has fewer than ``k`` items.
    """
    # The ``with`` statement closes the file; no explicit close() is needed.
    with open(path, 'a') as file:
        for kind in range(k):
            file.write("CLASSINFO:%d\n" % (kind + 1))
            file.writelines('%d\n' % j for j in listResult[kind])
            file.write('\n')
        file.write('\n\n')
def point_avg(points):
    """Return the per-dimension arithmetic mean of a group of points.

    Args:
        points: Non-empty sequence of numeric sequences. The number of
            dimensions is taken from the first point.

    Returns:
        A list of floats, one mean per dimension, each rounded to eight
        decimal places.

    Raises:
        ValueError: If ``points`` is empty.
    """
    if not points:
        raise ValueError("point_avg() requires at least one point")
    count = float(len(points))
    return [
        # Round-trip through "%.8f" so the centre is kept at 8 decimals.
        float("%.8f" % (sum(p[dimension] for p in points) / count))
        for dimension in range(len(points[0]))
    ]
def update_centers(data_set, assignments, k):
    """Recompute cluster centres from the current point assignments.

    Args:
        data_set: Sequence of points.
        assignments: Cluster index for each point, parallel to ``data_set``.
        k: Number of cluster indices (``0 .. k-1``) to consider.

    Returns:
        A list with the mean point of every non-empty cluster, in cluster
        index order. Clusters without members are skipped, so the result
        may contain fewer than ``k`` centres.
    """
    members = defaultdict(list)
    for label, row in zip(assignments, data_set):
        members[label].append(row)
    # Look up every index below k; an index nobody was assigned to yields an
    # empty group, which is filtered out below.
    groups = (members[index] for index in range(k))
    return [point_avg(group) for group in groups if group]
def assign_points(data_points, centers):
    """Label every point with the index of its closest centre.

    Args:
        data_points: Sequence of points.
        centers: Sequence of candidate centre points.

    Returns:
        A list, parallel to ``data_points``, of indices into ``centers``.
        Ties go to the lowest index; index 0 is also used when no centre
        compares closer than infinity.
    """
    labels = []
    for row in data_points:
        best_index, best_distance = 0, float('inf')
        for index, center in enumerate(centers):
            candidate = distance(row, center)
            if candidate < best_distance:
                best_index, best_distance = index, candidate
        labels.append(best_index)
    # NOTE: a disabled check used to abort here, asking the user to re-run
    # the program, whenever fewer distinct labels than centres were produced
    # (i.e. some randomly generated centre attracted no points).
    return labels
def distance(a, b):
    """Return the Euclidean distance between two points.

    Args:
        a: Sequence of numbers; its length fixes how many dimensions are
            compared.
        b: Sequence of numbers with at least ``len(a)`` items; any extra
            trailing items are ignored.

    Returns:
        The distance as a float (``0.0`` when ``a`` is empty).

    Raises:
        IndexError: If ``b`` has fewer items than ``a``.
    """
    # Index into ``b`` rather than zip() so that a too-short ``b`` is still
    # reported instead of being silently truncated.
    return sqrt(sum((x - b[i]) ** 2 for i, x in enumerate(a)))
def generate_k(data_set, k):
    """Draw ``k`` random starting centres inside the data's bounding box.

    Args:
        data_set: Non-empty sequence of points. The number of dimensions is
            taken from the first point.
        k: Number of centres to generate.

    Returns:
        A list of ``k`` points. Each coordinate is drawn uniformly between
        the minimum and maximum seen in that dimension and rounded to eight
        decimal places.

    Raises:
        ValueError: If ``data_set`` is empty.
    """
    if not data_set:
        raise ValueError("generate_k() requires a non-empty data set")
    dimensions = len(data_set[0])
    lows = [min(point[i] for point in data_set) for i in range(dimensions)]
    highs = [max(point[i] for point in data_set) for i in range(dimensions)]

    centers = []
    for _ in range(k):
        # One uniform() draw per dimension, in dimension order, per centre.
        centers.append([
            float("%.8f" % uniform(low, high))
            for low, high in zip(lows, highs)
        ])
    return centers
def k_means(dataset, k):
    """Cluster ``dataset`` into at most ``k`` groups and report the outcome.

    Starting from random centres, points are reassigned and centres are
    recomputed until a pass leaves every assignment unchanged. The function
    then prints the labelled points, the size of the first cluster, the row
    indices per cluster (25 per line) and each cluster's share of the data,
    and appends the row indices to the results file via ``write_results``.

    Args:
        dataset: Sequence of numeric points.
        k: Number of clusters to request; must be at least 1.

    Returns:
        None. Results are printed and written to disk.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    k_points = generate_k(dataset, k)
    assignments = assign_points(dataset, k_points)
    old_assignments = None
    # Iterate until the assignments reach a fixed point.
    while assignments != old_assignments:
        new_centers = update_centers(dataset, assignments, k)
        old_assignments = assignments
        assignments = assign_points(dataset, new_centers)

    result = list(zip(assignments, dataset))
    result_len = len(result)
    print(result_len)
    print('\n\n---------------------------------分类结果---------------------------------------\n\n')
    for out in result:
        print(out, end='\n')
    print('\n\n---------------------------------标号简记---------------------------------------\n\n')

    # listResult[c] collects the row indices assigned to cluster c.
    listResult = [[] for _ in range(k)]
    for row_index, cluster in enumerate(assignments):
        listResult[cluster].append(row_index)
    print(len(listResult[0]))
    write_results(listResult, dataset, k)

    for kind in range(k):
        print("第%d类数据对应的行数为:" % (kind + 1))
        for count, j in enumerate(listResult[kind], start=1):
            print(j, end=' ')
            # Break the listing after every 25 indices.
            if count % 25 == 0:
                print('\n')
        print('\n')
    print('\n\n--------------------------------------------------------------------------------\n\n')

    # One share line per cluster, so any k works (not just k == 3).
    for kind in range(k):
        percent = round(len(listResult[kind]) * 100 / result_len, 2)
        print("第" + _cluster_ordinal(kind + 1) + "类数据占总数的:" + str(percent) + "%")


def _cluster_ordinal(number):
    """Return the numeral shown in a cluster's share line.

    Cluster numbers 1-10 use Chinese numerals, matching the wording the
    report has always used for the first three clusters; larger numbers
    fall back to decimal digits.
    """
    numerals = "一二三四五六七八九十"
    if 1 <= number <= len(numerals):
        return numerals[number - 1]
    return str(number)
def main():
    """Load the points from disk and cluster them into three groups."""
    k_means(read_points(), 3)
if __name__ == "__main__":
    # Run the clustering only when executed as a script, not on import.
    main()