-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsafire_kmeans.py
executable file
·154 lines (117 loc) · 4.03 KB
/
safire_kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# Code source: https://github.com/subhadarship/kmeans_pytorch
import numpy as np
import torch
def initialize(X, num_clusters):
"""
initialize cluster centers
:param X: (torch.tensor) matrix
:param num_clusters: (int) number of clusters
:return: (np.array) initial state
"""
num_samples = len(X)
indices = np.random.choice(num_samples, num_clusters, replace=False)
initial_state = X[indices]
return initial_state
def kmeans(
X,
num_clusters,
distance='euclidean',
tol=1e-4,
device=torch.device('cpu')
):
"""
perform kmeans
:param X: (torch.tensor) matrix
:param num_clusters: (int) number of clusters
:param distance: (str) distance [options: 'euclidean', 'cosine'] [default: 'euclidean']
:param tol: (float) threshold [default: 0.0001]
:param device: (torch.device) device [default: cpu]
:return: (torch.tensor, torch.tensor) cluster ids, cluster centers
"""
if distance == 'euclidean':
pairwise_distance_function = pairwise_distance
elif distance == 'cosine':
pairwise_distance_function = pairwise_cosine
else:
raise NotImplementedError
# convert to float
X = X.float()
# transfer to device
X = X.to(device)
# initialize
initial_state = initialize(X, num_clusters)
iteration = 0
while True:
dis = pairwise_distance_function(X, initial_state)
choice_cluster = torch.argmin(dis, dim=1)
initial_state_pre = initial_state.clone()
for index in range(num_clusters):
selected = torch.nonzero(choice_cluster == index).squeeze().to(device)
selected = torch.index_select(X, 0, selected)
initial_state[index] = selected.mean(dim=0)
center_shift = torch.sum(
torch.sqrt(
torch.sum((initial_state - initial_state_pre) ** 2, dim=1)
))
if torch.isnan(center_shift):
break
# increment iteration
iteration = iteration + 1
if center_shift ** 2 < tol:
break
if iteration > 1000:
break
return choice_cluster.cpu(), initial_state.cpu()
def kmeans_predict(
X,
cluster_centers,
distance='euclidean',
device=torch.device('cpu')
):
"""
predict using cluster centers
:param X: (torch.tensor) matrix
:param cluster_centers: (torch.tensor) cluster centers
:param distance: (str) distance [options: 'euclidean', 'cosine'] [default: 'euclidean']
:param device: (torch.device) device [default: 'cpu']
:return: (torch.tensor) cluster ids
"""
print(f'predicting on {device}..')
if distance == 'euclidean':
pairwise_distance_function = pairwise_distance
elif distance == 'cosine':
pairwise_distance_function = pairwise_cosine
else:
raise NotImplementedError
# convert to float
X = X.float()
# transfer to device
X = X.to(device)
dis = pairwise_distance_function(X, cluster_centers)
choice_cluster = torch.argmin(dis, dim=1)
return choice_cluster.cpu()
def pairwise_distance(data1, data2, device=torch.device('cpu')):
# transfer to device
data1, data2 = data1.to(device), data2.to(device)
# N*1*M
A = data1.unsqueeze(dim=1)
# 1*N*M
B = data2.unsqueeze(dim=0)
dis = (A - B) ** 2.0
# return N*N matrix for pairwise distance
dis = dis.sum(dim=-1).squeeze()
return dis
def pairwise_cosine(data1, data2, device=torch.device('cpu')):
# transfer to device
data1, data2 = data1.to(device), data2.to(device)
# N*1*M
A = data1.unsqueeze(dim=1)
# 1*N*M
B = data2.unsqueeze(dim=0)
# normalize the points | [0.3, 0.4] -> [0.3/sqrt(0.09 + 0.16), 0.4/sqrt(0.09 + 0.16)] = [0.3/0.5, 0.4/0.5]
A_normalized = A / A.norm(dim=-1, keepdim=True)
B_normalized = B / B.norm(dim=-1, keepdim=True)
cosine = A_normalized * B_normalized
# return N*N matrix for pairwise distance
cosine_dis = 1 - cosine.sum(dim=-1).squeeze()
return cosine_dis