-
Notifications
You must be signed in to change notification settings - Fork 1
/
clus_sub.py
186 lines (171 loc) · 4.47 KB
/
clus_sub.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import cPickle
import math

import ipaddr
import numpy as np
from numpy import arange, array, ones, linalg

from cluster import *
def hamming_distance(h1, h2, b):
    """Count differing bits between the low b bits of integers h1 and h2."""
    diff = (h1 ^ h2) & ((1 << b) - 1)  # XOR marks differing bits; mask to b bits
    count = 0
    while diff:
        diff &= diff - 1  # Kernighan's trick: clear the lowest set bit
        count += 1
    return count
def mse(data):
    """Root-mean-square error of a least-squares line fit to (index, value) points.

    data is a 1-D sequence of values; x-coordinates are implicit indices 0..len-1.
    """
    n = len(data)
    xs = arange(0, n)
    # Design matrix [x, 1] so lstsq yields (slope, intercept)
    coeffs = linalg.lstsq(array([xs, ones(n)]).T, data)[0]
    fitted = coeffs[0] * xs + coeffs[1]
    return np.sqrt(((fitted - data) ** 2).mean())
def cutoff(data, c, b):
    """Length-weighted combined fit error when splitting data at index c.

    Fits one line to data[:c] and another to data[c:]; b is len(data).
    """
    left = data[0:c]
    right = data[c:]
    # Weight each side's RMSE by its share of the points
    return float(c) / b * mse(left) + float(b - c) / b * mse(right)
def Lmethod(data):
    """Return the split index in [1, len(data)) minimizing the two-line cutoff score.

    This is the "L-method" knee-finding heuristic: the best c is where two
    straight lines fit the curve data best.
    """
    b = len(data)
    best_score = cutoff(data, 1, b)
    best_c = 0
    for c in xrange(1, b):
        score = cutoff(data, c, b)
        # <= keeps the later index on ties, matching a full left-to-right scan
        if score <= best_score:
            best_c = c
            best_score = score
    return best_c
def find_best(data):
    """Pick a clustering threshold index from a curve of cluster counts.

    data: sequence of cluster counts per threshold level (values of the
    `data` dict built in clus_sub, i.e. data[i] = number of clusters at
    threshold i+1).  Returns (index, cluster_count) — presumably the knee
    of the curve with some hand-tuned adjustments; verify against caller.
    """
    b = len(data)
    c = b
    lk = b
    ck = b
    # Iteratively refine the knee: rerun the L-method on a window of twice
    # the previous knee until the knee index stops decreasing.
    while 1:
        lk = ck
        ck = Lmethod(data[:c])
        c = ck * 2
        if ck >= lk:
            break
    # Heuristic adjustment around the knee using hard-coded anchor index 10.
    # NOTE(review): data[10] and data[ck+1] can raise IndexError when the
    # curve is short or the knee is at the last element — TODO confirm the
    # caller always supplies 49 levels (range(1, 50) in clus_sub).
    based = data[10]
    low = data[ck+1]
    high = data[ck-1]
    if ck >= 10:
        # Knee is at/after the anchor: prefer the anchor when the curve is
        # flat back to it, otherwise nudge around the knee.
        if high == based:
            return 10,based
        # Counts change by exactly 1 on both sides: a smooth region; step back one.
        if (high == data[ck] + 1) and (low == data[ck] - 1):
            return ck-1, high
        if ck > 20:
            return ck-1, high
        else:
            return ck, data[ck]
    else:
        # Knee before the anchor: take the anchor if it gives fewer clusters.
        if based < low:
            return 10, based
        return ck + 1, low
def text_sim_score(h1, h2, b=96):
    """Distance between two b-bit hashes as the percentage of differing bits."""
    bits_differing = hamming_distance(h1, h2, b)
    return 100 * float(bits_differing) / b
def text_sim(h1, h2, thres):
    """True when the bit-distance percentage between h1 and h2 is within thres."""
    return text_sim_score(h1, h2) <= thres
def simple_clustering(st_list, thres):
    """Greedy one-pass clustering of hash values.

    Repeatedly takes an arbitrary unassigned hash as a seed and absorbs every
    remaining hash within thres percent bit-distance of it.  A cluster seeded
    by hash 0 is dropped.  Returns a list of clusters (lists of hashes).
    Faster fallback for inputs too large for HierarchicalClustering.
    """
    pending = {}
    for h in st_list:
        pending[h] = 0
    clusters = {}
    while len(pending) != 0:
        seed = pending.keys()[0]  # arbitrary seed (Python 2 dict order)
        clusters[seed] = [seed]
        pending.pop(seed)
        snapshot = pending.keys()
        for cand in snapshot:
            if text_sim(seed, cand, thres):
                pending.pop(cand)
                clusters[seed].append(cand)
    # Hash value 0 is treated as invalid: discard its cluster entirely
    if 0 in clusters:
        clusters.pop(0)
    out = []
    for seed in clusters:
        out.append(clusters[seed])
    return out
def clus_sub(target_clus, st, ed,no):
    """Sub-cluster a slice of the top-level clusters and pickle the result.

    target_clus: dict mapping a top-level cluster key (a tuple) to a list of
    items; each item i is indexable with i[2] an integer hash and (i[0], i[1])
    an identifying pair — presumably (ip, port) or similar; verify against
    the producer of clus_1.db.
    st, ed: slice bounds into target_clus.keys() (Python 2 list slicing).
    no: partition number used in the output filename clus_2_<no>.db.
    """
    res = {}
    c = 0
    for k in target_clus.keys()[st:ed]:
        c = c + 1
        its = target_clus[k]
        # Group the cluster's items by their integer hash value
        tmp = {}
        for i in its:
            h = int(i[2])
            if h in tmp:
                tmp[h].append((i[0], i[1]))
            else:
                tmp[h] = [(i[0], i[1])]
        if len(tmp) == 1:
            # Single distinct hash: one sub-cluster, keyed by parent key + hash
            tmp_k = tmp.keys()[0]
            new_k = k + (tmp_k, )
            res[new_k] = tmp[tmp_k]
        else:
            # Multiple hashes: cluster them by bit-distance.  Try thresholds
            # 1..49 and pick the best level via the L-method heuristic.
            data = {}
            index = {}
            dlen = len(tmp)
            # HierarchicalClustering is O(n^2)+: use the greedy
            # simple_clustering fallback for large groups
            if dlen < 2000:
                cl = HierarchicalClustering(tmp.keys(), text_sim_score)
            for i in range(1, 50):
                if dlen >= 2000:
                    d = simple_clustering(tmp.keys(), i)
                else:
                    d = cl.getlevel(i)
                data[i] = len(d)
                # Reverse map: cluster count -> threshold producing it
                index[len(d)] = i
            m, v = find_best(data.values())
            # Re-cluster at the threshold that produced the chosen count v
            if dlen >= 2000:
                clus = simple_clustering(tmp.keys(), index[v])
            else:
                clus = cl.getlevel(index[v])
            for tmp_cls in clus:
                # Key each sub-cluster by parent key + its first hash,
                # and merge the item lists of all member hashes
                new_k = k + (tmp_cls[0], )
                tgs=[]
                for tmp_kk in tmp_cls:
                    tgs += tmp[tmp_kk]
                res[new_k] = tgs
    f = open("clus_2_%s.db"%(no), "w")
    cPickle.dump(res, f)
    f.close()
def cluster_no():
    """Assign a global integer id to every sub-cluster key and persist it.

    Reads the five partition files clus_2_0.db .. clus_2_4.db written by
    clus_sub, numbers all sub-cluster keys sequentially, and writes the
    key -> id mapping to clus_no.db.
    """
    res = []
    out = {}
    for i in range(0, 5):
        # with-statement closes the handle; the original leaked every
        # file object opened inline in cPickle.load(open(...))
        with open("clus_2_%s.db" % (i)) as f:
            r = cPickle.load(f)
        res += r.keys()
    for x in xrange(len(res)):
        out[res[x]] = x
    with open("clus_no.db", "w") as f:
        cPickle.dump(out, f)
def reverse_db():
    """Build the inverse table item -> global cluster id and persist it.

    Reads the key -> id mapping from clus_no.db and the five partition
    files, then writes a flat item -> id dict to clus_2.db.
    """
    # with-statements close every handle; the original leaked the file
    # objects opened inline in cPickle.load(open(...))
    with open("clus_no.db") as f:
        cno = cPickle.load(f)
    out = {}
    for i in range(0, 5):
        with open("clus_2_%s.db" % (i)) as f:
            r = cPickle.load(f)
        for k in r:
            no = cno[k]
            for it in r[k]:
                out[it] = no
    with open("clus_2.db", "w") as f:
        cPickle.dump(out, f)
if __name__ == '__main__':
    # assume the clus_1.db stores the top-level clustering results
    with open("clus_1.db") as f:
        clus_1 = cPickle.load(f)
    no_all = len(clus_1)
    # split top clusters into 5 roughly equal pieces
    l = no_all / 5 + 1
    for i in xrange(5):
        st = i * l
        ed = (i + 1) * l
        # Clamp to no_all: the slice end in clus_sub is exclusive, so the
        # original clamp to no_all - 1 silently dropped the last top-level
        # cluster from processing.
        if ed > no_all:
            ed = no_all
        # run clustering heuristics
        clus_sub(clus_1, st, ed, i)
    # merge clustering results
    cluster_no()
    reverse_db()