-
Notifications
You must be signed in to change notification settings - Fork 2
/
simRank_twostep.py
124 lines (85 loc) · 2.53 KB
/
simRank_twostep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python
#
# Author: cao4@illinois.edu (Liangliang Cao)
# mtsai2@illinois.edu (Min-Hsuan Tsai)
# zhenli3@illinois.edu (Zhen Li)
# Beckman Institute, University of Illinois, 2010-2011
#
import sys
# Multiplier applied by Reducer_simRank_step2 to the normalised score of
# every pair whose two nodes differ (presumably the SimRank decay
# factor -- TODO confirm).
C = 0.4
def load_adj(fn_adj='adj.txt', fn_len='inLen.txt'):
    """Load the adjacency lists and the per-node length table.

    Each non-blank line of the adjacency file is split on whitespace: the
    first token is the node id and the remaining tokens (possibly none) are
    stored as its neighbour list.  Each non-blank line of the length file is
    split the same way: the first token is the node id and the second token
    is parsed as a float.  Blank lines in either file are skipped.

    Args:
        fn_adj: path of the adjacency file.
        fn_len: path of the length file; defaults to "inLen.txt", the path
            that used to be hard-coded here.

    Returns:
        A tuple ``(adj, leng)``: ``adj`` maps node id -> list of neighbour
        ids, ``leng`` maps node id -> float.

    Raises:
        IOError / OSError: if either file cannot be opened.
        IndexError: if a non-blank line of the length file has one field.
        ValueError: if a length value cannot be parsed as a float.
    """
    adj = {}
    # ``with`` releases the handle even when parsing raises part-way through.
    with open(fn_adj, "r") as adj_file:
        for ln in adj_file:
            ele = ln.split()
            if not ele:
                # Blank line (e.g. a trailing newline): nothing to record.
                continue
            # The slice is simply empty for a node listed without neighbours.
            adj[ele[0]] = ele[1:]

    leng = {}
    with open(fn_len, "r") as len_file:
        for ln in len_file:
            ele = ln.split()
            if not ele:
                continue
            leng[ele[0]] = float(ele[1])

    return adj, leng
def get_adj():
    """Return the ``(adjacency, lengths)`` pair loaded from "adj.txt"."""
    # load_adj already hands back the two tables as a pair.
    return load_adj("adj.txt")
class Mapper_simRank_step1:
    """First mapper of a round: fan a pair's score out over ``adj[j]``.

    For an input record ``"i j s"`` it emits ``([i, b], s)`` once for every
    ``b`` stored in the adjacency entry of ``j``.
    """

    def __init__(self):
        # NOTE: code that read an exclusion set from "excludes.txt" used to
        # live here; it is disabled.
        # Load the adjacency lists and the length table.
        self.adj, self.len = get_adj()

    def __call__(self, key, value):
        """Yield ``([i, b], score)`` for each ``b`` in ``self.adj[j]``.

        ``key`` is ignored.  ``value`` must hold exactly three
        whitespace-separated fields: node ``i``, node ``j`` and the score,
        which is converted to a float.
        """
        first, second, raw_score = value.split()
        # (A special case emitting 1.0 for identical nodes is disabled here.)
        score = float(raw_score)
        for neighbour in self.adj[second]:
            yield [first, neighbour], score
class Mapper_simRank_step2:
    """Second mapper of a round: fan a pair's score out over ``adj[i]``.

    For an input record ``"i j s"`` it emits ``([a, j], s)`` once for every
    ``a`` stored in the adjacency entry of ``i``.
    """

    def __init__(self):
        # Load the adjacency lists and the length table.
        self.adj, self.len = get_adj()

    def __call__(self, key, value):
        """Yield ``([a, j], score)`` for each ``a`` in ``self.adj[i]``.

        ``key`` is ignored.  ``value`` must hold exactly three
        whitespace-separated fields: node ``i``, node ``j`` and the score,
        which is converted to a float.
        """
        first, second, raw_score = value.split()
        score = float(raw_score)
        for neighbour in self.adj[first]:
            yield [neighbour, second], score
class Reducer_simRank_step1:
    """First reducer of a round: total the scores sent to one node pair."""

    def __call__(self, key, values):
        """Yield ``('sim', "a b total")`` for the pair held in ``key``.

        ``key`` unpacks into the two node ids; every item of ``values`` is
        converted to a float and added to a running total that starts at 0.0.
        """
        left, right = key
        total = 0.0
        # Explicit left-to-right accumulation keeps the float rounding as is.
        for partial in values:
            total += float(partial)
        yield 'sim', ' '.join((left, right, str(total)))
class Reducer_simRank_step2:
    """Second reducer of a round: produce the new score of a node pair.

    A pair made of the same node twice always gets the literal score
    ``1.0``.  For any other pair the incoming values are summed, divided by
    the length entries of both nodes and multiplied by the module constant
    ``C``.
    """

    def __init__(self):
        # Both tables come from get_adj(); __call__ only reads ``self.len``.
        self.adj, self.len = get_adj()

    def __call__(self, key, values):
        """Yield ``('sim', "a b score")`` for the pair held in ``key``."""
        left, right = key
        if left == right:
            # Identical nodes: fixed score, the incoming values are not read.
            yield 'sim', ' '.join((left, right, '1.0'))
        else:
            total = 0.0
            for partial in values:
                total += float(partial)
            # Same operation order as before: divide twice, then scale by C.
            rescaled = total / self.len[left] / self.len[right] * C
            yield 'sim', ' '.join((left, right, str(rescaled)))
if __name__ == "__main__":
    import dumbo

    # Queue three rounds, each made of the two map/reduce steps in order,
    # then run the whole chain as one job.
    job = dumbo.Job()
    for _round in range(3):
        job.additer(Mapper_simRank_step1, Reducer_simRank_step1)
        job.additer(Mapper_simRank_step2, Reducer_simRank_step2)
    job.run()