-
Notifications
You must be signed in to change notification settings - Fork 7
/
plot_scatter.py
214 lines (189 loc) · 8.39 KB
/
plot_scatter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import numpy
from matplotlib import pyplot
from matplotlib.backends.backend_pdf import PdfPages
import os, sys, re
from collections import defaultdict
import argparse
import common, dot
from nltk.stem.snowball import *
"""Read the program similarity result files and plot text similarity vs. program similarity"""
def compute_method_text_similarity(m1_full_str, m2_full_str, name_re, camel_re, stemmer):
    """Score the textual similarity of two fully-qualified method names.

    Pipeline:
      (0) extract just the method name from the full signature string
      (1) remove digits, '$' and '_' from the name
      (2) split the cleaned name on camel-case boundaries
      (3) stem every word
      (4) count matched stemmed characters; a duplicated word matches at
          most min(count on either side) times
      (5) score = matched chars / total stemmed chars of both names

    Args:
        m1_full_str, m2_full_str: full signatures, e.g. "<pkg.Cls: void setFoo(int)>".
        name_re: compiled pattern from compile_method_re_pattern().
        camel_re: compiled pattern from compile_camel_case_re_pattern().
        stemmer: any object with a .stem(word) -> str method.

    Returns:
        float in [0, 1]. Returns 0.0 when both names stem to nothing
        (e.g. names made purely of digits/underscores), which previously
        raised ZeroDivisionError.
    """
    # (0): strip package/class/return-type decoration
    m1_method_name = get_method_name_only(m1_full_str, name_re)
    m2_method_name = get_method_name_only(m2_full_str, name_re)
    # (1): raw strings so "\d" is a regex class, not a string escape
    #      (non-raw "\d" is a SyntaxWarning on modern Python)
    m1_method_clean = re.sub(r"[\d$_]", "", m1_method_name)
    m2_method_clean = re.sub(r"[\d$_]", "", m2_method_name)
    # (2):
    m1_word_lst = get_method_word_list(m1_method_clean, camel_re)
    m2_word_lst = get_method_word_list(m2_method_clean, camel_re)
    # (3):
    m1_stemmed_word_lst = [stemmer.stem(w) for w in m1_word_lst]
    m2_stemmed_word_lst = [stemmer.stem(w) for w in m2_word_lst]
    # (4): per-side multiplicities so duplicate words are not over-counted
    m1_word_dict = defaultdict(int)
    m2_word_dict = defaultdict(int)
    for w1 in m1_stemmed_word_lst:
        m1_word_dict[w1] += 1
    for w2 in m2_stemmed_word_lst:
        m2_word_dict[w2] += 1
    common_word_set = set(m1_stemmed_word_lst) & set(m2_stemmed_word_lst)
    common_word_len = 0
    for wd in common_word_set:
        # *2 because a matched word contributes its characters on both sides
        common_word_len += len(wd) * 2 * min(m1_word_dict[wd], m2_word_dict[wd])
    # (5):
    total_len = (sum(len(w) for w in m1_stemmed_word_lst)
                 + sum(len(w) for w in m2_stemmed_word_lst))
    if total_len == 0:
        # both names vanished after cleaning/stemming; define similarity as 0
        return 0.0
    return float(common_word_len) / total_len
def get_method_word_list(method_str, camel_re):
    """Return the camel-case word chunks of *method_str*, in order."""
    return [match.group(0) for match in camel_re.finditer(method_str)]
def compile_camel_case_re_pattern():
    """Compile a pattern whose finditer() yields camel-case word chunks.

    Word boundaries are lower->Upper transitions and the end of an
    all-caps run followed by a capitalized word (e.g. "HTTPResponse").
    """
    boundary = r".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)"
    return re.compile(boundary)
def compile_method_re_pattern():
    """Compile the pattern that extracts the bare method name (group 1) from
    a full signature of the form "<pkg.Class: ret.Type name(arg,...)>".

    The optional angle brackets around the name (<*...>*) accommodate
    constructors such as <init>; the quote in the character class
    accommodates quoted names such as 'to'.
    """
    return re.compile(r"<[\w\d_$\.]+\s*:\s+[\w\d_$.\[\]]+\s+<*([\w\d_$\']+)>*\([\[\].\w\d_$\,\s]*\)>")
def get_method_name_only(method_full_str, re_prog):
    """Extract the bare method name from a full signature string.

    Args:
        method_full_str: signature such as the examples below.
        re_prog: compiled pattern from compile_method_re_pattern().

    Returns:
        The method name captured by group 1 of *re_prog*.

    Exits the process with a nonzero status when the pattern does not match
    (the previous code exited with status 0, which masked the failure from
    shells and build scripts).
    """
    #Example1: <org.dyn4j.dynamics.joint.RevoluteJoint: void setMotorEnabled(boolean)>
    #Example2: <com.flowpowered.react.math.Quaternion: float lengthSquare()>
    #Example3: <com.flowpowered.react.math.Quaternion: void <init>(float,float,float,float)>
    #Example4: <org.dyn4j.dynamics.joint.MotorJoint: java.lang.String toString()>
    #Example5: <org.dyn4j.dynamics.Body: java.util.List removeFixtures(org.dyn4j.geometry.Vector2)>
    #Example6: <com.jme3.material.plugins.ShaderNodeLoaderDelegate: com.jme3.shader.VariableMapping parseMapping(com.jme3.util.blockparser.Statement,boolean[])>
    #Example7: <org.dyn4j.geometry.Polygon: org.dyn4j.geometry.Vector2[] getAxes(org.dyn4j.geometry.Vector2[],org.dyn4j.geometry.Transform)>
    #Example8: <org.dyn4j.geometry.Vector3: org.dyn4j.geometry.Vector3 'to'(double,double,double)>
    m = re_prog.match(method_full_str)
    if m:
        return m.group(1)
    else:
        print("Should always find a method name. The fully qualified method name was:")
        print(method_full_str)
        sys.exit(1)
def create_stemmer():
    """Build the English Snowball stemmer used for name-word stemming."""
    stemmer = SnowballStemmer("english")
    return stemmer
def stem_word_lst(stemmer, word_lst):
    """Apply stemmer.stem to every word, preserving input order."""
    return list(map(stemmer.stem, word_lst))
def get_dot_method_map(proj_lst):
    """Map each project's dot-file paths to their method names.

    For every project and output directory, reads the tab-separated method
    file (method name, dot file name per line) and resolves each dot file
    name to its full path via the dot module.
    """
    dot_method_map = {}
    for proj in proj_lst:
        for output_dir in dot.dot_dirs(proj):
            method_file = dot.get_method_path(proj, output_dir)
            with open(method_file, "r") as mf:
                for raw_line in mf:
                    fields = raw_line.rstrip().split("\t")
                    dot_path = dot.get_dot_path(proj, output_dir, fields[1])
                    dot_method_map[dot_path] = fields[0]
    return dot_method_map
def parse_result_file(result_file, dot_method_map):
    """Parse one similarity result file into a per-method score dictionary.

    file format:
    path_to_dotA:
    path_to_similar_dot1 , score
    ...
    path_to_similar_dot5 , score
    path_to_dotB:
    ...

    Each query method lists 5 similar methods; only the first (most similar)
    is kept. Returns {method: [similar_method, prog_score, text_score]}.
    """
    method_dict = {} # method_dict[method] = [similar_method, prog_score, text_score]
    stemmer = create_stemmer()
    name_re = compile_method_re_pattern()
    camel_re = compile_camel_case_re_pattern()
    count = 0          # position within the current method's group of 5 matches
    current_dot = None
    with open(result_file, "r") as fi:
        for line in fi:
            line = line.rstrip('\n')
            # a line ending in ":" starts a new query method's group
            if len(line)>0 and line[-1]==":":
                current_dot = line[:-1]
                current_method = dot_method_map[current_dot]
            else:
                linarr = line.split(" , ")
                # similar-match lines carry a ".dot" path before " , score"
                if linarr[0][-3:]=="dot":
                    # consider most similar method only
                    if count == 0:
                        similar_method = dot_method_map[linarr[0]]
                        # compute word based similarity
                        prog_score = float(linarr[1])
                        text_score = compute_method_text_similarity(current_method, similar_method, name_re, camel_re, stemmer)
                        method_dict[current_method] = [similar_method, prog_score, text_score]
                    count += 1
                    if count == 5:
                        # group of 5 finished; reset for the next query method
                        count = 0
    return method_dict
def plot_scatter(x, x_axis_label, y, y_axis_label, fig_file, title=""):
    """Scatter-plot y against x on the unit square and save to <fig_file>.pdf.

    Args:
        x, y: sequences of scores, expected in [0, 1].
        x_axis_label, y_axis_label: axis captions.
        fig_file: output path WITHOUT the ".pdf" extension.
        title: optional figure title.
    """
    pyplot.figure()
    pyplot.scatter(x, y, marker='x', alpha=0.5)
    pyplot.title(title)
    pyplot.xlabel(x_axis_label)
    pyplot.ylabel(y_axis_label)
    # pad the [0,1] range slightly so points at exactly 0 or 1 are not clipped
    pyplot.xlim(-0.05, 1.05)
    pyplot.ylim(-0.05, 1.05)
    pp = PdfPages(fig_file + ".pdf")
    try:
        pyplot.savefig(pp, format="pdf")
    finally:
        # close the PDF even if savefig raises, and close the figure:
        # the caller invokes this once per project, and matplotlib keeps
        # every figure alive (accumulating memory) until explicitly closed
        pp.close()
        pyplot.close()
def main():
    """Parse CLI arguments, then plot program vs. name similarity per project."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--cluster", required=True, type=str, help="path to the result folder with relabeling")
    parser.add_argument("-f", "--fig", type=str, help="path to the figure folder")
    parser.add_argument("-s", "--strategy", required=True, type=str, help="name of the strategy")
    args = parser.parse_args()
    proj_lst = common.LIMITED_PROJECT_LIST
    # figure directory defaults to "<strategy>_scatter" unless -f is given
    fig_dir = args.fig if args.fig else args.strategy + "_scatter"
    common.mkdir(fig_dir)
    dot_method_map = get_dot_method_map(proj_lst)
    for proj in proj_lst:
        print(proj + ":")
        result_path = os.path.join(args.cluster, proj + "_result.txt")
        method_dict = parse_result_file(result_path, dot_method_map)
        xs = []
        ys = []
        count11 = 0
        for similar_method, prog_score, text_score in method_dict.values():
            xs.append(prog_score)
            ys.append(text_score)
            # count methods scoring (almost) exactly (1, 1) on both axes
            if abs(1.0 - prog_score) < 0.0005 and abs(1.0 - text_score) < 0.0005:
                count11 += 1
        print("(1,1): {0}".format(count11))
        # persist the raw data points next to the figure
        with open(os.path.join(fig_dir, proj + "_data.txt"), "w") as df:
            df.write(",".join(str(x) for x in xs))
            df.write("\n")
            df.write(",".join(str(y) for y in ys))
        plot_scatter(xs, "semantic similarity", ys, "name similarity", os.path.join(fig_dir, proj), proj + " : " + args.strategy)
        # report the Pearson correlation between the two similarity measures
        print(numpy.corrcoef(xs, ys))
        print("\n")

if __name__ == "__main__":
    main()