-
Notifications
You must be signed in to change notification settings - Fork 32
/
digraph_numerical_features.py
executable file
·293 lines (243 loc) · 9.54 KB
/
digraph_numerical_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
##############################################################################
# #
# Code for the USENIX Security '22 paper: #
# How Machine Learning Is Solving the Binary Function Similarity Problem. #
# #
# MIT License #
# #
# Copyright (c) 2019-2022 Cisco Talos #
# #
# Permission is hereby granted, free of charge, to any person obtaining #
# a copy of this software and associated documentation files (the #
# "Software"), to deal in the Software without restriction, including #
# without limitation the rights to use, copy, modify, merge, publish, #
# distribute, sublicense, and/or sell copies of the Software, and to #
# permit persons to whom the Software is furnished to do so, subject to #
# the following conditions: #
# #
# The above copyright notice and this permission notice shall be #
# included in all copies or substantial portions of the Software. #
# #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE #
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION #
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION #
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #
# #
# digraph_numerical_features.py - Convert each function into a NetworkX #
# graph and associate each basic block to the corresponding features. #
# #
##############################################################################
import base64
import click
import itertools
import json
import networkx as nx
import numpy as np
import os
from collections import defaultdict
from multiprocessing import Pool
from scipy.sparse import coo_matrix
from tqdm import tqdm
# Number of numerical features
NUM_ACFG_FEATURES = 8
def create_graph(node_list, edge_list):
    """
    Create a NetworkX directed graph from the lists of nodes and edges.

    Args
        node_list: list of nodes (basic-block virtual addresses)
        edge_list: list of (src, dst) edges

    Return
        np.ndarray / np.matrix: adjacency matrix (int8)
        nx.DiGraph: NetworkX directed graph CFG
    """
    G = nx.DiGraph()
    G.add_nodes_from(node_list)
    # Drop edges that reference nodes outside this function's CFG.
    node_set = set(node_list)
    G.add_edges_from(
        (e[0], e[1]) for e in edge_list
        if e[0] in node_set and e[1] in node_set)
    nodes = list(G.nodes())
    # nx.to_numpy_matrix was deprecated and removed in NetworkX 3.0:
    # prefer to_numpy_array, keep the legacy call as a fallback so the
    # code runs with both old and new NetworkX versions.
    if hasattr(nx, "to_numpy_array"):
        adj_mat = nx.to_numpy_array(G, nodelist=nodes, dtype=np.int8)
    else:
        adj_mat = nx.to_numpy_matrix(G, nodelist=nodes, dtype=np.int8)
    return adj_mat, G
def get_n_offspring(G, bb_va):
    """
    Count the basic blocks reachable from `bb_va` in the CFG.

    Args
        G: nx.DiGraph CFG
        bb_va: basic block virtual address

    Return
        int: number of offspring (reachable nodes)
    """
    reachable = nx.descendants(G, bb_va)
    return len(reachable)
def chunks(ll, n):
    """
    Yield successive tuples of at most `n` items from the iterable `ll`.
    Source: https://networkx.org/documentation/stable/
    auto_examples/algorithms/plot_parallel_betweenness.html
    """
    iterator = iter(ll)
    while True:
        piece = tuple(itertools.islice(iterator, n))
        if not piece:
            return
        yield piece
def betweenness_centrality_parallel(G, processes=None):
    """
    Compute betweenness centrality in parallel: the source nodes are
    split into chunks and one starmap task is run per chunk.
    Source: https://networkx.org/documentation/stable/
    auto_examples/algorithms/plot_parallel_betweenness.html

    Args
        G: nx.DiGraph CFG
        processes: number of worker processes (None: one per CPU)

    Return
        dict: node -> betweenness centrality value
    """
    with Pool(processes=processes) as p:
        # Do not touch Pool._pool (a private CPython implementation
        # detail): derive the worker count from the arguments instead.
        # Pool(None) spawns os.cpu_count() workers, so this matches.
        n_workers = processes if processes else (os.cpu_count() or 1)
        node_divisor = n_workers * 4
        node_chunks = list(
            chunks(G.nodes(), int(G.order() / node_divisor) + 1))
        num_chunks = len(node_chunks)
        bt_sc = p.starmap(
            nx.betweenness_centrality_subset,
            zip(
                [G] * num_chunks,
                node_chunks,
                [list(G)] * num_chunks,
                [True] * num_chunks,
                [None] * num_chunks,
            ),
        )
    # Reduce the per-chunk partial solutions into a single dict
    bt_c = bt_sc[0]
    for bt in bt_sc[1:]:
        for n in bt:
            bt_c[n] += bt[n]
    return bt_c
def create_features_matrix(G, fva_data, num_processes):
    """
    Create the matrix with one row of numerical features per basic block.

    Args
        G: nx.DiGraph CFG
        fva_data: dict with the features associated to a function
        num_processes: number of parallel processes

    Return
        np.array: float32 matrix, one row per node, NUM_ACFG_FEATURES cols
    """
    f_mat = list()
    # Parallel betweenness only pays off on large CFGs
    if G.order() > 200:
        betweenness = betweenness_centrality_parallel(
            G, min(int(G.order() / 100), num_processes))
    else:
        betweenness = nx.betweenness_centrality(G)
    # Iterate over each BB. Rows follow G.nodes() order, the same
    # ordering used for the adjacency matrix in create_graph.
    for node_va in G.nodes():
        node_data = fva_data["basic_blocks"][str(node_va)]
        f_mat.append([
            # 'n_string_consts'
            node_data['n_string_consts'],
            # 'n_numeric_consts'
            node_data['n_numeric_consts'],
            # 'n_transfer_instrs'
            node_data['n_transfer_instrs'],
            # 'n_calls'
            node_data['n_call_instrs'],
            # 'n_instructions'
            node_data['n_instructions'],
            # 'n_arith_instrs'
            node_data['n_arith_instrs'],
            # 'n_offspring'
            get_n_offspring(G, node_va),
            # 'betweenness'
            betweenness[node_va]
        ])
    # Here I'm forcing the dtype to float32, to limit memory spaces.
    # If you want to change the type (e.g., use float64), be sure to
    # change the parsing method on the respective function.
    return np.array(f_mat, dtype=np.float32)
def np_to_str(np_mat):
    """
    Serialize a 2D numpy matrix into a custom "::"-separated string:
    stride0::stride1::rows::cols::base64(raw C-order bytes).

    Args
        np_mat: numpy matrix

    Return
        str: serialized matrix
    """
    stride0, stride1 = np_mat.strides
    n_rows, n_cols = np_mat.shape
    payload = base64.b64encode(np_mat.tobytes('C')).decode('ascii')
    return "::".join(
        str(field) for field in (stride0, stride1, n_rows, n_cols, payload))
def np_to_scipy_sparse(np_mat):
    """
    Convert the input numpy matrix to a scipy COO sparse matrix and
    serialize it as rows::cols::values::n_rows::n_cols (lists ';'-joined).

    Args
        np_mat: a Numpy matrix

    Return
        str: serialized adj matrix
    """
    sparse = coo_matrix(np_mat)
    # Custom string serialization
    fields = [
        ';'.join(str(v) for v in sparse.row),
        ';'.join(str(v) for v in sparse.col),
        ';'.join(str(v) for v in sparse.data),
        str(np_mat.shape[0]),
        str(np_mat.shape[1]),
    ]
    return "::".join(fields)
def create_functions_dict(input_folder, num_processes):
    """
    Build the per-IDB dict of serialized adjacency and feature matrices.

    Args
        input_folder: a folder with JSON files from IDA_acfg_features
        num_processes: number of parallel processes

    Return
        dict: {idb_path: {fva: {'adj_mat': str, 'features_mat': str}}};
            an empty dict on error (best-effort behavior)
    """
    try:
        functions_dict = defaultdict(dict)
        for f_json in tqdm(os.listdir(input_folder)):
            if not f_json.endswith(".json"):
                continue
            f_path = os.path.join(input_folder, f_json)
            with open(f_path) as f_in:
                jj = json.load(f_in)
            # Each JSON has a single top-level key: the IDB path
            idb_path = next(iter(jj))
            print("[D] Processing: {}".format(idb_path))
            j_data = jj[idb_path]
            # Iterate over each function
            for fva, fva_data in j_data.items():
                g_mat, G = create_graph(
                    fva_data['nodes'], fva_data['edges'])
                f_mat = create_features_matrix(G, fva_data, num_processes)
                functions_dict[idb_path][fva] = {
                    'adj_mat': np_to_scipy_sparse(g_mat),
                    'features_mat': np_to_str(f_mat)
                }
        return functions_dict
    except Exception as e:
        # Best-effort: keep the empty-dict fallback, but also print the
        # full traceback so the failing file/function can be located.
        import traceback
        print("[!] Exception in create_functions_dict\n{}".format(e))
        traceback.print_exc()
        return dict()
@click.command()
@click.option('-i', '--input-dir', required=True,
              help='IDA_acfg_features JSON files.')
@click.option('-p', '--num-processes',
              default=20,
              help='Maximum number of processes.')
@click.option('-o', '--output-dir', required=True,
              help='Output directory.')
def main(input_dir, output_dir, num_processes):
    """Convert each function's CFG + numerical features to JSON."""
    # makedirs(exist_ok=True) avoids the isdir/mkdir TOCTOU race and
    # also creates missing intermediate directories.
    os.makedirs(output_dir, exist_ok=True)
    o_dict = create_functions_dict(input_dir, num_processes)
    output_path = os.path.join(output_dir,
                               'digraph_numerical_features.json')
    with open(output_path, 'w') as f_out:
        json.dump(o_dict, f_out)


if __name__ == '__main__':
    main()