-
Notifications
You must be signed in to change notification settings - Fork 10
/
visualizer
executable file
·234 lines (224 loc) · 11.8 KB
/
visualizer
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
from pbox import *
from tinyscript import *
# Script metadata. NOTE(review): these dunders are presumably picked up by tinyscript's initialize() to build the
#  help message and version banner -- confirm against the tinyscript documentation.
__script__ = "Visualization tool"
__version__ = "2.0.5"
# people who contributed to this tool, each with the reason for the credit
__contributors__ = [
{'author': "Romain Jennes", 'reason': "added visualizations for observing effects of alterations"},
]
# long description of the tool
__doc__ = """
This tool aims to generate representations of binaries, emphasizing their sections and plotting their entropy,
for the sake of comparing them, e.g. an original PE file and its packed versions.
"""
# one-line description of the tool
__description__ = "Generate a figure with representations of input binaries' sections and entropy"
# usage examples, one per sub-command (command-line arguments only, without the program name)
__examples__ = [
"compare \"PsExec.exe$\" samples-folder \"PsExec.exe$\" altered-samples-folder",
"features . dataset-packed-pe --label Exe32Pack",
"find . dataset-packed-pe --max-not-matching 2 --exclude outliers --do-not-display",
"plot \"PsExec.exe$\" PackingData -l not-packed -l MEW -l NSPack -l RLPack -l UPX",
]
def find_files_with_labels(folder, pattern, selected=None, nbr_not_selected=None, display=False, force_display=False,
                           remove=False, filter_selected=True, exclude=None):
    """ Find files matching the given filename pattern within the specified folder. It uses subfolders' names as labels
         for categorizing the matching file.

    For instance:
      FOLDER
        +-- not-packed
        +-- packed
              +-- ASPack
              +-- UPX
              +-- WinUPack

     if a file matches in every subfolder, we will have (filename => label):
      [filename-from-not-packed]    => Original
      [filename-from-packed/ASPack] => ASPack
      [...]

    Only leaf subfolders (i.e. those holding no subfolder) are considered ; anything under .git is ignored.

    NB: if a selected label cannot be matched (even case-insensitively) with a subfolder's name, an error is logged and
         the script exits with code 1.

    :param folder:           source folder
    :param pattern:          pattern for matching files (regular expression searched in the filename)
    :param selected:         list of labels to be filtered (keeps the order) ; None means every available label
                              (the input list is left untouched, case fixes are applied to a copy)
    :param nbr_not_selected: maximum number of available labels that a file may miss for being counted as a match ;
                              None disables this check
    :param display:          log, for each matching file, every available label as a success or a failure
    :param force_display:    log the names of the matching files at info level, even when display is False
    :param remove:           log every collected filename at info level while filtering on the selected labels
    :param filter_selected:  restrict the returned entries to the selected labels, in the order of the selection
    :param exclude:          list of path components (e.g. folder names) to be excluded from the search
    :return:                 dictionary of the form {trimmed filename: {'files': [paths], 'labels': [labels]}}
    """
    disp_all, excluded = selected is None, exclude or []

    def _skipped(path, directory):
        # discard .git/*, excluded items and non-leaf subfolders of the folder tree structure
        return ".git" in path.parts or any(s in path.parts for s in excluded) or \
               len(list(directory.listdir(filter_func=lambda x: x.is_dir()))) > 0

    # collect (best matching) prefixes first, that is, for each leaf subfolder, the most frequent common prefix between
    #  consecutive file stems (the very first file is compared with itself, so its whole stem counts once)
    # NOTE(review): with very few files per folder, the whole stem of the first listed file can win ; to be confirmed
    #  that datasets are always large enough for this heuristic
    prefixes = {}
    for dp in ts.Path(folder).walk(filter_func=lambda x: x.is_dir()):
        if _skipped(dp, dp):
            continue
        prev, counts = None, {}
        for f in dp.listdir(filter_func=lambda x: x.is_file()):
            common = ""
            for c1, c2 in zip(prev or f.stem, f.stem):
                if c1 != c2:
                    break
                common += c1
            counts[common] = counts.get(common, 0) + 1
            prev = f.stem
        if counts:
            prefixes[dp.basename] = max(counts, key=counts.get)
    # check for label errors ; work on a copy so that fixing the case never alters the caller's list
    if selected is not None:
        selected, error = list(selected), False
        for i, label in enumerate(selected):
            if label in prefixes:
                continue
            good = next((p for p in prefixes if p.lower() == label.lower()), None)
            if good is None:
                logger.error("Bad label: %s" % label)
                error = True
            else:
                logger.debug("Fixed label %s to %s" % (label, good))
                selected[i] = good
        if error:
            sys.exit(1)
    # collect matching files, grouped by their filename stripped from the prefix of their label
    files, available_labels = {}, []
    for fp in ts.Path(folder).walk(filter_func=lambda x: x.is_file()):
        if _skipped(fp, fp.dirname):
            continue
        label = fp.dirname.basename
        if label not in available_labels:
            available_labels.append(label)
        if re.search(pattern, fp.filename):
            # the prefix is stripped by length only (no check that the filename actually starts with it)
            entry = files.setdefault(fp.filename[len(prefixes[label]):], {'files': [], 'labels': []})
            entry['labels'].append(label)
            entry['files'].append(fp)
    sorted_labels = sorted(available_labels)
    if selected is None:
        selected = sorted_labels
    len_al = len(available_labels)
    # count (and optionally display) the files that have enough positive labels and that have all the selected labels
    # NB: this loop only drives the display and the match counter ; it does not remove anything from the result
    n, disp = 0, display or force_display
    for file, data in files.items():
        # set booleans for files having their label match one of the available labels
        labels = {label: label in data['labels'] for label in sorted_labels}
        # skip files for which either there is a non matching label in the selected ones or too few positive labels
        n_good = sum(labels.values())
        if not disp_all and any(not labels.get(l, False) for l in selected) or \
           nbr_not_selected is not None and n_good + nbr_not_selected < len_al:
            continue
        # display valid results
        if disp:
            file += " (%d/%d)" % (n_good, len_al)
        (logger.info if disp else logger.debug)(file)
        if display:
            for label, found in labels.items():
                (logger.success if found else logger.failure)(LABELS.get(label, label))
        n += 1
    # keep only the selected labels, in the order of the selection
    if filter_selected:
        nfiles = {}
        for file, data in files.items():
            if remove:
                logger.info(file)
            ndata = {'files': [], 'labels': []}
            for label in selected:
                try:
                    ndata['files'].append(data['files'][data['labels'].index(label)])
                    ndata['labels'].append(label)
                except ValueError:
                    logger.warning("label '%s' not found" % label)
            nfiles[file] = ndata
        files = nfiles
    if n == 0:
        logger.warning("No match")
    return files
if __name__ == "__main__":
    # --- command-line interface: one subparser per command ---
    sparsers = parser.add_subparsers(dest="command", metavar="CMD", title="positional argument",
                                     help="command to be executed")
    compare = sparsers.add_parser("compare", category="visualization", help="compare files from two sources")
    #compare = add_argument(sparsers.add_parser("compare", category="visualization", help="compare files from two "
    #                                           "sources"), "folder", fmt=True, extended=True)
    compare.add_argument("pattern1", help="pattern for the path of the first file(s) to be compared")
    compare.add_argument("pattern2", help="pattern for the path of the second file(s) to be compared")
    compare.add_argument("-l1", "--legend1", help="legend for the first binary")
    compare.add_argument("-l2", "--legend2", help="legend for the second binary")
    compare.add_argument("-o", "--out-file", metavar="FILE", help="output file pattern to save the image(s) to")
    compare.add_argument("--text", action="store_true", help="textual comparison")
    compgrp = compare.add_mutually_exclusive_group()
    compgrp.add_argument("-t", "--title", help="set plot title")
    compgrp.add_argument("--no-title", action="store_true", help="do not generate a main title")
    # NOTE(review): add_argument(..., "folder") presumably defines the folder, filename and label arguments read
    #  below (plus max_not_matching / alias / format for some commands) -- confirm against pbox
    feat = add_argument(sparsers.add_parser("features", category="visualization", help="compute features for files "
                                            "matching the regex with the input labels"), "folder")
    feat.add_argument("-r", "--raw", action="store_true", help="display raw features only (not transformed ones)")
    find = add_argument(sparsers.add_parser("find", category="utils", help="find files matching the regex with the "
                                            "input labels"), "folder")
    # store_false: args.no_display is True unless -d is given, so it can be passed as is as the 'display' flag
    find.add_argument("-d", "--no-display", action="store_false", help="do not display packer label matches")
    find.add_argument("-x", "--exclude", nargs="*", help="folder to be excluded")
    plot = add_argument(sparsers.add_parser("plot", category="visualization", help="plot files matching the regex given"
                                            " the selected labels"), "folder", alias=True, fmt=True)
    plot.add_argument("--legend-location", default="lower-center", type=legend_location,
                      help="hyphen-separated pair of locators for the plot's legend")
    plot.add_argument("-n", "--no-title", action="store_true", help="do not generate a title")
    plot.add_argument("-s", "--scale", action="store_true", help="scale plots according to the first binary")
    rm = add_argument(sparsers.add_parser("remove", category="utils", help="remove files matching the regex with the "
                                          "input labels"), "folder")
    initialize(noargs_action="usage", autocomplete=True)
    configure_logging(args.verbose)
    # --- common preparation for the commands relying on find_files_with_labels ---
    if args.command != "compare":
        ff_args = args.folder, args.filename, args.label, getattr(args, "max_not_matching", None)
        msg = " files from %s matching '%s'%s..." % (args.folder, args.filename,
                                                     "" if args.label is None else " with the selected labels")
        # register the label aliases, if the command defines any
        LABELS.update(getattr(args, "alias", None) or {})
    # --- command dispatch ---
    if args.command == "features":
        logger.info("Computing features of" + msg)
        for file, data in find_files_with_labels(*ff_args).items():
            logger.info(file)
            # one row per (file, label) pair ; the resulting table is opened for edition for each matching filename
            files = []
            for f, l in zip(data['files'], data['labels']):
                exe = Executable(f)
                row = {'path': str(f), 'label': LABELS.get(l, l)}
                row.update(exe.rawdata if args.raw else exe.data)
                files.append(row)
            with data_to_temp_file(pd.DataFrame(files), prefix="visualizer-features-") as tmp:
                edit_file(tmp, logger=logger)
    elif args.command == "find":
        logger.info("Searching for" + msg)
        # only the logging done by the search matters here, the returned dictionary is not used
        find_files_with_labels(*ff_args, display=args.no_display, exclude=args.exclude, force_display=True,
                               filter_selected=False)
    elif args.command == "plot":
        # NB: this rebinds the name 'plot', which is harmless as the subparser is not needed anymore at this point
        from bintropy import plot
        logger.info("Collecting" + msg)
        for file, data in find_files_with_labels(*ff_args).items():
            files, labels = data['files'], data['labels']
            labels = [LABELS.get(l, l) for l in labels]
            imgn = ts.Path(file).stem
            logger.info("Making plot to %s.%s..." % (imgn, args.format))
            for f in files:
                logger.debug(f)
            plot(*files, img_name=imgn, labels=labels, sublabel="size-ep-ent", logger=logger, **vars(args))
    elif args.command == "compare":
        logger.info("Comparing files matching pattern '%s' to those matching pattern '%s'..." % \
                    (args.pattern1, args.pattern2))
        # pair the files once, so that each source is scanned a single time and the count used for naming the output
        #  reflects the number of comparisons actually made (zip stops at the shortest source)
        pairs = list(zip(find_files_in_folder(args.pattern1), find_files_in_folder(args.pattern2)))
        n = len(pairs)
        for i, (f1, f2) in enumerate(pairs):
            if args.text:
                print(Executable.diff_text(str(f1), str(f2), logger=logger, **vars(args)))
            else:
                if args.out_file is None:
                    imgn = "%s_%s.png" % (f1.stem, f2.stem)
                elif n > 1:
                    # multiple comparisons: number the outputs so that they do not overwrite each other
                    fp = ts.Path(args.out_file)
                    stem, ext = fp.dirname.joinpath(fp.stem), fp.extension
                    imgn = "%s_%d%s" % (stem, i+1, ext)
                else:
                    # single comparison: use the output file exactly as given
                    imgn = args.out_file
                logger.info("Making plot to %s..." % imgn)
                Executable.diff_plot(str(f1), str(f2), img_name=imgn, logger=logger, **vars(args))
    elif args.command == "remove":
        logger.info("Removing" + msg)
        for file, data in find_files_with_labels(*ff_args, remove=True).items():
            # ask for confirmation once per filename, then remove its version for every selected label
            if ts.confirm("Remove '%s' ?" % file):
                for f in data['files']:
                    f.remove()