src/files/tools/visualizer

#!/usr/bin/python3
# -*- coding: UTF-8 -*-
from pbox import *
from tinyscript import *


# Script metadata (double-underscore names); presumably consumed by tinyscript when building the help message.
__script__      = "Visualization tool"
__version__     = "2.0.5"
__contributors__ = [
    {'author': "Romain Jennes", 'reason': "added visualizations for observing effects of alterations"},
]
__doc__         = """
This tool aims to generate representations of binaries, emphasizing their sections and plotting their entropy,
 for the sake of comparing them, e.g. an original PE file and its packed versions.
"""
__description__ = "Generate a figure with representations of input binaries' sections and entropy"
# Usage examples, one per command.
# NOTE(review): the "compare" example passes four positionals while the parser below only declares "pattern1" and
#  "pattern2"; it looks like it predates the two-pattern interface -- confirm and update.
__examples__    = [
    "compare \"PsExec.exe$\" samples-folder \"PsExec.exe$\" altered-samples-folder",
    "features . dataset-packed-pe --label Exe32Pack",
    # the option declared on the "find" subparser is -d/--no-display
    "find . dataset-packed-pe --max-not-matching 2 --exclude outliers --no-display",
    "plot \"PsExec.exe$\" PackingData -l not-packed -l MEW -l NSPack -l RLPack -l UPX",
]


def find_files_with_labels(folder, pattern, selected=None, nbr_not_selected=None, display=False, force_display=False,
                           remove=False, filter_selected=True, exclude=None):
    """ Find files matching the given filename pattern within the specified folder. It uses the names of the leaf
         subfolders as labels for categorizing the matching files.
    
    For instance:
        FOLDER
          +-- not-packed
          +-- packed
                +-- ASPack
                +-- UPX
                +-- WinUPack
        
        if a file matches in every subfolder, we will have (filename => label):
            [filename-from-not-packed] => Original
            [filename-from-packed/ASPack] => ASPack
            [...]
        (displayed labels go through the LABELS mapping ; presumably "not-packed" is aliased to "Original" there)
    
    For each label, the most frequent prefix shared by consecutive filenames of its folder is computed ; this prefix
     is stripped from matching filenames so that versions of the same file in different label folders get the same key.
    
    NB: files that do not pass the selected labels / nbr_not_selected checks are only left out of the logged results
         and of the count driving the "No match" warning ; they are NOT removed from the returned dictionary.
    
    :param folder:           source folder (walked recursively ; .git and non-leaf subfolders are ignored)
    :param pattern:          regular expression searched in the filename of each file
    :param selected:         list of labels to be filtered (keeps the order) ; None means every available label ;
                              a label only differing by its case from an existing one is fixed IN PLACE in this list
    :param nbr_not_selected: maximum number of available labels a file may be missing ; None disables this check
    :param display:          log, per valid file, the success/failure status of each available label
    :param force_display:    log valid filenames at INFO level (with their label count) even if display is False
    :param remove:           log each filename at INFO level while filtering (only applies when filter_selected)
    :param filter_selected:  restrict the returned files and labels to the selected labels, in the selected order
    :param exclude:          list of folder names ; any path holding one of them as a component is ignored
    :return:                 dictionary {stripped filename: {'files': [paths], 'labels': [labels]}}
    
    Exits with status 1 (after logging the bad labels) if a selected label matches no labelled folder.
    """
    prefixes, available_labels, files, disp_all = {}, [], {}, selected is None
    excluded, leaf_cache = tuple(exclude or ()), {}
    
    def _discarded(parts, directory):
        # discard .git/*, excluded folders and non-leaf subfolders of the folder tree structure
        if ".git" in parts or any(s in parts for s in excluded):
            return True
        # a directory is listed only once, not once per file it holds
        key = str(directory)
        if key not in leaf_cache:
            leaf_cache[key] = len(list(directory.listdir(filter_func=lambda x: x.is_dir()))) == 0
        return not leaf_cache[key]
    
    # collect (best matching) prefixes first
    for dp in ts.Path(folder).walk(filter_func=lambda x: x.is_dir()):
        if _discarded(dp.parts, dp):
            continue
        counts, prev = {}, None
        for f in dp.listdir(filter_func=lambda x: x.is_file()):
            stem = f.stem
            # common prefix with the previous file (the first file is compared with itself)
            common = ""
            for c1, c2 in zip(prev or stem, stem):
                if c1 != c2:
                    break
                common += c1
            counts[common] = counts.get(common, 0) + 1
            prev = stem
        # only folders holding at least one file define a label ; ties go to the prefix seen first
        if counts:
            prefixes[dp.basename] = max(counts, key=counts.get)
    # check for label errors
    if selected is not None:
        error = False
        for l in selected[:]:
            if l in prefixes:
                continue
            # tolerate a wrong case by replacing the label, in place, with the existing one
            good = next((g for g in prefixes if g.lower() == l.lower()), None)
            if good is None:
                logger.error("Bad label: %s" % l)
                error = True
            else:
                logger.debug("Fixed label %s to %s" % (l, good))
                selected[selected.index(l)] = good
        if error:
            sys.exit(1)
    # collect matching files
    for fp in ts.Path(folder).walk(filter_func=lambda x: x.is_file()):
        if _discarded(fp.parts, fp.dirname):
            continue
        l = fp.dirname.basename
        if l not in available_labels:
            available_labels.append(l)
        if re.search(pattern, fp.filename):
            # only strip the label's prefix when the filename actually starts with it ; blindly slicing
            #  len(prefix) characters would mangle the key of files that do not follow the naming of their folder
            prefix = prefixes.get(l, "")
            fname = fp.filename[len(prefix):] if fp.filename.startswith(prefix) else fp.filename
            entry = files.setdefault(fname, {'files': [], 'labels': []})
            entry['labels'].append(l)
            entry['files'].append(fp)
    if selected is None:
        selected = sorted(available_labels)
    len_al, sorted_labels = len(available_labels), sorted(available_labels)
    # count (and display) files that have enough positive labels and that have all the selected labels
    n, disp = 0, display or force_display
    for file, data in files.items():
        # set booleans telling, for each available label, whether the file has a version with this label
        labels = {label: label in data['labels'] for label in sorted_labels}
        n_good = sum(labels.values())
        # skip files for which either a selected label is missing or there are too few positive labels
        if (not disp_all and any(not labels.get(l, False) for l in selected)) or \
           (nbr_not_selected is not None and n_good + nbr_not_selected < len_al):
            continue
        # display valid results
        if disp:
            file += " (%d/%d)" % (n_good, len_al)
        (logger.info if disp else logger.debug)(file)
        if display:
            for label, found in labels.items():
                (logger.success if found else logger.failure)(LABELS.get(label, label))
        n += 1
    if filter_selected:
        # rebuild each entry with the selected labels only, in the order of the selection
        nfiles = {}
        for file, data in files.items():
            if remove:
                logger.info(file)
            ndata = {'files': [], 'labels': []}
            for label in selected:
                try:
                    idx = data['labels'].index(label)
                except ValueError:
                    logger.warning("label '%s' not found" % label)
                    continue
                ndata['files'].append(data['files'][idx])
                ndata['labels'].append(label)
            nfiles[file] = ndata
        files = nfiles
    if n == 0:
        logger.warning("No match")
    return files


if __name__ == "__main__":
    # command-line interface: one subparser per command
    sparsers = parser.add_subparsers(dest="command", metavar="CMD", title="positional argument",
                                     help="command to be executed")
    compare = sparsers.add_parser("compare", category="visualization", help="compare files from two sources")
    #compare = add_argument(sparsers.add_parser("compare", category="visualization", help="compare files from two "
    #                                           "sources"), "folder", fmt=True, extended=True)
    compare.add_argument("pattern1", help="pattern for the path of the first file(s) to be compared")
    compare.add_argument("pattern2", help="pattern for the path of the second file(s) to be compared")
    compare.add_argument("-l1", "--legend1", help="legend for the first binary")
    compare.add_argument("-l2", "--legend2", help="legend for the second binary")
    compare.add_argument("-o", "--out-file", metavar="FILE", help="output file pattern to save the image(s) to")
    compare.add_argument("--text", action="store_true", help="textual comparison")
    compgrp = compare.add_mutually_exclusive_group()
    compgrp.add_argument("-t", "--title", help="set plot title")
    compgrp.add_argument("--no-title", action="store_true", help="do not generate a main title")
    feat = add_argument(sparsers.add_parser("features", category="visualization", help="compute features for files "
                                            "matching the regex with the input labels"), "folder")
    feat.add_argument("-r", "--raw", action="store_true", help="display raw features only (not transformed ones)")
    find = add_argument(sparsers.add_parser("find", category="utils", help="find files matching the regex with the "
                                            "input labels"), "folder")
    # NB: "store_false" means args.no_display is True by default and becomes False when the option is given
    find.add_argument("-d", "--no-display", action="store_false", help="do not display packer label matches")
    find.add_argument("-x", "--exclude", nargs="*", help="folder to be excluded")
    plot = add_argument(sparsers.add_parser("plot", category="visualization", help="plot files matching the regex given"
                                            " the selected labels"), "folder", alias=True, fmt=True)
    plot.add_argument("--legend-location", default="lower-center", type=legend_location,
                      help="hyphen-separated pair of locators for the plot's legend")
    plot.add_argument("-n", "--no-title", action="store_true", help="do not generate a title")
    plot.add_argument("-s", "--scale", action="store_true", help="scale plots according to the first binary")
    rm = add_argument(sparsers.add_parser("remove", category="utils", help="remove files matching the regex with the "
                                          "input labels"), "folder")
    initialize(noargs_action="usage", autocomplete=True)
    configure_logging(args.verbose)
    # every command but "compare" relies on find_files_with_labels ; prepare its positional arguments and the
    #  common tail of the log message
    if args.command != "compare":
        ff_args = args.folder, args.filename, args.label, getattr(args, "max_not_matching", None)
        msg = " files from %s matching '%s'%s..." % (args.folder, args.filename,
                                                     "" if args.label is None else " with the selected labels")
    # register label aliases, if any was provided (only "plot" is declared with alias=True)
    LABELS.update(getattr(args, "alias", None) or {})
    if args.command == "features":
        logger.info("Computing features of" + msg)
        for file, data in find_files_with_labels(*ff_args).items():
            logger.info(file)
            # one row per (file, label) pair, opened as a table from a temporary file
            files = []
            for f, l in zip(data['files'], data['labels']):
                exe = Executable(f)
                row = {'path': str(f), 'label': LABELS.get(l, l)}
                row.update(exe.rawdata if args.raw else exe.data)
                files.append(row)
            with data_to_temp_file(pd.DataFrame(files), prefix="visualizer-features-") as tmp:
                edit_file(tmp, logger=logger)
    elif args.command == "find":
        logger.info("Searching for" + msg)
        # only called for its logging side effects ; the returned dictionary is not used
        find_files_with_labels(*ff_args, display=args.no_display, exclude=args.exclude, force_display=True,
                               filter_selected=False)
    elif args.command == "plot":
        from bintropy import plot
        logger.info("Collecting" + msg)
        for file, data in find_files_with_labels(*ff_args).items():
            files, labels = data['files'], data['labels']
            labels = [LABELS.get(l, l) for l in labels]
            imgn = ts.Path(file).stem
            logger.info("Making plot to %s.%s..." % (imgn, args.format))
            for f in files:
                logger.debug(f)
            plot(*files, img_name=imgn, labels=labels, sublabel="size-ep-ent", logger=logger, **vars(args))
    elif args.command == "compare":
        logger.info("Comparing files matching pattern '%s' to those matching pattern '%s'..." % \
                    (args.pattern1, args.pattern2))
        # pair files from both sources ; zip stops at the shortest one, hence count the pairs actually compared
        pairs = list(zip(find_files_in_folder(args.pattern1), find_files_in_folder(args.pattern2)))
        n = len(pairs)
        for i, (f1, f2) in enumerate(pairs):
            if args.text:
                print(Executable.diff_text(str(f1), str(f2), logger=logger, **vars(args)))
            else:
                if args.out_file is None:
                    imgn = "%s_%s.png" % (f1.stem, f2.stem)
                elif n > 1:
                    # several comparisons: number the output files so that they do not overwrite each other
                    fp = ts.Path(args.out_file)
                    stem, ext = fp.dirname.joinpath(fp.stem), fp.extension
                    imgn = "%s_%d%s" % (stem, i+1, ext)
                else:
                    # single comparison: use the output file exactly as given
                    imgn = args.out_file
                logger.info("Making plot to %s..." % imgn)
                Executable.diff_plot(str(f1), str(f2), img_name=imgn, logger=logger, **vars(args))
    elif args.command == "remove":
        logger.info("Removing" + msg)
        for file, data in find_files_with_labels(*ff_args, remove=True).items():
            # ask for confirmation once per matching filename, then remove all its labelled versions
            if ts.confirm("Remove '%s' ?" % file):
                for f in data['files']:
                    f.remove()