# gui.py

# GUI client (production side) for PE (Portable Executable) malware detection.

# IMPORTANT: please make sure both 'malware_models.pkl' and 'gui.py' are in the same folder.


import itertools
import pickle
import re

import numpy as np
import pandas as pd
import pefile                      #pip install pefile
import PySimpleGUI as sg           #pip install PySimpleGUI
import yara                        #pip install yara-python
from xgboost import XGBClassifier  #pip install xgboost==0.90

# Load the trained models and the feature-encoder data from the pickle file.
# The pickle is unpacked into five names; further down this script only uses
# `model_list` (objects whose .predict() is called) and `newlist` (the ordered
# feature names that fix the position of every value in the feature vector;
# presumably the training-time column list - verify against the training code).
# SECURITY NOTE: pickle.load can execute arbitrary code contained in the file,
# so only load a 'malware_models.pkl' that comes from a trusted source.
# The path is relative, i.e. it is resolved against the current working directory.
with open('malware_models.pkl','rb') as f:  # pickle data must be read in binary mode
    model_name, model_list, f1_score_all, accuracy_all, newlist = pickle.load(f)

  
sg.theme('Dark Blue 3')  # colour theme used by every window created below

# Ask the user for the PE file to analyse.
layout = [[sg.Text('Select PE File (.exe)')],
          [sg.Text('Path ', size=(15, 1)), sg.InputText(), sg.FileBrowse()],
          [sg.Submit(), sg.Cancel()]]

window = sg.Window('Select File...', layout)

event, values = window.read()
window.close()

# Stop right here when the dialog was cancelled/closed or the path was left
# empty.  Otherwise the script would either crash on `values[0]` (values is
# None when the window is closed) or go on to "analyse" an empty path and
# still display a verdict for it.
if event in (sg.WIN_CLOSED, 'Cancel') or not values or not values[0]:
    raise SystemExit('No file selected - nothing to analyse.')

file_path = values[0]  # the InputText element is the first input, i.e. key 0

# One-row frame: the 'hash' column holds the path of the file to scan; the
# feature-extraction helpers add their columns to this frame later on.
# (Model output convention used by the results view: 0 == benign, 1 == malware.)
test_dataset=pd.DataFrame([file_path],columns=['hash'])

#print(file_path)

# YARA configuration.
# Folder that holds the downloaded rule files (relative path).
rules_path = 'yara_rules/'

# Feature-name prefixes, one per rule set, in the same order as rules_list.
rules_names_list = ["peid_rules", "packer_rules", "crypto_rules",
                    "antidebug_antivm_rules", "capabilities_rules"]

# Compile every rule file once, in the order matching rules_names_list.
rules_list = [
    yara.compile(rules_path + rule_file)
    for rule_file in ('peid.yar', 'packer.yar', 'crypto_signatures.yar',
                      'antidebug_antivm.yar', 'capabilities.yar')
]

# Keep the individual module-level names available as well.
(peid_rules, packer_rules, crypto_rules,
 antidebug_antivm_rules, capabilities_rules) = rules_list

# Analysis helpers start here.
# Generate features from the YARA rule sets.
def add_feature_yara(dataframe,rules,rules_names):
    """Add one ``<name>_feature`` column per rule set to ``dataframe``.

    For every entry of ``rules`` each value of ``dataframe['hash']`` is scanned
    with ``rule_scan_hash``; the per-row results are stored, in row order, in a
    new column called ``rules_names[i] + '_feature'``.

    The frame is modified in place and nothing is returned.
    """
    for position, compiled_rule in enumerate(rules):
        matches_per_file = [rule_scan_hash(path, compiled_rule)
                            for path in list(dataframe['hash'])]
        dataframe[rules_names[position] + '_feature'] = matches_per_file

def rule_scan_hash(hash_var,rule):
    """Scan one file with one rule object and return its matches.

    Args:
        hash_var: value passed straight to ``rule.match`` (the callers in this
            file pass the path of the file to scan).
        rule: object exposing a ``match`` method (the callers in this file
            pass the result of ``yara.compile``).

    Returns:
        Whatever ``rule.match(hash_var)`` returns, or an empty list when the
        scan fails (best effort: a file that cannot be scanned simply has no
        matches).
    """
    try:
        return rule.match(hash_var)
    except Exception:
        # Only ordinary errors are swallowed; KeyboardInterrupt / SystemExit
        # still propagate so the user can abort a long scan.
        return []
    
# PE feature helpers.
def pe_list_imported_DLLs(hash_var):
    """Return the names of the DLLs imported by a PE file.

    Args:
        hash_var: value handed to ``pefile.PE`` (the callers in this file pass
            the path of the file to analyse).

    Returns:
        A list with the UTF-8 decoded DLL name of every import-directory
        entry, in file order.  An empty list is returned when the file cannot
        be parsed, has no import directory, or a name cannot be decoded
        (best effort, all-or-nothing).
    """
    pe = None
    try:
        pe = pefile.PE(hash_var)
        return [entry.dll.decode('utf-8') for entry in pe.DIRECTORY_ENTRY_IMPORT]
    except Exception:
        return []
    finally:
        # Release the handle pefile keeps on the analysed file.
        if pe is not None:
            pe.close()


def pe_list_imported_DLLs_imports(hash_var,dll_name):
    """Return the function names a PE file imports from one specific DLL.

    Args:
        hash_var: value handed to ``pefile.PE`` (a file path in this script).
        dll_name: DLL name to look for; compared case-sensitively with the
            decoded name of each import-directory entry.

    Returns:
        A list of UTF-8 decoded function names imported from ``dll_name``.
        Imports without a name (``func.name`` is ``None``, e.g. imports by
        ordinal) are skipped instead of discarding the whole result.  An empty
        list is returned when the file cannot be parsed or has no imports.
    """
    pe = None
    try:
        pe = pefile.PE(hash_var)
        list_imports = []
        for entry in pe.DIRECTORY_ENTRY_IMPORT:
            if entry.dll.decode('utf-8') != dll_name:
                continue
            # Report the DLL that was actually requested (this message used to
            # say "Kernel32.dll" regardless of dll_name).
            print(f"[*] {dll_name} imports:")
            list_imports.extend(func.name.decode('utf-8')
                                for func in entry.imports
                                if func.name is not None)
        return list_imports
    except Exception:
        return []
    finally:
        # Release the handle pefile keeps on the analysed file.
        if pe is not None:
            pe.close()

def pe_sections_name(hash_var):
    """Return the section names of a PE file.

    Args:
        hash_var: value handed to ``pefile.PE`` (a file path in this script).

    Returns:
        One entry per section, in file order: the UTF-8 decoded section name
        with NUL padding bytes removed, or the literal token ``'faild'`` when
        a name cannot be decoded.  An empty list is returned when the file
        cannot be parsed at all.
    """
    pe = None
    try:
        pe = pefile.PE(hash_var)
        list_section = []
        for section in pe.sections:
            try:
                # Strip the \x00 padding from the section name.  A plain
                # str.replace does the job without needing the `re` module.
                list_section.append(section.Name.decode('utf-8').replace('\x00', ''))
            except Exception:
                # Spelling kept verbatim: the token is a feature value.
                list_section.append('faild')
        return list_section
    except Exception:
        return []
    finally:
        # Release the handle pefile keeps on the analysed file.
        if pe is not None:
            pe.close()

def add_feature_PE(dataframe):
    """Add the PE-derived feature columns to ``dataframe`` and return it.

    Two columns are written, each holding one list per row computed from the
    value in ``dataframe['hash']``:

    * ``'sections_name_feature'``  - result of ``pe_sections_name``
    * ``'imported_DLLs_feature'``  - result of ``pe_list_imported_DLLs``

    The frame is modified in place; the same object is returned.
    The per-DLL imported-function feature is intentionally not computed here
    (it was disabled because it takes a very long time).
    """
    dataframe['sections_name_feature'] = [
        pe_sections_name(path) for path in list(dataframe['hash'])
    ]
    dataframe['imported_DLLs_feature'] = [
        pe_list_imported_DLLs(path) for path in list(dataframe['hash'])
    ]
    return dataframe


# Build the list of feature (column) names for one list-valued column.
def get_new_columns(colum_values):
    """Return the distinct items found in a column of lists, as strings.

    Args:
        colum_values: the column's values; in this script a 1-D object array
            whose elements are lists (``DataFrame[col].values``).

    Returns:
        A list of unique strings: ``np.unique`` first collapses identical
        rows, the remaining rows are flattened, every item is converted with
        ``str`` and duplicates are dropped keeping first-seen order.

    De-duplication happens on the string form, so two different objects with
    the same ``str()`` yield a single column name (they previously produced a
    duplicated name).
    """
    flattened = itertools.chain.from_iterable(np.unique(colum_values))
    # dict.fromkeys keeps insertion order and de-duplicates in linear time.
    return list(dict.fromkeys(str(item) for item in flattened))

# One-hot (multi-hot) encode one list-valued column.
def encode_lists(colum_values):
    """Encode a column of lists as a 0/1 indicator DataFrame.

    Args:
        colum_values: the column's values, one list of items per row.

    Returns:
        A ``pandas.DataFrame`` with one row per input row and one column per
        name returned by ``get_new_columns(colum_values)``; a cell is 1 when
        the string form of any item of that row equals the column name,
        otherwise 0.
    """
    newlist = get_new_columns(colum_values)

    # Name -> column position, built once instead of calling list.index for
    # every item.  setdefault keeps the first position of a repeated name,
    # which is what list.index would return.
    position_of = {}
    for position, name in enumerate(newlist):
        position_of.setdefault(name, position)

    list_indexs = []
    for row_items in colum_values:
        list_zero = [0] * len(newlist)
        for item in row_items:
            list_zero[position_of[str(item)]] = 1
        list_indexs.append(list_zero)
    return pd.DataFrame(list_indexs, columns=newlist)
  
# --- Step 1: extract the features while showing a progress window ----------
# This step has to run BEFORE the encoding/prediction below: until the YARA
# and PE columns have been added, test_dataset only contains the 'hash'
# column, so there would be nothing to encode.
layout = [[sg.Text('Extracting Features')],
          [sg.ProgressBar(2, orientation='h', size=(20, 20), key='progressbar')],
          [sg.Cancel()]]

window = sg.Window('Analyse', layout)
progress_bar = window['progressbar']
extraction_done = False
# One-shot loop: it only exists so that `break` can abort between the steps.
while True:
    # check to see if the cancel button was clicked and exit loop if clicked
    event, values = window.read(timeout=10)
    if event == 'Cancel'  or event == sg.WIN_CLOSED:
        break
    add_feature_yara(test_dataset,rules_list,rules_names_list)
    progress_bar.UpdateBar(1)
    event, values = window.read(timeout=10)
    if event == 'Cancel'  or event == sg.WIN_CLOSED:
        break
    test_dataset=add_feature_PE(test_dataset)
    progress_bar.UpdateBar(2)
    extraction_done = True
    break
# done with loop... need to destroy the window as it's still open
window.close()

# Without the extracted columns there is nothing to classify, so a cancelled
# extraction ends the program instead of failing later on.
if not extraction_done:
    raise SystemExit('Feature extraction was cancelled.')

# --- Step 2: one-hot encode every extracted feature column ------------------
# columns[0] is 'hash' (the file path); every other column holds lists.
features_df_test = pd.concat(
    [encode_lists(test_dataset[col].values) for col in test_dataset.columns[1:]],
    axis=1)
newlist_test=list(features_df_test.columns)

# --- Step 3: align the encoded columns with the model's feature order -------
# `newlist` (loaded from the pickle) fixes the position of every feature.
# Names that are not in `newlist` are ignored; features of `newlist` that were
# not observed in this file stay 0.
feature_position = {}
for slot, name in enumerate(newlist):
    feature_position.setdefault(name, slot)  # first position wins, like list.index

features_to_pridict=[]
for encoded_row in features_df_test.values:
    list_zero=[0]*len(newlist)
    # Walk the row positionally so that a column name occurring in more than
    # one encoded column is handled too: the feature is set if any of them is.
    for name, flag in zip(newlist_test, encoded_row):
        slot = feature_position.get(name)
        if slot is not None and flag:
            list_zero[slot]=1
    features_to_pridict.append(list_zero)

# --- Step 4: ask every model for its verdict on the (single) file -----------
pred=[model.predict(features_to_pridict)[0] for model in model_list]

### Show Results

def get_color(val):
    """Return the display colour for a prediction: 'red' if ``val > 0``, else 'green'."""
    return 'red' if val>0 else 'green'
        
def print_value(val):
    """Return the display label for a prediction: 'Malicious' if ``val > 0``, else 'Benign'."""
    return 'Malicious' if val>0 else 'Benign'
        
# Display labels, one per model.  NOTE(review): the order is assumed to match
# the order of the models stored in model_list - confirm against the training
# script that wrote the pickle.
model_labels = ['XGBoost detected:      ',
                'Random Forest detected:',
                'Decision Tree detected:',
                'AdaBoost detected:     ',
                'Naive Bayes detected:  ',
                'SGD detected:          ',
                'MLP detected:          ']
# Should the pickle hold more models than there are labels, fall back to a
# generic label so that every verdict still gets its own row.
model_labels += [f'Model {number} detected:'
                 for number in range(len(model_labels) + 1, len(pred) + 1)]

# Left column: model names; right column: the verdict of each model.
# Both are derived from `pred`, so the view follows the number of models.
col11 = [[sg.Text(label)] for label in model_labels[:len(pred)]]
col12 = [[sg.Text(print_value(verdict), text_color=get_color(verdict))]
         for verdict in pred]

total_detections = np.sum(pred)  # number of models that flagged the file
risk_percent = int(total_detections / len(pred) * 100) if len(pred) else 0

col2 = [[sg.Text('  Risk Level')],
        [sg.VerticalSeparator(10),
         sg.ProgressBar(len(pred), orientation='v', size=(20, 60), key='progressbar')]]

layout = [[sg.Column(col11),sg.Column(col12),sg.VerticalSeparator(),sg.Column(col2)],
          [sg.Text(f'\n\n Final Score (RISK) {risk_percent}%     ({total_detections} out of {len(pred)})\n\n')],
          [sg.OK()]]

# Display the window and keep the risk bar filled until the user confirms.
window = sg.Window('Results', layout)
while True:
    event, values = window.read(timeout=10)
    if event == 'OK'  or event == sg.WIN_CLOSED:
        break
    window['progressbar'].update(total_detections)

window.close()