Skip to content

Commit

Permalink
Merge pull request #296 from pyerie/main
Browse files Browse the repository at this point in the history
Create: Anti-Malware program using ML and RAG boilerplate using langchain
  • Loading branch information
Techiral authored Oct 4, 2024
2 parents da39e0c + 9adb41d commit 02ab593
Show file tree
Hide file tree
Showing 5 changed files with 988 additions and 0 deletions.
26 changes: 26 additions & 0 deletions A/Anti-Malware_application/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# An ML-based anti-malware program written in Python #

<p> I first created this project for a competition. I had to learn a lot of new things and took inspiration (and a bit of code) from others who had tried similar projects before. This project uses customtkinter for the GUI and scikit-learn for the ML operations. It uses a decision tree classifier to classify files as malware or benign. It can classify a single file or all the files in a folder. It currently works only with executable files (but can be modified to work with other file types). Using the pefile library, the program extracts information from the executable file(s) in order to classify them. </p>


Note: It is not perfect and is prone to many false positives, but I hardly encountered any false negatives. I think this is due to overfitting of the model.

### Installation and execution

1) Install the necessary libraries
```
pip3 install customtkinter
# tkinter ships with Python and is not installed via pip (on Debian/Ubuntu: sudo apt install python3-tk)
pip3 install pandas
pip3 install scikit-learn
pip3 install pefile
pip3 install numpy
```

2) Execute the program
```
python3 anti-malware.py
```

Happy learning!
241 changes: 241 additions & 0 deletions A/Anti-Malware_application/anti-malware.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
#################################################################################
### Author: Pyerie #
### Application: A not-so-accurate ML based anti-malware solution #
#################################################################################

print("[+] Loading.... ")
import customtkinter
from tkinter.filedialog import *
from tkinter import *
import pefile
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import os



# Load the labelled feature table: every column except 'legitimate' is a
# model input, and 'legitimate' is the target.
dataset = pd.read_csv('database3.csv')
X = dataset.drop(['legitimate'], axis=1).values
y = dataset['legitimate'].values

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Fit once: repeated fits on identical data only replace the previous tree.
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
res1 = clf.predict(X_test)

# Format the score as a real percentage. Slicing the float's string form
# (str(accuracy)[2:4]) reports "0%" for a perfect 1.0 and "9%" for 0.9.
accuracy = metrics.accuracy_score(y_test, res1)
accuracy = f"{accuracy:.2%}"
print("Accuracy: " + accuracy)


# Select the dark customtkinter look before any widget is created.
customtkinter.set_appearance_mode("dark")
customtkinter.set_default_color_theme("dark-blue")

# Root window, sized to the dimensions Tk reports for the screen.
window = Tk()
screen_width = window.winfo_screenwidth()
screen_height = window.winfo_screenheight()
window.geometry(f"{screen_width}x{screen_height}")
window.title("eSuraksha")
window.configure(bg="#121212")
def extract_features(file):
    """Return the list of PE header values the classifier takes as input.

    Args:
        file: Path of the file to parse with ``pefile``.

    Returns:
        A list of 14 integers read from the PE headers, in the fixed order
        built below. The order must not change: the values are passed
        positionally to the trained model.

    Raises:
        UnboundLocalError: If ``pefile`` rejects the file as not being a PE
            file. The callers in this script detect non-PE input by catching
            exactly this exception type, so it is raised explicitly here
            instead of arising by accident from an unassigned variable.
    """
    try:
        pe_obj = pefile.PE(file, fast_load=True)
    except pefile.PEFormatError as error:
        print("Not PE file!")
        raise UnboundLocalError("not a PE file: " + str(file)) from error

    try:
        optional_header = pe_obj.OPTIONAL_HEADER
        directories = optional_header.DATA_DIRECTORY

        # Directory entry 12 may be absent from a short data-directory
        # table; use 0 in that case.
        try:
            entry_12_address = directories[12].VirtualAddress
        except IndexError:
            entry_12_address = 0

        return [
            directories[6].Size,
            directories[6].VirtualAddress,
            optional_header.MajorImageVersion,
            optional_header.MajorOperatingSystemVersion,
            directories[0].VirtualAddress,
            directories[0].Size,
            entry_12_address,
            directories[2].Size,
            optional_header.MajorLinkerVersion,
            pe_obj.FILE_HEADER.NumberOfSections,
            optional_header.SizeOfStackReserve,
            optional_header.DllCharacteristics,
            optional_header.AddressOfEntryPoint,
            optional_header.ImageBase,
        ]
    finally:
        # Release the file that pefile keeps open; otherwise a folder scan
        # accumulates one open file per executable visited.
        pe_obj.close()

# Window-state flags for the two scan dialogs. single_file() and scan_many()
# read and write them as the strings "True"/"False", so they start out as the
# string "False" rather than the bool False, which equals neither string.
toplevel_created = "False"

toplevel2_created = "False"

def single_file():
    """Ask for one file, classify it and show the verdict in a small window.

    Closes any result window that is already open, opens a new one with a
    "Loading..." placeholder, asks the user for a file, and replaces the
    placeholder with the outcome. If the dialog is cancelled the new window
    is closed again and nothing is scanned.

    The window-state globals are kept as the strings "True"/"False" because
    scan_many() reads and writes the same flags in that form.
    """
    global toplevel_created
    global toplevel2_created
    global single_file_top

    # Only one result window is shown at a time: close whichever is open.
    if toplevel_created == "True":
        single_file_top.destroy()
        toplevel_created = "False"
    if toplevel2_created == "True":
        many_files.destroy()
        toplevel2_created = "False"

    single_file_top = Toplevel(window)
    single_file_top.geometry("350x200")
    customtkinter.set_appearance_mode("dark")
    customtkinter.set_default_color_theme("dark-blue")
    single_file_top['bg'] = "#121212"
    single_file_top.title("Scan a single file")
    toplevel_created = "True"
    result = customtkinter.CTkLabel(single_file_top, text="Loading...")
    result.pack()

    def show_outcome(text, **label_options):
        # Swap the "Loading..." placeholder for the final message.
        result.after(0, result.destroy)
        customtkinter.CTkLabel(single_file_top, text=text, **label_options).pack()

    file_path = askopenfilename()
    if not file_path:
        # Dialog cancelled: there is nothing to scan, so do not leave an
        # empty "Loading..." window behind.
        single_file_top.destroy()
        toplevel_created = "False"
        return

    try:
        features_extracted = extract_features(str(file_path))
    except (UnboundLocalError, pefile.PEFormatError):
        # extract_features() signals a non-PE file with UnboundLocalError.
        show_outcome("Not PE file!")
        return
    except OSError:
        # Missing, locked or otherwise unreadable file.
        show_outcome("Could not read file!")
        return

    # The model expects a 2-D array holding a single row of features.
    data_of_sample = np.array(features_extracted).reshape(1, -1)
    prediction = clf.predict(data_of_sample)

    # NOTE(review): the model's target column is named 'legitimate', yet
    # class 1 is reported as malware here - confirm the label encoding used
    # in database3.csv.
    if prediction[0] == 1:
        show_outcome("ML model detected malware!", fg_color="red")
    elif prediction[0] == 0:
        show_outcome("No malware detected!", fg_color="green")


# File-name suffixes treated as PE candidates during a folder scan.
_PE_SUFFIXES = (
    ".acm", ".ax", ".cpl", ".dll", ".drv", ".efi", ".exe",
    ".mui", ".ocx", ".scr", ".sys", ".tsp", ".bin",
)


def _find_flagged_files(top):
    """Walk *top* and return the paths of PE candidates the model labels 1."""
    flagged = []
    for root, _subdirs, files in os.walk(str(top)):
        for name_of_file in files:
            # Match suffixes case-insensitively so "SETUP.EXE" is scanned too.
            if not name_of_file.lower().endswith(_PE_SUFFIXES):
                continue

            path = os.path.join(str(root), str(name_of_file))
            try:
                features_of_many = extract_features(path)
            except (UnboundLocalError, pefile.PEFormatError, OSError):
                # Not a PE file, or unreadable: skip it. Classifying here
                # would reuse the previous file's features.
                continue

            # The model expects a 2-D array holding a single row of features.
            data_for_many = np.array(features_of_many).reshape(1, -1)
            if clf.predict(data_for_many)[0] == 1:
                flagged.append(path)
    return flagged


def scan_many():
    """Ask for a folder, scan its executables and list the flagged files.

    Closes any result window that is already open, opens a new one with a
    "Loading..." placeholder, asks the user for a folder, and replaces the
    placeholder with either the list of flagged paths or a "nothing found"
    message. If the dialog is cancelled the new window is closed again.

    The window-state globals are kept as the strings "True"/"False" because
    single_file() reads and writes the same flags in that form.
    """
    global toplevel2_created
    global toplevel_created
    global many_files

    # Only one result window is shown at a time: close whichever is open.
    if toplevel2_created == "True":
        many_files.destroy()
        toplevel2_created = "False"
    if toplevel_created == "True":
        single_file_top.destroy()
        toplevel_created = "False"

    many_files = Toplevel(window)
    many_files.geometry("350x200")
    customtkinter.set_appearance_mode("dark")
    customtkinter.set_default_color_theme("dark-blue")
    many_files['bg'] = "#121212"
    many_files.title("Scan a directory")
    toplevel2_created = "True"
    result2 = customtkinter.CTkLabel(many_files, text="Loading...")
    result2.pack()

    directory = askdirectory()
    if not directory:
        # Dialog cancelled: nothing was scanned, so do not claim the folder
        # is clean.
        many_files.destroy()
        toplevel2_created = "False"
        return

    # NOTE(review): the model's target column is named 'legitimate', yet
    # class 1 is reported as malware here - confirm the label encoding used
    # in database3.csv.
    malware_many = _find_flagged_files(directory)

    result2.after(0, result2.destroy)
    if malware_many:
        malware_label2 = customtkinter.CTkLabel(many_files, text="Malware found: ")
        malware_label2.pack()
        malware_text_box = customtkinter.CTkTextbox(many_files)

        # One path per line, each followed by a separator line on its own
        # row so the next path does not run into the dashes.
        separator = '------------------------------------------'
        for_text_box = "".join(
            name_of_malware + '\n' + separator + '\n'
            for name_of_malware in malware_many
        )

        malware_text_box.insert('0.0', for_text_box)
        malware_text_box.configure(state="disabled")
        malware_text_box.pack()
    else:
        benign_label = customtkinter.CTkLabel(many_files, text="No malware found!")
        benign_label.pack()

# Main-window actions: one button per scan mode.
button1 = customtkinter.CTkButton(
    master=window,
    text="Scan a single file",
    command=single_file,
)
button1.pack()

button2 = customtkinter.CTkButton(
    master=window,
    text="Scan a folder",
    command=scan_many,
)
button2.pack()

# Enter the Tk event loop.
window.mainloop()
Loading

0 comments on commit 02ab593

Please sign in to comment.