-
Notifications
You must be signed in to change notification settings - Fork 11
/
et.py
103 lines (94 loc) · 4.85 KB
/
et.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# -*- coding: utf-8 -*
import numpy as np
import pandas as pd
import pickle
from lib import processing
from scoring import cost_based_scoring
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
__ATTR_NAMES = ("duration", # length (number of seconds) of the conn's
"protocol_type", # symbolic, type of the protocol, e.g. tcp, udp, etc.
"service", # symbolic, network service on the destination, e.g., http, telnet, etc.
"flag", # symbolic, normal or error status of the conn
"src_bytes", # number of data bytes from source to destination
"dst_bytes", # number of data bytes from destination to source
"land", # symbolic, 1 if conn is from/to the same host/port; 0 otherwise
"wrong_fragment", # number of ''wrong'' fragments
"urgent", # number of urgent packets
# ----------
# ----- Basic features of individual TCP conn's -----
# ----------
"hot", # number of ''hot'' indicators
"num_failed_logins", # number of failed login attempts
"logged_in", # symbolic, 1 if successfully logged in; 0 otherwise
"num_compromised", # number of ''compromised'' conditions
"root_shell", # 1 if root shell is obtained; 0 otherwise
"su_attempted", # 1 if ''su root'' command attempted; 0 otherwise
"num_root", # number of ''root'' accesses
"num_file_creations", # number of file creation operations
"num_shells", # number of shell prompts
"num_access_files", # number of operations on access control files
"num_outbound_cmds", # number of outbound commands in an ftp session
"is_host_login", # symbolic, 1 if the login belongs to the ''hot'' list; 0 otherwise
"is_guest_login", # symbolic, 1 if the login is a ''guest''login; 0 otherwise
# ----------
# ----- Content features within a conn suggested by domain knowledge -----
# ----------
"count", # number of conn's to the same host as the current conn in the past two seconds
# Time-based Traffic Features (examine only the conn in the past two seconds):
# 1. Same Host, have the same destination host as the current conn
# 2. Same Service, have the same service as the current conn.
"srv_count", # SH, number of conn's to the same service as the current conn
"serror_rate", # SH, % of conn's that have SYN errors
"srv_serror_rate", # SS, % of conn's that have SYN errors
"rerror_rate", # SH, % of conn's that have REJ errors
"srv_rerror_rate", # SS, % of conn's that have REJ errors
"same_srv_rate", # SH, % of conn's to the same service
"diff_srv_rate", # SH, % of conn's to different services
"srv_diff_host_rate", # SH, % of conn's to different hosts
# ----------
# Host-base Traffic Features, constructed using a window of 100 conn's to the same host
"dst_host_count",
"dst_host_srv_count",
"dst_host_same_srv_rate",
"dst_host_diff_srv_rate",
"dst_host_same_src_port_rate",
"dst_host_srv_diff_host_rate",
"dst_host_serror_rate",
"dst_host_srv_serror_rate",
"dst_host_rerror_rate",
"dst_host_srv_rerror_rate",
# ----------
# category
"attack_type"
)
df = pd.read_csv(r'data/train10pc', header=None, names=__ATTR_NAMES)
# sparse feature merge
df = processing.merge_sparse_feature(df)
# one hot encoding
df = processing.one_hot(df)
# y labels mapping
df = processing.map2major5(df)
with open(r'data/selected_feat_names.pkl', 'rb') as f:
selected_feat_names = pickle.load(f)
print("training data loaded")
y = df["attack_type"].values
X = df[selected_feat_names].values
# TODO: put the best paras learn from grid search
et = ExtraTreesClassifier(n_jobs=-1,n_estimators=5,criterion='entropy',random_state=0)
et = et.fit(X, y)
print("training finished")
df=pd.read_csv(r'data/corrected',header=None, names=__ATTR_NAMES)
df = processing.merge_sparse_feature(df)
# one hot encoding
df = processing.one_hot(df)
# y labels mapping
df = processing.map2major5(df)
with open(r'data/selected_feat_names.pkl', 'rb') as f:
selected_feat_names = pickle.load(f)
print("test data loaded")
X = df[selected_feat_names].values
y = df['attack_type'].values
y_rf = et.predict(X)
print("et results:")
cost_based_scoring.score(y, y_rf, True)