# main.py (forked from d-ailin/GDN)
# -*- coding: utf-8 -*-
import argparse
import json
import os
import random
import sys
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, random_split, Subset

from util.env import get_device, set_device
from util.preprocess import build_loc_net, construct_data
from util.net_struct import get_feature_map, get_fc_graph_struc
from util.iostream import printsep
from datasets.TimeDataset import TimeDataset
from models.GDN import GDN
from train import train
from test import test
from evaluate import get_err_scores, get_best_performance_data, get_val_performance_data, get_full_err_scores


class Main():
    def __init__(self, train_config, env_config, debug=False):
        self.train_config = train_config
        self.env_config = env_config
        self.datestr = None

        dataset = self.env_config['dataset']
        train_orig = pd.read_csv(f'./data/{dataset}/train.csv', sep=',', index_col=0)
        test_orig = pd.read_csv(f'./data/{dataset}/test.csv', sep=',', index_col=0)

        train, test = train_orig, test_orig

        if 'attack' in train.columns:
            train = train.drop(columns=['attack'])

        # Get the dataset's feature names (i.e. the feature column names).
        feature_map = get_feature_map(dataset)
        # Build a graph over the features; here it is fully connected. Returns
        # a dict of lists: each feature maps to a list of all other features.
        fc_struc = get_fc_graph_struc(dataset)

        set_device(env_config['device'])
        self.device = get_device()

        # Build a 2-row nested list; both rows have the same length, the
        # number of edges in the graph. Matching positions in the two rows
        # form one directed edge, with nodes given as feature_map indices.
        # This follows PyTorch Geometric's COO sparse edge-index format.
        fc_edge_index = build_loc_net(fc_struc, list(train.columns), feature_map=feature_map)
        fc_edge_index = torch.tensor(fc_edge_index, dtype=torch.long)
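        # For illustration (hypothetical 3-feature graph ['a', 'b', 'c']): a
        # fully connected graph has 6 directed edges, so edge_index has shape
        # [2, 6]; each column pairs the two endpoint indices of one edge,
        # e.g. [[1, 2, 0, 2, 0, 1], [0, 0, 1, 1, 2, 2]].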

        self.feature_map = feature_map

        # Returns a list where each item holds all values of one feature, plus
        # a final item for the labels. With 10 features the list has length 11
        # (10 features + 1 label column). The training data has no anomaly
        # labels, so they default to 0.
        train_dataset_indata = construct_data(train, feature_map, labels=0)
        # Same layout: one item per feature, with the labels as the last item.
        test_dataset_indata = construct_data(test, feature_map, labels=test.attack.tolist())

        cfg = {
            'slide_win': train_config['slide_win'],
            'slide_stride': train_config['slide_stride'],
        }

        # Build the PyTorch datasets (sliding windows over the series).
        train_dataset = TimeDataset(train_dataset_indata, fc_edge_index, mode='train', config=cfg)
        test_dataset = TimeDataset(test_dataset_indata, fc_edge_index, mode='test', config=cfg)
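        # Assuming the reference TimeDataset: each sample is a window of the
        # previous slide_win points with the next point as the prediction
        # target, so with slide_win=15 and slide_stride=5 the training windows
        # start at t = 15, 20, 25, ...; test mode slides with stride 1.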

        # Build the dataloaders.
        train_dataloader, val_dataloader = self.get_loaders(train_dataset, train_config['seed'], train_config['batch'], val_ratio=train_config['val_ratio'])

        self.train_dataset = train_dataset
        self.test_dataset = test_dataset

        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.test_dataloader = DataLoader(test_dataset, batch_size=train_config['batch'],
                                          shuffle=False, num_workers=0)

        # GDN takes a list of edge_index tensors (edge_index_sets), one per
        # graph structure; only the single fully connected graph is used here.
        edge_index_sets = []
        edge_index_sets.append(fc_edge_index)

        self.model = GDN(edge_index_sets, len(feature_map),
                         dim=train_config['dim'],
                         input_dim=train_config['slide_win'],
                         out_layer_num=train_config['out_layer_num'],
                         out_layer_inter_dim=train_config['out_layer_inter_dim'],
                         topk=train_config['topk']
                         ).to(self.device)

    def run(self):
        if len(self.env_config['load_model_path']) > 0:
            model_save_path = self.env_config['load_model_path']
        else:
            model_save_path = self.get_save_path()[0]

        self.train_log = train(self.model, model_save_path,
                               config=self.train_config,
                               train_dataloader=self.train_dataloader,
                               val_dataloader=self.val_dataloader,
                               feature_map=self.feature_map,
                               test_dataloader=self.test_dataloader,
                               test_dataset=self.test_dataset,
                               train_dataset=self.train_dataset,
                               dataset_name=self.env_config['dataset']
                               )

        # Test: reload the saved checkpoint and evaluate it.
        self.model.load_state_dict(torch.load(model_save_path))
        best_model = self.model.to(self.device)

        _, self.test_result = test(best_model, self.test_dataloader)
        _, self.val_result = test(best_model, self.val_dataloader)

        self.get_score(self.test_result, self.val_result)

    def get_loaders(self, train_dataset, seed, batch, val_ratio=0.1):
        '''
        Randomly pick a val_start_index according to val_ratio, so that a
        random contiguous slice of the dataset is held out as the validation set.
        '''
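        # Worked example (hypothetical sizes): with dataset_len=1000 and
        # val_ratio=0.1, val_use_len=100; if val_start_index=350, validation
        # uses windows 350..449 and training uses 0..349 plus 450..999, so
        # the held-out block stays temporally contiguous.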
        dataset_len = int(len(train_dataset))
        train_use_len = int(dataset_len * (1 - val_ratio))
        val_use_len = int(dataset_len * val_ratio)
        val_start_index = random.randrange(train_use_len)
        indices = torch.arange(dataset_len)

        train_sub_indices = torch.cat([indices[:val_start_index], indices[val_start_index+val_use_len:]])
        train_subset = Subset(train_dataset, train_sub_indices)

        val_sub_indices = indices[val_start_index:val_start_index+val_use_len]
        val_subset = Subset(train_dataset, val_sub_indices)

        train_dataloader = DataLoader(train_subset, batch_size=batch,
                                      shuffle=True)

        val_dataloader = DataLoader(val_subset, batch_size=batch,
                                    shuffle=False)

        return train_dataloader, val_dataloader

    def get_score(self, test_result, val_result):
        '''
        Compute the evaluation metrics.
        '''
        feature_num = len(test_result[0][0])
        np_test_result = np.array(test_result)
        np_val_result = np.array(val_result)

        test_labels = np_test_result[2, :, 0].tolist()

        test_scores, normal_scores = get_full_err_scores(test_result, val_result)

        top1_best_info = get_best_performance_data(test_scores, test_labels, topk=1)
        # The paper takes the anomaly threshold from the validation scores,
        # which is why the validation data is called normal_scores here.
        top1_val_info = get_val_performance_data(test_scores, normal_scores, test_labels, topk=1)
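        # Note: 'best' sweeps thresholds directly over the test scores (an
        # optimistic upper bound), while 'val' derives the threshold from the
        # validation (normal) scores, assuming the reference evaluate.py.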

        print('=========================** Result **============================\n')

        info = None
        if self.env_config['report'] == 'best':
            info = top1_best_info
        elif self.env_config['report'] == 'val':
            info = top1_val_info

        print(f'F1 score: {info[0]}')
        print(f'precision: {info[1]}')
        print(f'recall: {info[2]}\n')

    def get_save_path(self, feature_name=''):
        dir_path = self.env_config['save_path']

        if self.datestr is None:
            now = datetime.now()
            self.datestr = now.strftime('%m|%d-%H:%M:%S')
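            # Caution: '|' and ':' are not valid in Windows file names; swap
            # in a format like '%m-%d_%H-%M-%S' if you need to run there.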
        datestr = self.datestr

        paths = [
            f'./pretrained/{dir_path}/best_{datestr}.pt',
            f'./results/{dir_path}/{datestr}.csv',
        ]

        for path in paths:
            dirname = os.path.dirname(path)
            Path(dirname).mkdir(parents=True, exist_ok=True)

        return paths


if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    parser.add_argument('-batch', help='batch size', type=int, default=128)
    parser.add_argument('-epoch', help='train epoch', type=int, default=100)
    parser.add_argument('-slide_win', help='slide_win', type=int, default=15)
    parser.add_argument('-dim', help='dimension', type=int, default=64)
    parser.add_argument('-slide_stride', help='slide_stride', type=int, default=5)
    parser.add_argument('-save_path_pattern', help='save path pattern', type=str, default='')
    parser.add_argument('-dataset', help='wadi / swat', type=str, default='wadi')
    parser.add_argument('-device', help='cuda / cpu', type=str, default='cuda')
    parser.add_argument('-random_seed', help='random seed', type=int, default=0)
    parser.add_argument('-comment', help='experiment comment', type=str, default='')
    parser.add_argument('-out_layer_num', help='outlayer num', type=int, default=1)
    parser.add_argument('-out_layer_inter_dim', help='out_layer_inter_dim', type=int, default=256)
    parser.add_argument('-decay', help='decay', type=float, default=0)
    parser.add_argument('-val_ratio', help='val ratio', type=float, default=0.1)
    parser.add_argument('-topk', help='topk num', type=int, default=20)
    parser.add_argument('-report', help='best / val', type=str, default='best')
    parser.add_argument('-load_model_path', help='trained model path', type=str, default='')

    args = parser.parse_args()

    random.seed(args.random_seed)
    np.random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    os.environ['PYTHONHASHSEED'] = str(args.random_seed)
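    # Note: cudnn.deterministic=True trades speed for run-to-run
    # reproducibility, and setting PYTHONHASHSEED at this point only affects
    # subprocesses; the interpreter's own hash seed is fixed at startup.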

    train_config = {
        'batch': args.batch,
        'epoch': args.epoch,
        'slide_win': args.slide_win,
        'dim': args.dim,
        'slide_stride': args.slide_stride,
        'comment': args.comment,
        'seed': args.random_seed,
        'out_layer_num': args.out_layer_num,
        'out_layer_inter_dim': args.out_layer_inter_dim,
        'decay': args.decay,
        'val_ratio': args.val_ratio,
        'topk': args.topk,
    }

    env_config = {
        'save_path': args.save_path_pattern,
        'dataset': args.dataset,
        'report': args.report,
        'device': args.device,
        'load_model_path': args.load_model_path
    }

    main = Main(train_config, env_config, debug=False)
    main.run()
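
    # Example invocation (flag values are illustrative; assumes
    # ./data/swat/train.csv and ./data/swat/test.csv exist):
    #   python main.py -dataset swat -device cuda -batch 32 -slide_win 5 -topk 15 -report best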