-
Notifications
You must be signed in to change notification settings - Fork 81
/
DetectMalware_CNN.lua
163 lines (131 loc) · 6.95 KB
/
DetectMalware_CNN.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
require 'nn'
require 'optim'
require 'nngraph'
require 'readMalwareData'
require 'splitMalwareData'
require 'buildNetwork'
require 'trainModel'
-- Command-line interface for the script.
-- Every option below becomes a field of the parsed `opt` table (the field name
-- is the option name without the leading dash), holding either the value given
-- on the command line or the default listed here.
local cmd = torch.CmdLine()

-- reproducibility
cmd:option('-seed',1,'seed the random number generator')

-- network shape
cmd:option('-nEmbeddingDims',8,'number of dims in lookupTable for projecting instructions to network')
cmd:option('-nConvFilters',64,'number of convolutional filters')
-- NOTE(review): the help text for -kernelLength, -nHiddenNodes and -weightClasses
-- was a copy of the -seed description; the replacements are derived from the
-- option names - confirm the wording against buildNetwork / trainModel.
cmd:option('-kernelLength',8,'length of the convolutional kernel')
cmd:option('-useHiddenLayer',true,'use hidden layer between the conv layers and classifier')
cmd:option('-nHiddenNodes',16,'number of nodes in the hidden layer')
cmd:option('-weightClasses',false,'weight the classes when computing the loss')
cmd:option('-nSamplingEpochs',10,'how often to sample the validation set - slow')
cmd:option('-useDropout',false,'use dropout between the conv and hidden layers')
cmd:option('-dropoutFrac',0.5,'dropout strength')
cmd:option('-randomize',false,'randomly select the network parameters')
cmd:option('-numDAShuffles',1,'number of function order shuffled versions of each program to keep')
cmd:option('-useOneHot',false,'Represent programs using one-hot / otherwise use look-up-table')

-- optimisation
cmd:option('-learningRate',1e-3,'learning rate')
cmd:option('-nEpochs',20,'training epochs')
cmd:option('-nConvLayers',1,'number of extra convolutional layers')
-- previously repeated the -nConvLayers description
cmd:option('-nFCLayers',1,'number of extra fully-connected layers')
cmd:option('-batchSize',1,'size of batch used in training')
cmd:option('-usemom',false,'use momentum during SGD optimisation')
cmd:option('-useRMSProp',false,'use alternative optimizer rather than SGD')

-- hardware
cmd:option('-useCUDA',false,'use CUDA optimisation')
cmd:option('-gpuid',1,'which GPU to use')

-- embedding / input handling
cmd:option('-usePreTrainedEmbedding',false,'initialise network with pre-trained embedding')
cmd:option('-fixEmbedding',false,'prevent the embedding from being updated during learning')
cmd:option('-programLen',8,'how many instructions to read')
cmd:option('-debug',false,'enter debug mode')
cmd:option('-dataAugProb',0.1,'probability of changing an instruction during data augmentation')
cmd:option('-dataAugMethod',1,'1 - substitue the semantically most similar instruction, 2 - substitue random instruction')
cmd:option('-trainingSetSize',2,'restrict the size of the training-set for evaluation purposes')
cmd:option('-markFunctionEnds',false,'place a marker at the end of each method which may help classification work better')

-- persistence and regularisation
cmd:option('-saveModel',false,'save the model and data split')
cmd:option('-saveFileName','detect_malware_cnn','filename to save the network')
cmd:option('-decayLearningRate',false,'reduce learning rate by factor of 10 every so often')
cmd:option('-weightDecay',0,'weight decay for L2 regularisation')
cmd:option('-weightDecayFrac',0.1,'amount to reduce learning rate by, 0.1 or 0.5 are good values')

-- try using dropout in various places of the network
cmd:option('-useSpatialDropout',false,'drop instructions after the embedding layer')
cmd:option('-useDropoutAfterEmbedding',false,'drop instructions after the embedding layer')
-- previously repeated the "after the embedding layer" description
cmd:option('-useDropoutAfterConv',false,'use dropout after the convolutional layers')

-- dataset locations and run mode
cmd:option('-dataDir','./malwareDataset/','directory with the android programs to classify')
cmd:option('-metaDataFile','./config/metaData.th7','file containing indicies of test/train/val split')
cmd:option('-setupMode',false,'Only run in this mode once. Splits the data into the train/test sets. Saved into ./config/metaData.th7')
cmd:option('-maxSequenceLength',1000000,'if program is longer than this length, crop sequence before passing to GPU')
cmd:option('-dataAugTesting',false,'Use data augmentation during testing i.e average score over random samples from program')

-- `opt` is intentionally left global: buildNetwork() is called below without
-- being handed the options, so the required modules presumably read this
-- global directly - do not make it local without checking them.
opt = cmd:parse(arg)
-- Runtime initialisation: optional GPU packages, default tensor type, RNG
-- seeding, and selection of the test module. The statement order is kept
-- exactly as it was: GPU packages are loaded first, then the CPU side is
-- configured and seeded, then the GPU device is chosen and seeded.
local cudaRequested = opt.useCUDA

if cudaRequested then
    -- only pull in the CUDA packages when they were asked for
    require 'cunn'
    require 'cutorch'
end

torch.setdefaulttensortype("torch.DoubleTensor")
torch.manualSeed(opt.seed)

if cudaRequested then
    cutorch.setDevice(opt.gpuid)
    cutorch.manualSeedAll(opt.seed)
end

-- Exactly one of the two test modules is loaded, chosen by -dataAugTesting.
-- Both names are non-empty strings, so the and/or selection is safe here.
local testModule = opt.dataAugTesting and 'testModel_dataAug' or 'testModel'
require(testModule)

-- echo the effective configuration for the run log
print(opt)
--- Report whether a value is NaN.
-- Relies on NaN being the only number that does not compare equal to itself.
-- Kept global (not `local`) because it is declared that way in this script
-- and may be called from the modules required above.
-- @param z value to test
-- @return true when `z` is not equal to itself, false otherwise
function isnan(z)
    local equalsItself = (z == z)
    return not equalsItself
end
--- One-off dataset setup (run with -setupMode).
-- Given a new dataset, split it into training and testing sets and save the
-- result to opt.metaDataFile. This is meant to be run only once per dataset:
-- later training runs randomly split the saved training-set again into
-- train / validation, which allows cross-validation on the training-set,
-- while the saved testing-set stays held out until development is finished.
local function runSetup()
    -- read the data from the root dir and decide which files belong to the dataset
    print('reading dataset')
    local info = readMalwareData_setup(opt.dataDir)

    print('splitting dataset into train/test sets')
    -- 90% goes to the training + validation pool, the remaining 10% is the held-out testing-set
    local trainFraction = 0.9
    local trainIdx, testIdx, ratio = splitMalwareDataTrainTest(info.label, trainFraction, 1 - trainFraction)

    local meta = {
        -- the split itself
        trainInds = trainIdx,
        testInds = testIdx,
        posNegRatio = ratio,
        trainPercentage = trainFraction,
        -- dataset description copied through from readMalwareData_setup
        filesList = info.filesList,
        family = info.family,
        label = info.label,
        benignFamily = info.benignFamily,
        familyName = info.familyName,
    }

    print('saving dataset metadata to file ',opt.metaDataFile)
    torch.save(opt.metaDataFile, meta)
end

--- Normal mode: load the saved split, build a network and train it.
-- Per the original note, training keeps the version with the lowest
-- validation error on disk; that logic lives in trainModel, not here.
local function runTraining()
    print(opt.metaDataFile)
    local meta = torch.load(opt.metaDataFile)

    print('reading data from disk')
    local data = readMalwareData(opt.dataDir, meta)
    print('reading data from disk - complete')

    local lengths = data.programLengths
    print('program lens ',torch.min(lengths),torch.max(lengths),torch.mean(lengths))

    -- take the saved train/test split and further split the train-set into train/val
    print('splitting data into train/val/test sets')
    -- the validation share is made the same size as the held-out test share
    local testShare = (1 - meta.trainPercentage)
    local valShare = (1 - meta.trainPercentage)
    local trainShare = 1 - (testShare + valShare)
    print('t,v,t')
    print(testShare,valShare,trainShare)

    local trainIdx, valIdx, testIdx, ratio = splitMalwareDataTrainValTest(data.label, meta, trainShare)
    local split = {
        trainInds = trainIdx,
        valInds = valIdx,
        testInds = testIdx,
        posNegRatio = ratio,
    }

    print('new network')
    -- note: the network is built from the ratio stored in the metadata file,
    -- not from the ratio returned by the split just above
    local net, lossFn = buildNetwork(meta.posNegRatio)

    print('starting training')
    -- the value returned by trainModel is not used by this script
    trainModel(net, lossFn, data, split.trainInds, split.valInds, split, meta)
end

if opt.setupMode then
    runSetup()
else
    runTraining()
end