-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAnomaly_Detection.R
285 lines (193 loc) · 7.46 KB
/
Anomaly_Detection.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
### Anomaly Detection in R
### 1. Statistical outlier detection
# Explore contents of dataset
head(river)
# Summary statistics of river nitrate concentrations
summary(river$nitrate)
# Plot the distribution of nitrate concentration
boxplot(river$nitrate)
# Separate the histogram into 40 bins
hist(river$nitrate, xlab = "Nitrate concentration", breaks = 40)
# The histogram you've generated displays a symmetrical bell shape
# which is what would be expected of normal data,
# which means you can proceed to use Grubbs' outlier test.
# Apply Grubbs' test to the river nitrate data
grubbs.test(river$nitrate)
# Apply Grubbs' test to the nitrate data
grubbs.test(river$nitrate)
# Use which.max to find row index of the max
which.max(river$nitrate)
# Runs Grubbs' test excluding row 156
grubbs.test(river$nitrate[-156])
# Print the value tested in the second Grubbs' test
max(river$nitrate[-156])
# View contents of dataset
head(river)
# Show the time series of nitrate concentrations with time
plot(nitrate ~ index, data = river, type = "o")
# Calculate the mean nitrate by month
monthly_mean <- tapply(river$nitrate, river$months, FUN = mean)
monthly_mean
# Plot the monthly means
plot(monthly_mean, type = "o", xlab = "Month", ylab = "Monthly mean")
# Create a boxplot of nitrate against months
boxplot(nitrate ~ months, data = river)
# Use Seasonal-Hybrid ESD for nitrate concentrations
river_anomalies <- AnomalyDetectionVec(x = river$nitrate, period = 12, direction = 'both', plot = T)
# Print the anomalies
river_anomalies$anoms
# Print the plot
print(river_anomalies$plot)
#######################
### 2. Distance and density based anomaly detection
# View the contents of the wine data
head(wine)
# Scatterplot of wine pH against alcohol
plot(pH ~ alcohol, data = wine)
# Calculate the 5 nearest neighbors distance
wine_nn <- get.knn(wine, k = 5)
# View the distance matrix
head(wine_nn$nn.dist)
# Distance from wine 5 to nearest neighbor
wine_nn$nn.dist[5, 1]
# Row index of wine 5's nearest neighbor
wine_nn$nn.ind[5, 1]
# Return data for wine 5 and its nearest neighbor
wine[c(5, wine_nn$nn.ind[5, 1]), ]
# Calculate the 5 nearest neighbors distance
wine_nn <- get.knn(wine, k = 5)
# Create score by averaging distances
wine_nnd <- rowMeans(wine_nn$nn.dist)
# Print row index of the most anomalous point
which.max(wine_nnd)
# Without standardization, features have different scales
summary(wine)
# Standardize the wine columns
wine_scaled <- scale(wine)
# Standardized features have similar means and quartiles
summary(wine_scaled)
# Print the 5-nearest neighbor distance score
wine_nnd[1:5]
# Append the score as a new column
wine$score <- wine_nnd
# Scatterplot showing pH, alcohol and kNN score
plot(pH ~ alcohol, data = wine, cex = sqrt(score), pch = 20)
# Calculate the LOF for wine data
wine_lof <- lof(scale(wine), 5)
# Append the LOF score as a new column
wine$score <- wine_lof
# Scatterplot showing pH, alcohol and LOF score
plot(pH ~ alcohol, data = wine, cex = score, pch = 20)
# Scaled wine data
wine_scaled <- scale(wine)
# Calculate and append kNN distance as a new column
wine_nn <- get.knn(wine_scaled, k = 10)
wine$score_knn <- rowMeans(wine_nn$nn.dist)
# Calculate and append LOF as a new column
wine$score_lof <- lof(wine_scaled, k = 10)
# Find the row location of highest kNN
which.max(wine$score_knn)
# Find the row location of highest LOF
which.max(wine$score_lof)
#######################
### 3. Isolation forest
# Build an isolation tree
wine_tree <- iForest(wine, nt = 1)
# Create isolation score
wine$tree_score <- predict(wine_tree, newdata = wine)
# Histogram plot of the scores
hist(wine$tree_score, breaks = 40)
# Fit isolation forest
wine_forest <- iForest(wine, nt = 100, phi = 200)
# Create isolation score from forest
wine_score <- predict(wine_forest, wine)
# Append score to the wine data
wine$score <- wine_score
# View the contents of the wine scores
head(wine_scores)
# Score scatterplot 2000 vs 1000 trees
plot(trees_2000 ~ trees_1000, data = wine_scores)
# Add reference line of equality
abline(a = 0, b = 1)
# Sequence of values for pH and alcohol
ph_seq <- seq(min(wine$pH), max(wine$pH), length.out = 25)
alcohol_seq <- seq(min(wine$alcohol), max(wine$alcohol), length.out = 25)
# Create a data frame of grid coordinates
wine_grid <- expand.grid(pH = ph_seq, alcohol = alcohol_seq)
# Visualise the grid using a scatterplot
plot(pH ~ alcohol, data = wine_grid, pch = 20)
# Calculate isolation score at grid locations
wine_grid$score <- predict(wine_forest, wine_grid)
# Contour plot of isolation scores
contourplot(score ~ alcohol + pH, wine_grid, region = TRUE)
#######################
### 4. Comparing performance
# View contents of thryoid data
head(thyroid)
# Tabulate the labels
table(thyroid$label)
# Proportion of thyroid cases
prop_disease <- 22/(978+22)
# Scatterplot showing TSH, T3 and anomaly labels
plot(TSH ~ T3, data = thyroid, pch = 20, col = label + 1)
# Scatterplot showing TT4, TBG and anomaly labels
plot(TT4 ~ TBG, data = thyroid, pch = 20, col = label + 1)
# Fit isolation forest
thyroid_forest <- iForest(thyroid[, -1], nt = 200, phi = 100)
# Anomaly score
thyroid$iso_score <- predict(thyroid_forest, thyroid[, -1])
# Boxplot of the anomaly score against labels
boxplot(iso_score ~ label, thyroid, col = "olivedrab4")
# Scale the measurement columns of thyroid
scaled_thyroid_measurements <- scale(thyroid[, -1])
# Create a LOF score for the measurements
lof_score <- lof(scaled_thyroid_measurements, k = 10)
# Calculate high threshold for
high_lof <- quantile(lof_score, probs = 0.98)
# Append binary LOF score to thyroid data
thyroid$binary_lof <- as.numeric(lof_score >= high_lof)
# Calculate high threshold for iso_score
high_iso <- quantile(iso_score, probs = 0.98)
# Append binary isolation score to thyroid data
thyroid$binary_iso <- as.numeric(iso_score >= high_iso)
# Tabulate agreement of label and binary isolation score
table(thyroid$label, thyroid$binary_iso)
# Tabulate agreement of label and binary LOF score
table(thyroid$label, thyroid$binary_lof)
# Proportion of binary_iso and label that agree
iso_prop <- sum(diag(table(thyroid$label, thyroid$binary_iso)))
/ sum(table(thyroid$label, thyroid$binary_iso))
# Proportion of binary_lof and label that agree
lof_prop <- sum(diag(table(thyroid$label, thyroid$binary_lof)))
/ sum(table(thyroid$label, thyroid$binary_lof))
# Tabulation for the binary isolation score
table(thyroid$label, thyroid$binary_iso)
# Precision for the isolation score
precision_iso <- 12 / (8 + 12)
# Recall for the isolation score
recall_iso <- 12 / (12 + 10)
# Tabulation for the binary lof score
table(thyroid$label, thyroid$binary_lof)
# Precision for the binary lof score
precision_lof <- 0 / (20 + 0)
# Recall for the binary lof score
recall_lof <- 0 / (22 + 0)
# Print the column classes in thyroid
sapply(X = thyroid, FUN = class)
# Convert column with character class to factor
thyroid$age <- as.factor(thyroid$age)
thyroid$sex <- as.factor(thyroid$sex)
# Check that all columns are factor or numeric
sapply(X = thyroid, FUN = class)
# Check the class of age column
class(thyroid$age)
# Check the class of sex column
class(thyroid$sex)
# Fit an isolation forest with 100 trees
thyroid_for <- iForest(thyroid[,-1], nt = 100)
# Calculate Gower's distance matrix
thyroid_dist <- daisy(thyroid[,-1], metric = "gower")
# Generate LOF scores for thyroid data
thyroid_lof <- lof(thyroid_dist, k = 10)
# Range of values in the distance matrix
range(as.matrix(thyroid_dist))