Merge pull request #115 from HDI-Project/issue_111_improve_timeseries_primitives_performance

Add new primitives for timeseries preprocessing and anomaly detection
csala authored Feb 21, 2019
2 parents 2c8b702 + 480c363 commit 98a341c
Showing 6 changed files with 384 additions and 0 deletions.
167 changes: 167 additions & 0 deletions mlprimitives/custom/timeseries_anomalies.py
@@ -0,0 +1,167 @@
"""
Time Series anomaly detection functions.
Implementation inspired by the paper https://arxiv.org/pdf/1802.04431.pdf
"""

import numpy as np
import pandas as pd
from scipy.optimize import fmin


def regression_errors(y, y_hat, smoothing_window=0.01, smooth=True):
    """Compute an array of absolute errors comparing predictions and expected output.

    If smooth is True, apply EWMA to the resulting array of errors.

    Args:
        y (array): Ground truth.
        y_hat (array): Predictions array.
        smoothing_window (float): Size of the smoothing window, expressed as a proportion
            of the total length of y.
        smooth (bool): Whether the returned errors should be smoothed with EWMA.

    Returns:
        (array): errors
    """
    errors = np.abs(y - y_hat)[:, 0]

    if not smooth:
        return errors

    smoothing_window = int(smoothing_window * len(y))

    return pd.Series(errors).ewm(span=smoothing_window).mean().values


def deltas(errors, epsilon, mean, std):
    """Compute mean and std deltas.

    delta_mean = mean(errors) - mean(all errors below epsilon)
    delta_std = std(errors) - std(all errors below epsilon)
    """
    below = errors[errors <= epsilon]
    if not len(below):
        return 0, 0

    return mean - below.mean(), std - below.std()


def count_above(errors, epsilon):
    """Count number of errors and continuous sequences above epsilon.

    Continuous sequences are counted by shifting and counting the number
    of positions where there was a change and the original value was true,
    which means that a sequence started at that position.
    """
    above = errors > epsilon
    total_above = len(errors[above])

    above = pd.Series(above)
    shift = above.shift(1)
    change = above != shift

    total_consecutive = sum(above & change)

    return total_above, total_consecutive


def z_cost(z, errors, mean, std):
    """Compute how bad a z value is.

    The original formula is::

                         (delta_mean/mean) + (delta_std/std)
        ------------------------------------------------------
        number of errors above + (number of sequences above)^2

    which computes the "goodness" of `z`, meaning that the higher the value
    the better the `z`.

    In this case, we return this value inverted (we make it negative), to convert
    it into a cost function, as later on we will use scipy to minimize it.
    """
    epsilon = mean + z * std

    delta_mean, delta_std = deltas(errors, epsilon, mean, std)
    above, consecutive = count_above(errors, epsilon)

    numerator = -(delta_mean / mean + delta_std / std)
    denominator = above + consecutive ** 2

    if denominator == 0:
        return np.inf

    return numerator / denominator


def find_threshold(errors, z_range=(0, 10)):
    """Find the ideal threshold.

    The ideal threshold is the one that minimizes the z_cost function.
    """
    mean = errors.mean()
    std = errors.std()

    min_z, max_z = z_range
    best_z = min_z
    best_cost = np.inf
    for z in range(min_z, max_z):
        best = fmin(z_cost, z, args=(errors, mean, std), full_output=True, disp=False)
        z, cost = best[0:2]
        if cost < best_cost:
            best_z = z[0]    # fmin returns the optimum as a 1-element array
            best_cost = cost    # keep track of the best cost found so far

    return mean + best_z * std


def find_sequences(errors, epsilon):
    """Find sequences of values that are above epsilon.

    This is done following these steps:

        * create a boolean mask that indicates which values are above epsilon.
        * shift this mask by one place, filling the empty gap with a False.
        * compare the shifted mask with the original one to see if there are changes.
        * Consider a sequence start any point which was true and has changed.
        * Consider a sequence end the point right before any point which was false
          and has changed, that is, the last true position of each run.
    """
    above = pd.Series(errors > epsilon)
    shift = above.shift(1).fillna(False)
    change = above != shift

    index = above.index
    starts = index[above & change].tolist()
    ends = (index[~above & change] - 1).tolist()
    if len(ends) == len(starts) - 1:
        # the last sequence reaches the end of the array
        ends.append(len(above) - 1)

    return list(zip(starts, ends))


def find_anomalies(errors, index, z_range=(0, 10)):
    """Find sequences of values that are anomalous.

    We first find the ideal threshold for the set of errors that we have,
    and then find the sequences of values that are above this threshold.

    Lastly, we compute a score proportional to the maximum error in the
    sequence, and finally return the index pairs that correspond to
    each sequence, along with its score.
    """
    mean = errors.mean()
    std = errors.std()

    threshold = find_threshold(errors, z_range)
    sequences = find_sequences(errors, threshold)

    anomalies = list()
    denominator = mean + std
    for start, stop in sequences:
        max_error = errors[start:stop + 1].max()    # stop is the last index of the sequence
        score = (max_error - threshold) / denominator
        anomalies.append([index[start], index[stop], score])

    return np.asarray(anomalies)
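
A minimal usage sketch of how these functions chain together, assuming predictions shaped (n, 1), since regression_errors indexes the error array with [:, 0]; the synthetic sine signal and the injected deviation below are only illustrative:

import numpy as np

from mlprimitives.custom.timeseries_anomalies import find_anomalies, regression_errors

# Synthetic ground truth and predictions, shaped (n, 1).
timestamps = np.arange(5000)
y = np.sin(timestamps / 100.0).reshape(-1, 1)
y_hat = y.copy()
y_hat[2000:2100] += 5.0    # inject a deviation so that an anomaly exists

# Smoothed absolute errors (EWMA over 1% of the series length).
errors = regression_errors(y, y_hat, smoothing_window=0.01, smooth=True)

# Each row is [start_index, end_index, score].
anomalies = find_anomalies(errors, timestamps, z_range=(0, 10))
print(anomalies)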
42 changes: 42 additions & 0 deletions mlprimitives/custom/timeseries_preprocessing.py
@@ -0,0 +1,42 @@
import numpy as np
import pandas as pd


def rolling_window_sequences(X, index, window_size, target_size, target_column):
    """Create rolling window sequences out of timeseries data."""
    out_X = list()
    out_y = list()
    out_index = list()

    target = X[:, target_column]

    for start in range(len(X) - window_size - target_size + 1):
        end = start + window_size
        out_X.append(X[start:end])
        out_y.append(target[end:end + target_size])
        out_index.append(index[start])

    return np.asarray(out_X), np.asarray(out_y), np.asarray(out_index)


def time_segments_average(X, interval, time_column):
    """Compute average of values over fixed length time segments."""
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X)

    X = X.sort_values(time_column).set_index(time_column)

    start_ts = X.index.values[0]
    max_ts = X.index.values[-1]

    values = list()
    index = list()
    while start_ts <= max_ts:
        end_ts = start_ts + interval
        subset = X.loc[start_ts:end_ts - 1]
        means = subset.mean(skipna=True).values
        values.append(means)
        index.append(start_ts)
        start_ts = end_ts

    return np.asarray(values), np.asarray(index)
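
A minimal sketch of how the two preprocessing primitives are typically chained, assuming a two-column array whose first column holds timestamps in seconds; the signal, segment length and window size are only illustrative:

import numpy as np

from mlprimitives.custom.timeseries_preprocessing import (
    rolling_window_sequences, time_segments_average)

# Illustrative signal: one reading per second for three hours.
timestamps = np.arange(0, 3 * 3600)
values = np.sin(timestamps / 600.0)
data = np.column_stack([timestamps, values])

# Average the readings over 60-second segments (time column is column 0).
averaged, index = time_segments_average(data, interval=60, time_column=0)

# Build windows of 50 past segment averages to predict the next one.
# target_column is 0 here because only one value column remains after averaging.
X, y, X_index = rolling_window_sequences(
    averaged, index, window_size=50, target_size=1, target_column=0)

print(X.shape, y.shape, X_index.shape)    # (130, 50, 1) (130, 1) (130,)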
@@ -0,0 +1,39 @@
{
"name": "mlprimitives.custom.timeseries_anomalies.find_anomalies",
"contributors": ["Carles Sala <csala@csail.mit.edu>"],
"documentation": "https://arxiv.org/pdf/1802.04431.pdf",
"description": "Extract anomalies from sequences of errors following the approach explained in the related paper.",
"classifiers": {
"type": "postprocessor",
"subtype": "anomaly_detector"
},
"modalities": ["timeseries"],
"primitive": "mlprimitives.custom.timeseries_anomalies.find_anomalies",
"produce": {
"args": [
{
"name": "errors",
"type": "ndarray"
},
{
"name": "index",
"type": "ndarray"
}
],
"output": [
{
"name": "y",
"type": "ndarray"
}
]
},
"hyperparameters": {
"fixed": {
"z_range": {
"type": "tuple",
"default": [0, 12]
}
},
"tunable": {}
}
}
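
The fixed z_range hyperparameter is forwarded to the function as the keyword argument of the same name. A minimal sketch, using a made-up error array with one clearly anomalous region:

import numpy as np

from mlprimitives.custom.timeseries_anomalies import find_anomalies

errors = np.random.exponential(0.1, 1000)   # stand-in error array
errors[500:520] += 2.0                      # make one region clearly anomalous
index = np.arange(1000)

# z_range comes from the "fixed" hyperparameters block above.
anomalies = find_anomalies(errors, index, z_range=(0, 12))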
@@ -0,0 +1,42 @@
{
"name": "mlprimitives.custom.timeseries_anomalies.regression_errors",
"contributors": ["Carles Sala <csala@csail.mit.edu>"],
"description": "Compute an array of absolute errors comparing predictions and expected output. Optionally smooth them using EWMA",
"classifiers": {
"type": "postprocessor",
"subtype": "feature_extractor"
},
"modalities": ["timeseries"],
"primitive": "mlprimitives.custom.timeseries_anomalies.regression_errors",
"produce": {
"args": [
{
"name": "y",
"type": "ndarray"
},
{
"name": "y_hat",
"type": "ndarray"
}
],
"output": [
{
"name": "errors",
"type": "ndarray"
}
]
},
"hyperparameters": {
"tunable": {
"smooth": {
"type": "bool",
"default": true
},
"smoothing_window": {
"type": "float",
"default": 0.01,
"range": [0.001, 0.1]
}
}
}
}
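
Both tunable hyperparameters map directly onto keyword arguments of the function. A minimal sketch with made-up data, assuming column-vector inputs of shape (n, 1), which the function requires because it indexes the error array with [:, 0]:

import numpy as np

from mlprimitives.custom.timeseries_anomalies import regression_errors

y = np.random.normal(size=(1000, 1))                    # ground truth, shape (n, 1)
y_hat = y + np.random.normal(scale=0.1, size=(1000, 1))

smoothed = regression_errors(y, y_hat, smoothing_window=0.01, smooth=True)
raw = regression_errors(y, y_hat, smooth=False)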
@@ -0,0 +1,53 @@
{
"name": "mlprimitives.custom.timeseries_preprocessing.rolling_window_sequences",
"contributors": ["Carles Sala <csala@csail.mit.edu>"],
"description": "Create rolling window sequences out of timeseries data.",
"classifiers": {
"type": "preprocessor",
"subtype": "feature_extractor"
},
"modalities": ["timeseries"],
"primitive": "mlprimitives.custom.timeseries_preprocessing.rolling_window_sequences",
"produce": {
"args": [
{
"name": "X",
"type": "ndarray"
},
{
"name": "index",
"type": "ndarray"
}
],
"output": [
{
"name": "X",
"type": "ndarray"
},
{
"name": "y",
"type": "ndarray"
},
{
"name": "index",
"type": "ndarray"
}
]
},
"hyperparameters": {
"fixed": {
"window_size": {
"type": "int",
"default": 250
},
"target_size": {
"type": "int",
"default": 1
},
"target_column": {
"type": "str or int",
"default": 1
}
}
}
}
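
A minimal sketch of the default behaviour on a made-up two-column feature matrix, showing the shapes the primitive outputs:

import numpy as np

from mlprimitives.custom.timeseries_preprocessing import rolling_window_sequences

X_in = np.random.normal(size=(1000, 2))     # made-up feature matrix
index = np.arange(1000)                     # one entry per row of X_in

X, y, X_index = rolling_window_sequences(
    X_in, index, window_size=250, target_size=1, target_column=1)

print(X.shape, y.shape, X_index.shape)      # (750, 250, 2) (750, 1) (750,)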
@@ -0,0 +1,41 @@
{
"name": "mlprimitives.custom.timeseries_preprocessing.time_segments_average",
"contributors": ["Carles Sala <csala@csail.mit.edu>"],
"description": "Compute average of values over fixed length time segments.",
"classifiers": {
"type": "preprocessor",
"subtype": "feature_extractor"
},
"modalities": ["timeseries"],
"primitive": "mlprimitives.custom.timeseries_preprocessing.time_segments_average",
"produce": {
"args": [
{
"name": "X",
"type": "ndarray"
}
],
"output": [
{
"name": "X",
"type": "ndarray"
},
{
"name": "index",
"type": "ndarray"
}
]
},
"hyperparameters": {
"fixed": {
"interval": {
"type": "int",
"default": 3600
},
"time_column": {
"type": "str or int",
"default": 0
}
}
}
}
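
The default interval of 3600 yields hourly segments when the time column contains POSIX timestamps in seconds, which is the assumption in this sketch with made-up readings:

import numpy as np

from mlprimitives.custom.timeseries_preprocessing import time_segments_average

# One reading per minute over two days of epoch-second timestamps.
timestamps = np.arange(1500000000, 1500000000 + 2 * 24 * 3600, 60)
readings = np.random.normal(size=len(timestamps))
data = np.column_stack([timestamps, readings])

values, index = time_segments_average(data, interval=3600, time_column=0)
print(values.shape, index.shape)            # (48, 1) (48,)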
