Merge pull request #115 from HDI-Project/issue_111_improve_timeseries_primitives_performance

Add new primitives for timeseries preprocessing and anomaly detection
csala authored Feb 21, 2019
2 parents 2c8b702 + 480c363 commit 98a341c
Showing 6 changed files with 384 additions and 0 deletions.
167 changes: 167 additions & 0 deletions mlprimitives/custom/timeseries_anomalies.py
@@ -0,0 +1,167 @@
"""
Time Series anomaly detection functions.
Implementation inspired by the paper https://arxiv.org/pdf/1802.04431.pdf
"""

import numpy as np
import pandas as pd
from scipy.optimize import fmin


def regression_errors(y, y_hat, smoothing_window=0.01, smooth=True):
    """Compute an array of absolute errors comparing predictions and expected output.

    If smooth is True, apply EWMA to the resulting array of errors.

    Args:
        y (array): Ground truth.
        y_hat (array): Predictions array.
        smoothing_window (float): Size of the smoothing window, expressed as a proportion
            of the total length of y.
        smooth (bool): Whether the returned errors should be smoothed with EWMA.

    Returns:
        (array): errors
    """
    errors = np.abs(y - y_hat)[:, 0]

    if not smooth:
        return errors

    smoothing_window = int(smoothing_window * len(y))

    return pd.Series(errors).ewm(span=smoothing_window).mean().values


def deltas(errors, epsilon, mean, std):
    """Compute mean and std deltas.

    delta_mean = mean(errors) - mean(all errors below epsilon)
    delta_std = std(errors) - std(all errors below epsilon)
    """
    below = errors[errors <= epsilon]
    if not len(below):
        return 0, 0

    return mean - below.mean(), std - below.std()


def count_above(errors, epsilon):
    """Count number of errors and continuous sequences above epsilon.

    Continuous sequences are counted by shifting and counting the number
    of positions where there was a change and the original value was true,
    which means that a sequence started at that position.
    """
    above = errors > epsilon
    total_above = len(errors[above])

    above = pd.Series(above)
    shift = above.shift(1)
    change = above != shift

    total_consecutive = sum(above & change)

    return total_above, total_consecutive


def z_cost(z, errors, mean, std):
    """Compute how bad a z value is.

    The original formula is::

                         (delta_mean/mean) + (delta_std/std)
        ------------------------------------------------------
        number of errors above + (number of sequences above)^2

    which computes the "goodness" of `z`, meaning that the higher the value
    the better the `z`.

    In this case, we return this value inverted (we make it negative), to convert
    it into a cost function, as later on we will use scipy to minimize it.
    """
    epsilon = mean + z * std

    delta_mean, delta_std = deltas(errors, epsilon, mean, std)
    above, consecutive = count_above(errors, epsilon)

    numerator = -(delta_mean / mean + delta_std / std)
    denominator = above + consecutive ** 2

    if denominator == 0:
        return np.inf

    return numerator / denominator


def find_threshold(errors, z_range=(0, 10)):
    """Find the ideal threshold.

    The ideal threshold is the one that minimizes the z_cost function.
    """
    mean = errors.mean()
    std = errors.std()

    min_z, max_z = z_range
    best_z = min_z
    best_cost = np.inf
    for z in range(min_z, max_z):
        best = fmin(z_cost, z, args=(errors, mean, std), full_output=True, disp=False)
        z, cost = best[0:2]
        if cost < best_cost:
            best_z = z[0]    # fmin returns the optimum as a 1-element array
            best_cost = cost    # keep track of the best cost found so far

    return mean + best_z * std


def find_sequences(errors, epsilon):
    """Find sequences of values that are above epsilon.

    This is done following these steps:

        * create a boolean mask that indicates which values are above epsilon.
        * shift this mask by one place, filling the empty gap with a False.
        * compare the shifted mask with the original one to see if there are changes.
        * Consider a sequence start any point which was true and has changed.
        * Consider a sequence end the point right before any point which was false
          and has changed, that is, the last true position of each run.
    """
    above = pd.Series(errors > epsilon)
    shift = above.shift(1).fillna(False)
    change = above != shift

    index = above.index
    starts = index[above & change].tolist()
    ends = (index[~above & change] - 1).tolist()
    if len(ends) == len(starts) - 1:
        # the last sequence reaches the end of the array
        ends.append(len(above) - 1)

    return list(zip(starts, ends))


def find_anomalies(errors, index, z_range=(0, 10)):
    """Find sequences of values that are anomalous.

    We first find the ideal threshold for the set of errors that we have,
    and then find the sequences of values that are above this threshold.

    Lastly, we compute a score proportional to the maximum error in the
    sequence, and finally return the index pairs that correspond to
    each sequence, along with its score.
    """
    mean = errors.mean()
    std = errors.std()

    threshold = find_threshold(errors, z_range)
    sequences = find_sequences(errors, threshold)

    anomalies = list()
    denominator = mean + std
    for start, stop in sequences:
        max_error = errors[start:stop + 1].max()    # stop is the last index of the sequence
        score = (max_error - threshold) / denominator
        anomalies.append([index[start], index[stop], score])

    return np.asarray(anomalies)
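
A minimal usage sketch of how these functions chain together, assuming predictions shaped (n, 1), since regression_errors indexes the error array with [:, 0]; the synthetic sine signal and the injected deviation below are only illustrative:

import numpy as np

from mlprimitives.custom.timeseries_anomalies import find_anomalies, regression_errors

# Synthetic ground truth and predictions, shaped (n, 1).
timestamps = np.arange(5000)
y = np.sin(timestamps / 100.0).reshape(-1, 1)
y_hat = y.copy()
y_hat[2000:2100] += 5.0    # inject a deviation so that an anomaly exists

# Smoothed absolute errors (EWMA over 1% of the series length).
errors = regression_errors(y, y_hat, smoothing_window=0.01, smooth=True)

# Each row is [start_index, end_index, score].
anomalies = find_anomalies(errors, timestamps, z_range=(0, 10))
print(anomalies)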
42 changes: 42 additions & 0 deletions mlprimitives/custom/timeseries_preprocessing.py
@@ -0,0 +1,42 @@
import numpy as np
import pandas as pd


def rolling_window_sequences(X, index, window_size, target_size, target_column):
    """Create rolling window sequences out of timeseries data."""
    out_X = list()
    out_y = list()
    out_index = list()

    target = X[:, target_column]

    for start in range(len(X) - window_size - target_size + 1):
        end = start + window_size
        out_X.append(X[start:end])
        out_y.append(target[end:end + target_size])
        out_index.append(index[start])

    return np.asarray(out_X), np.asarray(out_y), np.asarray(out_index)


def time_segments_average(X, interval, time_column):
    """Compute average of values over fixed length time segments."""
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X)

    X = X.sort_values(time_column).set_index(time_column)

    start_ts = X.index.values[0]
    max_ts = X.index.values[-1]

    values = list()
    index = list()
    while start_ts <= max_ts:
        end_ts = start_ts + interval
        subset = X.loc[start_ts:end_ts - 1]
        means = subset.mean(skipna=True).values
        values.append(means)
        index.append(start_ts)
        start_ts = end_ts

    return np.asarray(values), np.asarray(index)
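
A minimal sketch of how the two preprocessing primitives are typically chained, assuming a two-column array whose first column holds timestamps in seconds; the signal, segment length and window size are only illustrative:

import numpy as np

from mlprimitives.custom.timeseries_preprocessing import (
    rolling_window_sequences, time_segments_average)

# Illustrative signal: one reading per second for three hours.
timestamps = np.arange(0, 3 * 3600)
values = np.sin(timestamps / 600.0)
data = np.column_stack([timestamps, values])

# Average the readings over 60-second segments (time column is column 0).
averaged, index = time_segments_average(data, interval=60, time_column=0)

# Build windows of 50 past segment averages to predict the next one.
# target_column is 0 here because only one value column remains after averaging.
X, y, X_index = rolling_window_sequences(
    averaged, index, window_size=50, target_size=1, target_column=0)

print(X.shape, y.shape, X_index.shape)    # (130, 50, 1) (130, 1) (130,)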
@@ -0,0 +1,39 @@
{
"name": "mlprimitives.custom.timeseries_anomalies.find_anomalies",
"contributors": ["Carles Sala <csala@csail.mit.edu>"],
"documentation": "https://arxiv.org/pdf/1802.04431.pdf",
"description": "Extract anomalies from sequences of errors following the approach explained in the related paper.",
"classifiers": {
"type": "postprocessor",
"subtype": "anomaly_detector"
},
"modalities": ["timeseries"],
"primitive": "mlprimitives.custom.timeseries_anomalies.find_anomalies",
"produce": {
"args": [
{
"name": "errors",
"type": "ndarray"
},
{
"name": "index",
"type": "ndarray"
}
],
"output": [
{
"name": "y",
"type": "ndarray"
}
]
},
"hyperparameters": {
"fixed": {
"z_range": {
"type": "tuple",
"default": [0, 12]
}
},
"tunable": {}
}
}
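
The fixed z_range hyperparameter is forwarded to the function as the keyword argument of the same name. A minimal sketch, using a made-up error array with one clearly anomalous region:

import numpy as np

from mlprimitives.custom.timeseries_anomalies import find_anomalies

errors = np.random.exponential(0.1, 1000)   # stand-in error array
errors[500:520] += 2.0                      # make one region clearly anomalous
index = np.arange(1000)

# z_range comes from the "fixed" hyperparameters block above.
anomalies = find_anomalies(errors, index, z_range=(0, 12))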
@@ -0,0 +1,42 @@
{
"name": "mlprimitives.custom.timeseries_anomalies.regression_errors",
"contributors": ["Carles Sala <csala@csail.mit.edu>"],
"description": "Compute an array of absolute errors comparing predictions and expected output. Optionally smooth them using EWMA",
"classifiers": {
"type": "postprocessor",
"subtype": "feature_extractor"
},
"modalities": ["timeseries"],
"primitive": "mlprimitives.custom.timeseries_anomalies.regression_errors",
"produce": {
"args": [
{
"name": "y",
"type": "ndarray"
},
{
"name": "y_hat",
"type": "ndarray"
}
],
"output": [
{
"name": "errors",
"type": "ndarray"
}
]
},
"hyperparameters": {
"tunable": {
"smooth": {
"type": "bool",
"default": true
},
"smoothing_window": {
"type": "float",
"default": 0.01,
"range": [0.001, 0.1]
}
}
}
}
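
Both tunable hyperparameters map directly onto keyword arguments of the function. A minimal sketch with made-up data, assuming column-vector inputs of shape (n, 1), which the function requires because it indexes the error array with [:, 0]:

import numpy as np

from mlprimitives.custom.timeseries_anomalies import regression_errors

y = np.random.normal(size=(1000, 1))                    # ground truth, shape (n, 1)
y_hat = y + np.random.normal(scale=0.1, size=(1000, 1))

smoothed = regression_errors(y, y_hat, smoothing_window=0.01, smooth=True)
raw = regression_errors(y, y_hat, smooth=False)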
@@ -0,0 +1,53 @@
{
"name": "mlprimitives.custom.timeseries_preprocessing.rolling_window_sequences",
"contributors": ["Carles Sala <csala@csail.mit.edu>"],
"description": "Create rolling window sequences out of timeseries data.",
"classifiers": {
"type": "preprocessor",
"subtype": "feature_extractor"
},
"modalities": ["timeseries"],
"primitive": "mlprimitives.custom.timeseries_preprocessing.rolling_window_sequences",
"produce": {
"args": [
{
"name": "X",
"type": "ndarray"
},
{
"name": "index",
"type": "ndarray"
}
],
"output": [
{
"name": "X",
"type": "ndarray"
},
{
"name": "y",
"type": "ndarray"
},
{
"name": "index",
"type": "ndarray"
}
]
},
"hyperparameters": {
"fixed": {
"window_size": {
"type": "int",
"default": 250
},
"target_size": {
"type": "int",
"default": 1
},
"target_column": {
"type": "str or int",
"default": 1
}
}
}
}
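
A minimal sketch of the default behaviour on a made-up two-column feature matrix, showing the shapes the primitive outputs:

import numpy as np

from mlprimitives.custom.timeseries_preprocessing import rolling_window_sequences

X_in = np.random.normal(size=(1000, 2))     # made-up feature matrix
index = np.arange(1000)                     # one entry per row of X_in

X, y, X_index = rolling_window_sequences(
    X_in, index, window_size=250, target_size=1, target_column=1)

print(X.shape, y.shape, X_index.shape)      # (750, 250, 2) (750, 1) (750,)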
@@ -0,0 +1,41 @@
{
"name": "mlprimitives.custom.timeseries_preprocessing.time_segments_average",
"contributors": ["Carles Sala <csala@csail.mit.edu>"],
"description": "Compute average of values over fixed length time segments.",
"classifiers": {
"type": "preprocessor",
"subtype": "feature_extractor"
},
"modalities": ["timeseries"],
"primitive": "mlprimitives.custom.timeseries_preprocessing.time_segments_average",
"produce": {
"args": [
{
"name": "X",
"type": "ndarray"
}
],
"output": [
{
"name": "X",
"type": "ndarray"
},
{
"name": "index",
"type": "ndarray"
}
]
},
"hyperparameters": {
"fixed": {
"interval": {
"type": "int",
"default": 3600
},
"time_column": {
"type": "str or int",
"default": 0
}
}
}
}
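
The default interval of 3600 yields hourly segments when the time column contains POSIX timestamps in seconds, which is the assumption in this sketch with made-up readings:

import numpy as np

from mlprimitives.custom.timeseries_preprocessing import time_segments_average

# One reading per minute over two days of epoch-second timestamps.
timestamps = np.arange(1500000000, 1500000000 + 2 * 24 * 3600, 60)
readings = np.random.normal(size=len(timestamps))
data = np.column_stack([timestamps, readings])

values, index = time_segments_average(data, interval=3600, time_column=0)
print(values.shape, index.shape)            # (48, 1) (48,)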
