ADD sklearn LOF support

salesforce · May 9, 2024 · 8a2747a · 8a2747a
1 parent 005e8cc
commit 8a2747a
Show file tree

Hide file tree

Showing 6 changed files with 287 additions and 1 deletion.
diff --git a/conf/benchmark_anomaly.json b/conf/benchmark_anomaly.json
@@ -191,5 +191,14 @@
     "post_rule_train_config": {
       "default": {"unsup_quantile": 0.95}
     }
+  },
+  "LOF": {"alias": "LocalOutlierFactor"},
+  "LocalOutlierFactor": {
+    "config": {
+      "default": {}
+    },
+    "post_rule_train_config": {
+      "default": {"unsup_quantile": 0.95}
+    }
   }
 }
diff --git a/docs/source/merlion.models.anomaly.rst b/docs/source/merlion.models.anomaly.rst
@@ -133,6 +133,13 @@ anomaly.lstm_ed
    :undoc-members:
    :show-inheritance:
 
+anomaly.lof
+^^^^^^^^^^^
+.. automodule:: merlion.models.anomaly.lof
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 anomaly.deep\_point\_anomaly\_detector
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 .. automodule:: merlion.models.anomaly.deep_point_anomaly_detector

diff --git a/merlion/dashboard/models/anomaly.py b/merlion/dashboard/models/anomaly.py
@@ -24,6 +24,7 @@ class AnomalyModel(ModelMixin, DataMixin):
         "ArimaDetector",
         "DynamicBaseline",
         "IsolationForest",
+        "LocalOutlierFactor",
         "ETSDetector",
         "MSESDetector",
         "ProphetDetector",
@@ -34,7 +35,7 @@ class AnomalyModel(ModelMixin, DataMixin):
         "ZMS",
         "DeepPointAnomalyDetector",
     ]
-    multivariate_algorithms = ["IsolationForest", "AutoEncoder", "VAE", "DAGMM", "LSTMED"]
+    multivariate_algorithms = ["IsolationForest", "AutoEncoder", "VAE", "DAGMM", "LSTMED", "LocalOutlierFactor"]
     thresholds = ["Threshold", "AggregateAlarms"]
 
     def __init__(self):

diff --git a/merlion/models/anomaly/lof.py b/merlion/models/anomaly/lof.py
@@ -0,0 +1,175 @@
+#
+# Copyright (c) 2024 salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+#
+"""
+The classic LocalOutlierFactor model for anomaly detection.
+"""
+import logging
+
+import numpy as np
+import pandas as pd
+from sklearn.neighbors import LocalOutlierFactor
+
+from merlion.models.anomaly.base import DetectorConfig, DetectorBase
+from merlion.transform.moving_average import DifferenceTransform
+from merlion.transform.sequence import TransformSequence
+from merlion.transform.resample import Shingle
+
+logger = logging.getLogger(__name__)
+
+
+class LOFConfig(DetectorConfig):
+    """
+    Configuration class for the `LOF` detector, which wraps scikit-learn's `LocalOutlierFactor`.
+    """
+
+    _default_transform = TransformSequence([DifferenceTransform(), Shingle(size=2, stride=1)])
+
+    def __init__(
+        self,
+        n_neighbors=20,
+        algorithm="auto",
+        leaf_size=30,
+        metric="minkowski",
+        p=2,
+        metric_params=None,
+        contamination=0.1,
+        n_jobs=1,
+        novelty=True,
+        **kwargs
+    ):
+        """
+        n_neighbors : int, optional (default=20)
+            Number of neighbors to use by default for `kneighbors` queries.
+            If n_neighbors is larger than the number of samples provided,
+            all samples will be used.
+
+        algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional (default='auto')
+            Algorithm used to compute the nearest neighbors:
+
+            - 'ball_tree' will use BallTree
+            - 'kd_tree' will use KDTree
+            - 'brute' will use a brute-force search.
+            - 'auto' will attempt to decide the most appropriate algorithm
+              based on the values passed to :meth:`fit` method.
+
+            Note: fitting on sparse input will override the setting of
+            this parameter, using brute force.
+
+        leaf_size : int, optional (default=30)
+            Leaf size passed to `BallTree` or `KDTree`. This can
+            affect the speed of the construction and query, as well as the memory
+            required to store the tree. The optimal value depends on the
+            nature of the problem.
+
+        metric : string or callable, default 'minkowski'
+            metric used for the distance computation. Any metric from scikit-learn
+            or scipy.spatial.distance can be used.
+
+            If 'precomputed', the training input X is expected to be a distance
+            matrix.
+
+            If metric is a callable function, it is called on each
+            pair of instances (rows) and the resulting value recorded. The callable
+            should take two arrays as input and return one value indicating the
+            distance between them. This works for Scipy's metrics, but is less
+            efficient than passing the metric name as a string.
+
+            Valid values for metric are:
+
+            - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
+              'manhattan']
+
+            - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
+              'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
+              'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
+              'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
+              'sqeuclidean', 'yule']
+
+            See the documentation for scipy.spatial.distance for details on these
+            metrics:
+            http://docs.scipy.org/doc/scipy/reference/spatial.distance.html
+
+        p : integer, optional (default = 2)
+            Parameter for the Minkowski metric from
+            sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
+            equivalent to using manhattan_distance (l1), and euclidean_distance
+            (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
+            See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances
+
+        metric_params : dict, optional (default = None)
+            Additional keyword arguments for the metric function.
+
+        contamination : float in (0., 0.5), optional (default=0.1)
+            The amount of contamination of the data set, i.e. the proportion
+            of outliers in the data set. When fitting this is used to define the
+            threshold on the decision function.
+
+        n_jobs : int, optional (default = 1)
+            The number of parallel jobs to run for neighbors search.
+            If ``-1``, then the number of jobs is set to the number of CPU cores.
+            Affects only kneighbors and kneighbors_graph methods.
+
+        novelty : bool (default=True)
+            Forwarded to scikit-learn's LocalOutlierFactor, whose own default
+            is False (outlier detection on the training set only). It defaults
+            to True here because `LOF` scores data with score_samples, which
+            scikit-learn only provides in novelty mode; note that score_samples
+            is intended for new unseen data rather than the training set.
+        """
+        self.contamination = contamination
+        self.n_neighbors = n_neighbors
+        self.algorithm = algorithm
+        self.leaf_size = leaf_size
+        self.metric = metric
+        self.p = p
+        self.metric_params = metric_params
+        self.n_jobs = n_jobs
+        self.novelty = novelty
+        # max_score is a placeholder; it is expected to be overridden by the calibrator
+        kwargs["max_score"] = 1.0
+        super().__init__(**kwargs)
+
+
+class LOF(DetectorBase):
+    """
+    Anomaly detector wrapping scikit-learn's `LocalOutlierFactor`; higher scores mean more anomalous.
+    """
+
+    config_class = LOFConfig
+
+    def __init__(self, config: LOFConfig):
+        super().__init__(config)
+        self.model = LocalOutlierFactor(
+            n_neighbors=config.n_neighbors,
+            algorithm=config.algorithm,
+            leaf_size=config.leaf_size,
+            metric=config.metric,
+            p=config.p,
+            metric_params=config.metric_params,
+            contamination=config.contamination,
+            n_jobs=config.n_jobs,
+            novelty=config.novelty,
+        )
+
+    @property
+    def require_even_sampling(self) -> bool:
+        return False
+
+    @property
+    def require_univariate(self) -> bool:
+        return False
+
+    def _train(self, train_data: pd.DataFrame, train_config=None) -> pd.DataFrame:
+        self.model.fit(train_data.values)
+        # Use the LOF values computed during fit; score_samples is only meant for data unseen in training
+        train_scores = -self.model.negative_outlier_factor_
+        return pd.DataFrame(train_scores, index=train_data.index, columns=["anom_score"])
+
+    def _get_anomaly_score(self, time_series: pd.DataFrame, time_series_prev: pd.DataFrame = None) -> pd.DataFrame:
+        # sklearn returns the opposite of the LOF (lower = more anomalous); negate it so higher = more anomalous
+        scores = -self.model.score_samples(time_series.values)
+        return pd.DataFrame(scores, index=time_series.index)
diff --git a/merlion/models/factory.py b/merlion/models/factory.py
@@ -26,6 +26,7 @@
     ArimaDetector="merlion.models.anomaly.forecast_based.arima:ArimaDetector",
     DynamicBaseline="merlion.models.anomaly.dbl:DynamicBaseline",
     IsolationForest="merlion.models.anomaly.isolation_forest:IsolationForest",
+    LocalOutlierFactor="merlion.models.anomaly.lof:LOF",
     # Forecast-based anomaly detection models
     ETSDetector="merlion.models.anomaly.forecast_based.ets:ETSDetector",
     MSESDetector="merlion.models.anomaly.forecast_based.mses:MSESDetector",

diff --git a/tests/anomaly/test_lof.py b/tests/anomaly/test_lof.py
@@ -0,0 +1,93 @@
+#
+# Copyright (c) 2024 salesforce.com, inc.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+#
+import logging
+from os.path import abspath, dirname, join
+import sys
+import unittest
+
+import numpy as np
+
+from merlion.models.anomaly.lof import LOF, LOFConfig
+from merlion.transform.moving_average import MovingAverage, ExponentialMovingAverage
+from merlion.transform.resample import Shingle
+from merlion.transform.sequence import TransformSequence
+from merlion.post_process.threshold import AggregateAlarms
+from merlion.utils.data_io import csv_to_time_series
+
+rootdir = dirname(dirname(dirname(abspath(__file__))))  # three directory levels above this test file
+logger = logging.getLogger(__name__)
+
+
+class TestLOF(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.csv_name = join(rootdir, "data", "example.csv")
+        self.test_len = 32768  # number of trailing points held out as the test split
+        self.data = csv_to_time_series(self.csv_name, timestamp_unit="ms", data_cols=["kpi"])
+        logger.info(f"Data looks like:\n{self.data[:5]}")
+        self.vals_train = self.data[: -self.test_len]
+        self.vals_test = self.data[-self.test_len :]
+
+        # This transform is not one you would likely use in practice; it is
+        # chosen here so that ExponentialMovingAverage and MovingAverage are
+        # exercised on a multi-variate time series
+        self.model = LOF(
+            LOFConfig(
+                threshold=AggregateAlarms(alm_threshold=3.5),
+                transform=TransformSequence(
+                    [
+                        Shingle(size=5, stride=1),
+                        ExponentialMovingAverage(alpha=0.9, normalize=True),
+                        MovingAverage(weights=[0.1, 0.2, 0.3, 0.4]),
+                    ]
+                )
+            )
+        )
+        print()
+        logger.info("Training model...\n")
+        self.model.train(self.vals_train, post_rule_train_config={"unsup_quantile": 0.999})
+
+    def test_score(self):
+        # get_anomaly_score returns the raw anomaly scores; this test only logs them (it makes no assertions)
+        print("-" * 80)
+        logger.info("test_score\n" + "-" * 80 + "\n")
+        scores = self.model.get_anomaly_score(self.vals_test)
+        logger.info(f"Scores look like:\n{scores[:5]}")
+        scores = scores.to_pd().values.flatten()
+        logger.info("max score = " + str(max(scores)))
+        logger.info("min score = " + str(min(scores)) + "\n")
+
+    def test_alarm(self):
+        # get_anomaly_label returns the post-rule processed anomaly scores; non-zero entries are counted as alarms
+        print("-" * 80)
+        logger.info("test_alarm\n" + "-" * 80 + "\n")
+        alarms = self.model.get_anomaly_label(self.vals_test)
+        n_alarms = np.sum(alarms.to_pd().values != 0)
+        logger.info(f"Alarms look like:\n{alarms[:5]}")
+        logger.info(f"Number of alarms: {n_alarms}\n")
+        self.assertLess(n_alarms, 17)
+
+    def test_save_load(self):
+        print("-" * 80)
+        logger.info("test_save_load\n" + "-" * 80 + "\n")
+        self.model.save(dirname=join(rootdir, "tmp", "lof"))
+        loaded_model = LOF.load(dirname=join(rootdir, "tmp", "lof"))
+
+        scores = self.model.get_anomaly_score(self.vals_test)
+        loaded_model_scores = loaded_model.get_anomaly_score(self.vals_test)
+        self.assertSequenceEqual(list(scores), list(loaded_model_scores))  # scores must match after save/load
+
+        alarms = self.model.get_anomaly_label(self.vals_test)
+        loaded_model_alarms = loaded_model.get_anomaly_label(self.vals_test)
+        self.assertSequenceEqual(list(alarms), list(loaded_model_alarms))  # labels must match after save/load
+
+
+if __name__ == "__main__":  # when run as a script: log at DEBUG level to stdout, then run the tests
+    logging.basicConfig(
+        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", stream=sys.stdout, level=logging.DEBUG
+    )
+    unittest.main()