Feature: Score for change in the mean and/or covariance matrix #16

Merged Oct 3, 2024 · 39 commits

Commits
916d57d
Add covariance Frobenius norm diff score, and ipykernel as 'dev' depe…
johannvk Jul 2, 2024
4e2142f
Experimented with precision covariance score, but was not able to sol…
johannvk Jul 3, 2024
a7f4670
Testing pseudo-determinant based MV-Normal Likelihood ratio CPD.
johannvk Aug 28, 2024
a32745c
Added covariance difference operator norm score.
johannvk Aug 28, 2024
f01ef6e
Tested opnorm covariance score.
johannvk Oct 1, 2024
b624259
Cleaned up and removed unused functions from multivariate-mean-var sc…
johannvk Oct 1, 2024
c7e7178
Removed precision score using GraphicalLasso, and scikit-learn depend…
johannvk Oct 1, 2024
7415799
Added 'multivariate_meanvar_score' to score factory function.
johannvk Oct 1, 2024
021043c
Renamed multivariate_meanvar_score to mean_cov_score
johannvk Oct 1, 2024
81ceb4b
Merge branch 'main' into task/cov-matrix-cpd
johannvk Oct 1, 2024
7e943eb
Checked tests complete successfully, and plurialized 'start', 'end', …
johannvk Oct 1, 2024
d406461
feat(api)!: use "mean_var" rather than "meanvar" as score name
Tveten Oct 1, 2024
ed51441
Merge branch 'task/cov-matrix-cpd' of https://github.com/NorskRegnese…
Tveten Oct 1, 2024
0dc355a
feat(api)!: use "mean_var" rather than "meanvar" as score name
Tveten Oct 1, 2024
a08925d
Made tests pass when running without numba-JIT as well. Removed asser…
johannvk Oct 1, 2024
462c9d0
Pre-commit Formatting fixes
johannvk Oct 1, 2024
af86977
Fixx ruff formatting errors.
johannvk Oct 1, 2024
8763571
docs: improve score_factory documentation
Tveten Oct 1, 2024
11c31b0
docs: unify score argument documentation
Tveten Oct 1, 2024
4179df9
Merge branch 'task/cov-matrix-cpd' of https://github.com/NorskRegnese…
Tveten Oct 1, 2024
6385b7e
removed experimental covariance changepoint scores.
johannvk Oct 1, 2024
010b530
Add self to authors list
Tveten Oct 1, 2024
3937cdc
Add self to author of 'mean_cov_score.py' and maintainer.
johannvk Oct 1, 2024
32a4355
Rename author mtveten -> Tveten
Tveten Oct 1, 2024
a2b2bf4
Merge branch 'task/cov-matrix-cpd' of https://github.com/NorskRegnese…
Tveten Oct 1, 2024
bd10edd
Improved citation for the mean-cov-score methodology.
johannvk Oct 1, 2024
ea06337
Merge branch 'task/cov-matrix-cpd' of github.com:NorskRegnesentral/sk…
johannvk Oct 1, 2024
3bac97d
Reference in APA style.
johannvk Oct 1, 2024
22934fe
remove inputs checks
Tveten Oct 2, 2024
e10fbce
Add log_det_covariance utility and positive definite error handling
Tveten Oct 2, 2024
4318aca
fix: add missing njit decorator
Tveten Oct 2, 2024
3a82860
doc: fix mean_cov_score docs
Tveten Oct 2, 2024
1e48578
fix: ensure cov is 2-dimensional
Tveten Oct 2, 2024
d60cae6
test: add test for error when mean_cov score encounters negative defi…
Tveten Oct 2, 2024
a1a3a5f
Improve docs two places.
johannvk Oct 3, 2024
9943714
Bumped minor version.
johannvk Oct 3, 2024
4524e7f
Merge branch 'main' of https://github.com/NorskRegnesentral/skchange …
Tveten Oct 3, 2024
e90e4e8
Merge branch 'task/cov-matrix-cpd' of https://github.com/NorskRegnese…
Tveten Oct 3, 2024
36c51cb
Bump version in __init__
Tveten Oct 3, 2024
2 changes: 1 addition & 1 deletion interactive/explore_moscore.py
@@ -31,7 +31,7 @@
 df = generate_teeth_data(
     n_segments=2, variance=16, segment_length=100, p=1, random_state=1
 )
-detector = Moscore(score="meanvar")
+detector = Moscore(score="mean_var")
 changepoints = detector.fit_predict(df)
 px.scatter(df)
 px.scatter(detector.scores)
2 changes: 1 addition & 1 deletion interactive/explore_moscore_anomaly.py
@@ -15,7 +15,7 @@
 px.scatter(df)
 
 detector = MoscoreAnomaly(
-    score="meanvar",
+    score="mean_var",
     min_anomaly_length=10,
     max_anomaly_length=100,
     threshold_scale=3.0,
2 changes: 1 addition & 1 deletion interactive/explore_stat_threshold_anomaliser.py
@@ -12,7 +12,7 @@
     n, anomalies=[(100, 119), (250, 299)], means=[10.0, 5.0], random_state=1
 )
 
-change_detector = Moscore("meanvar", bandwidth=20)
+change_detector = Moscore("mean_var", bandwidth=20)
 change_detector = Pelt("mean", min_segment_length=5)
 detector = StatThresholdAnomaliser(
     change_detector, stat=np.mean, stat_lower=-1.0, stat_upper=1.0
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -1,12 +1,13 @@
 [project]
 name = "skchange"
-version = "0.7.0"
+version = "0.8.0"
 description = "Sktime-compatible change and anomaly detection"
 authors = [
     {name = "Martin Tveten", email = "tveten@nr.no"},
 ]
 maintainers = [
     {name = "Martin Tveten", email = "tveten@nr.no"},
+    {name = "Johannes Voll Kolstø", email = "jvkolsto@nr.no"},
 ]
 readme = "README.md"
 keywords = [
@@ -55,6 +56,7 @@ dev = [
     "pre-commit",
     "pytest",
     "pytest-cov",
+    "ipykernel",
 ]
 
 [build-system]
2 changes: 1 addition & 1 deletion skchange/__init__.py
@@ -1,3 +1,3 @@
 """skchange."""
 
-__version__ = "0.7.0"
+__version__ = "0.8.0"
2 changes: 1 addition & 1 deletion skchange/anomaly_detectors/capa.py
@@ -1,6 +1,6 @@
 """The collective and point anomalies (CAPA) algorithm."""
 
-__author__ = ["mtveten"]
+__author__ = ["Tveten"]
 __all__ = ["Capa"]
 
 from typing import Callable, Optional, Union
46 changes: 32 additions & 14 deletions skchange/anomaly_detectors/circular_binseg.py
@@ -1,6 +1,6 @@
 """Circular binary segmentation algorithm for multiple changepoint detection."""
 
-__author__ = ["mtveten"]
+__author__ = ["Tveten"]
 __all__ = ["CircularBinarySegmentation"]
 
 from typing import Callable, Optional, Union
@@ -107,30 +107,48 @@ class CircularBinarySegmentation(CollectiveAnomalyDetector):
 
 Parameters
 ----------
-score: str, tuple[Callable, Callable], optional (default="mean")
+score: {"mean", "mean_var", "mean_cov"}, tuple[Callable, Callable], default="mean"
 Test statistic to use for changepoint detection.
-* If "mean", the difference-in-mean statistic is used,
-* If "var", the difference-in-variance statistic is used,
-* If a tuple, it must contain two functions: The first function is the scoring
-function, which takes in the output of the second function as its first
-argument, and start, end and split indices as the second, third and fourth
-arguments. The second function is the initializer, which precomputes quantities
-that should be precomputed. See skchange/scores/score_factory.py for examples.
-threshold_scale : float, optional (default=2.0)
+
+* "mean": The CUSUM statistic for a change in mean (this is equivalent to a
+likelihood ratio test for a change in the mean of Gaussian data). For
+multivariate data, the sum of the CUSUM statistics for each dimension is used.
+* "mean_var": The likelihood ratio test for a change in the mean and/or variance
+of Gaussian data. For multivariate data, the sum of the likelihood ratio
+statistics for each dimension is used.
+* "mean_cov": The likelihood ratio test for a change in the mean and/or
+covariance matrix of multivariate Gaussian data.
+* If a tuple, it must contain two numba jitted functions:
+
+1. The first function is the scoring function, which takes four arguments:
+
+1. The output of the second function.
+2. Start indices of the intervals to score for a change
+3. End indices of the intervals to score for a change
+4. Split indices of the intervals to score for a change.
+
+For each start, split and end, the score should be calculated for the
+data intervals [start:split] and [split+1:end], meaning that both the
+starts and ends are inclusive, while split is included in the left
+interval.
+2. The second function is the initializer, which takes the data matrix as
+input and returns precomputed quantities that may speed up the score
+calculations. If not relevant, just return the data matrix.
+threshold_scale : float, default=2.0
 Scaling factor for the threshold. The threshold is set to
 'threshold_scale * 2 * p * np.sqrt(np.log(n))', where 'n' is the sample size
 and 'p' is the number of variables. If None, the threshold is tuned on the data
 input to .fit().
-level : float, optional (default=0.01)
+level : float, default=0.01
 If `threshold_scale` is None, the threshold is set to the (1-`level`)-quantile
 of the changepoint scores of all the seeded intervals on the training data.
 For this to be correct, the training data must contain no changepoints.
-min_segment_length : int, optional (default=5)
+min_segment_length : int, default=5
 Minimum length between two changepoints. Must be greater than or equal to 1.
-max_interval_length : int (default=100)
+max_interval_length : int, default=100
 The maximum length of an interval to estimate a changepoint in. Must be greater
 than or equal to '2 * min_segment_length'.
-growth_factor : float (default = 1.5)
+growth_factor : float, default=1.5
 The growth factor for the seeded intervals. Intervals grow in size according to
 'interval_len=max(interval_len + 1, np.floor(growth_factor * interval_len))',
 starting at 'interval_len'='min_interval_length'. It also governs the amount
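The custom-score interface documented above (a scoring function plus an initializer, both numba-jitted) can be sketched as follows. The function names and the toy "absolute difference in means" statistic are illustrative, not part of skchange; the argument order (precomputed data, starts, ends, splits) and the inclusive-interval convention follow the docstring.

```python
import numpy as np

try:
    from numba import njit  # skchange expects numba-jitted functions
except ImportError:  # fall back to a no-op decorator so the sketch still runs
    def njit(func=None, **kwargs):
        if func is None:
            return lambda f: f
        return func


@njit
def init_abs_mean_score(X: np.ndarray) -> np.ndarray:
    # Initializer: precompute quantities that speed up scoring.
    # Nothing to precompute for this toy score, so just return the data matrix.
    return X


@njit
def abs_mean_score(X, starts, ends, splits):
    # For each (start, split, end), score a change between the inclusive
    # intervals [start:split] and [split+1:end].
    scores = np.zeros(len(starts))
    for i in range(len(starts)):
        left = X[starts[i] : splits[i] + 1]
        right = X[splits[i] + 1 : ends[i] + 1]
        left_mean = np.sum(left, axis=0) / left.shape[0]
        right_mean = np.sum(right, axis=0) / right.shape[0]
        # Sum the absolute mean differences over the dimensions.
        scores[i] = np.sum(np.abs(left_mean - right_mean))
    return scores
```

A tuple `(abs_mean_score, init_abs_mean_score)` in this shape is what the `score` parameter would accept in place of a named statistic.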
2 changes: 1 addition & 1 deletion skchange/anomaly_detectors/moscore_anomaly.py
@@ -1,6 +1,6 @@
 """The Moving Score algorithm for multiple collective anomaly detection."""
 
-__author__ = ["mtveten"]
+__author__ = ["Tveten"]
 __all__ = ["MoscoreAnomaly"]
 
 from typing import Callable, Optional, Union
2 changes: 1 addition & 1 deletion skchange/anomaly_detectors/mvcapa.py
@@ -1,6 +1,6 @@
 """The subset multivariate collective and point anomalies (MVCAPA) algorithm."""
 
-__author__ = ["mtveten"]
+__author__ = ["Tveten"]
 __all__ = ["Mvcapa"]
 
 from typing import Callable, Optional, Union
6 changes: 3 additions & 3 deletions skchange/base.py
@@ -33,7 +33,7 @@ class name: BaseDetector
     fitted state flag - check_is_fitted()
 """
 
-__author__ = ["mtveten"]
+__author__ = ["Tveten"]
 __all__ = ["BaseDetector"]
 
 import pandas as pd
@@ -66,8 +66,8 @@ class BaseDetector(BaseEstimator):
 
     _tags = {
         "object_type": "detector",  # type of object
-        "authors": "mtveten",  # author(s) of the object
-        "maintainers": "mtveten",  # current maintainer(s) of the object
+        "authors": "Tveten",  # author(s) of the object
+        "maintainers": "Tveten",  # current maintainer(s) of the object
     }  # for unit test cases
 
     def __init__(self):
46 changes: 32 additions & 14 deletions skchange/change_detectors/moscore.py
@@ -1,6 +1,6 @@
 """The Moving Score algorithm for multiple changepoint detection."""
 
-__author__ = ["mtveten"]
+__author__ = ["Tveten"]
 __all__ = ["Moscore"]
 
 from typing import Callable, Optional, Union
@@ -58,30 +58,48 @@ class Moscore(ChangeDetector):
 
 Parameters
 ----------
-score: str, tuple[Callable, Callable], optional (default="mean")
+score: {"mean", "mean_var", "mean_cov"}, tuple[Callable, Callable], default="mean"
 Test statistic to use for changepoint detection.
-* If "mean", the difference-in-mean statistic is used,
-* If "var", the difference-in-variance statistic is used,
-* If a tuple, it must contain two functions: The first function is the scoring
-function, which takes in the output of the second function as its first
-argument, and start, end and split indices as the second, third and fourth
-arguments. The second function is the initializer, which precomputes quantities
-that should be precomputed. See skchange/scores/score_factory.py for examples.
-bandwidth : int, optional (default=30)
+
+* "mean": The CUSUM statistic for a change in mean (this is equivalent to a
+likelihood ratio test for a change in the mean of Gaussian data). For
+multivariate data, the sum of the CUSUM statistics for each dimension is used.
+* "mean_var": The likelihood ratio test for a change in the mean and/or variance
+of Gaussian data. For multivariate data, the sum of the likelihood ratio
+statistics for each dimension is used.
+* "mean_cov": The likelihood ratio test for a change in the mean and/or
+covariance matrix of multivariate Gaussian data.
+* If a tuple, it must contain two numba jitted functions:
+
+1. The first function is the scoring function, which takes four arguments:
+
+1. The output of the second function.
+2. Start indices of the intervals to score for a change
+3. End indices of the intervals to score for a change
+4. Split indices of the intervals to score for a change.
+
+For each start, split and end, the score should be calculated for the
+data intervals [start:split] and [split+1:end], meaning that both the
+starts and ends are inclusive, while split is included in the left
+interval.
+2. The second function is the initializer, which takes the data matrix as
+input and returns precomputed quantities that may speed up the score
+calculations. If not relevant, just return the data matrix.
+bandwidth : int, default=30
 The bandwidth is the number of samples on either side of a candidate
 changepoint. The minimum bandwidth depends on the
 test statistic. For "mean", the minimum bandwidth is 1.
-threshold_scale : float, optional (default=2.0)
+threshold_scale : float, default=2.0
 Scaling factor for the threshold. The threshold is set to
 'threshold_scale * default_threshold', where the default threshold depends on
 the number of samples, the number of variables, `bandwidth` and `level`.
 If None, the threshold is tuned on the data input to .fit().
-level : float, optional (default=0.01)
+level : float, default=0.01
 If `threshold_scale` is None, the threshold is set to the (1-`level`)-quantile
 of the changepoint score on the training data. For this to be correct, the
 training data must contain no changepoints. If `threshold_scale` is a number,
 `level` is used in the default threshold, _before_ scaling.
-min_detection_interval : int, optional (default=1)
+min_detection_interval : int, default=1
 Minimum number of consecutive scores above the threshold to be considered a
 changepoint. Must be between 1 and `bandwidth`/2.
 
@@ -305,6 +323,6 @@ def get_test_params(cls, parameter_set="default"):
     """
     params = [
         {"score": "mean", "bandwidth": 5},
-        {"score": "meanvar", "bandwidth": 5},
+        {"score": "mean_var", "bandwidth": 5},
     ]
     return params
2 changes: 1 addition & 1 deletion skchange/change_detectors/pelt.py
@@ -1,6 +1,6 @@
 """The pruned exact linear time (PELT) algorithm."""
 
-__author__ = ["mtveten"]
+__author__ = ["Tveten"]
 __all__ = ["Pelt"]
 
46 changes: 32 additions & 14 deletions skchange/change_detectors/seeded_binseg.py
@@ -1,6 +1,6 @@
 """Seeded binary segmentation algorithm for multiple changepoint detection."""
 
-__author__ = ["mtveten"]
+__author__ = ["Tveten"]
 __all__ = ["SeededBinarySegmentation"]
 
 from typing import Callable, Optional, Union
@@ -108,30 +108,48 @@ class SeededBinarySegmentation(ChangeDetector):
 
 Parameters
 ----------
-score: str, tuple[Callable, Callable], optional (default="mean")
+score: {"mean", "mean_var", "mean_cov"}, tuple[Callable, Callable], default="mean"
 Test statistic to use for changepoint detection.
-* If "mean", the difference-in-mean statistic is used,
-* If "var", the difference-in-variance statistic is used,
-* If a tuple, it must contain two functions: The first function is the scoring
-function, which takes in the output of the second function as its first
-argument, and start, end and split indices as the second, third and fourth
-arguments. The second function is the initializer, which precomputes quantities
-that should be precomputed. See skchange/scores/score_factory.py for examples.
-threshold_scale : float, optional (default=2.0)
+
+* "mean": The CUSUM statistic for a change in mean (this is equivalent to a
+likelihood ratio test for a change in the mean of Gaussian data). For
+multivariate data, the sum of the CUSUM statistics for each dimension is used.
+* "mean_var": The likelihood ratio test for a change in the mean and/or variance
+of Gaussian data. For multivariate data, the sum of the likelihood ratio
+statistics for each dimension is used.
+* "mean_cov": The likelihood ratio test for a change in the mean and/or
+covariance matrix of multivariate Gaussian data.
+* If a tuple, it must contain two numba jitted functions:
+
+1. The first function is the scoring function, which takes four arguments:
+
+1. The output of the second function.
+2. Start indices of the intervals to score for a change
+3. End indices of the intervals to score for a change
+4. Split indices of the intervals to score for a change.
+
+For each start, split and end, the score should be calculated for the
+data intervals [start:split] and [split+1:end], meaning that both the
+starts and ends are inclusive, while split is included in the left
+interval.
+2. The second function is the initializer, which takes the data matrix as
+input and returns precomputed quantities that may speed up the score
+calculations. If not relevant, just return the data matrix.
+threshold_scale : float, default=2.0
 Scaling factor for the threshold. The threshold is set to
 'threshold_scale * 2 * p * np.sqrt(np.log(n))', where 'n' is the sample size
 and 'p' is the number of variables. If None, the threshold is tuned on the data
 input to .fit().
-level : float, optional (default=0.01)
+level : float, default=0.01
 If `threshold_scale` is None, the threshold is set to the (1-`level`)-quantile
 of the changepoint scores of all the seeded intervals on the training data.
 For this to be correct, the training data must contain no changepoints.
-min_segment_length : int, optional (default=5)
+min_segment_length : int, default=5
 Minimum length between two changepoints. Must be greater than or equal to 1.
-max_interval_length : int (default=200)
+max_interval_length : int, default=200
 The maximum length of an interval to estimate a changepoint in. Must be greater
 than or equal to '2 * min_segment_length'.
-growth_factor : float (default = 1.5)
+growth_factor : float, default=1.5
 The growth factor for the seeded intervals. Intervals grow in size according to
 'interval_len=max(interval_len + 1, np.floor(growth_factor * interval_len))',
 starting at 'interval_len'='min_interval_length'. It also governs the amount
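The default threshold formula quoted in the binary segmentation docstrings, `threshold_scale * 2 * p * np.sqrt(np.log(n))`, can be sketched as a standalone helper (the function name is illustrative, not a skchange API):

```python
import numpy as np


def default_threshold(n: int, p: int, threshold_scale: float = 2.0) -> float:
    """Default detection threshold: threshold_scale * 2 * p * sqrt(log(n)),
    where n is the sample size and p the number of variables."""
    return threshold_scale * 2 * p * np.sqrt(np.log(n))
```

Note the threshold scales linearly in the number of variables `p` and only very slowly (as `sqrt(log n)`) in the sample size `n`.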
2 changes: 1 addition & 1 deletion skchange/change_detectors/tests/test_moscore.py
@@ -14,7 +14,7 @@ def test_moscore_changepoint(score):
     n_segments = 2
     seg_len = 50
     df = generate_teeth_data(
-        n_segments=n_segments, mean=10, segment_length=seg_len, p=1, random_state=2
+        n_segments=n_segments, mean=15, segment_length=seg_len, p=1, random_state=2
     )
     detector = Moscore(score)
     changepoints = detector.fit_predict(df)
2 changes: 2 additions & 0 deletions skchange/costs/cost_factory.py
@@ -14,6 +14,8 @@
 
 """
 
+__author__ = ["Tveten"]
+
 from skchange.costs.mean_cost import init_mean_cost, mean_cost
 
 VALID_COSTS = ["mean"]
2 changes: 2 additions & 0 deletions skchange/costs/mean_cost.py
@@ -1,5 +1,7 @@
 """Gaussian mean likelihood cost function for change point detection."""
 
+__author__ = ["Tveten"]
+
 import numpy as np
 from numba import njit
 
2 changes: 2 additions & 0 deletions skchange/costs/mean_saving.py
@@ -1,5 +1,7 @@
 """Mean saving for CAPA type anomaly detection."""
 
+__author__ = ["Tveten"]
+
 import numpy as np
 from numba import njit
 
2 changes: 2 additions & 0 deletions skchange/costs/saving_factory.py
@@ -16,6 +16,8 @@
 
 """
 
+__author__ = ["Tveten"]
+
 from skchange.costs.mean_saving import init_mean_saving, mean_saving
 
 VALID_SAVINGS = ["mean"]
2 changes: 2 additions & 0 deletions skchange/datasets/generate.py
@@ -1,5 +1,7 @@
 """Data generators."""
 
+__author__ = ["Tveten"]
+
 from numbers import Number
 from typing import Union
 