diff --git a/src/insight/metrics/__init__.py b/src/insight/metrics/__init__.py
index a1fc7dd8..14abdf1f 100644
--- a/src/insight/metrics/__init__.py
+++ b/src/insight/metrics/__init__.py
@@ -7,6 +7,7 @@
     HellingerDistance,
     JensenShannonDivergence,
     KendallTauCorrelation,
+    KolmogorovSmirnovDistance,
     KullbackLeiblerDivergence,
     Mean,
     Norm,
@@ -16,23 +17,24 @@
 from .metrics_usage import CorrMatrix, DiffCorrMatrix, OneColumnMap, TwoColumnMap
 
 __all__ = [
-    "OneColumnMetric",
-    "TwoColumnMetric",
-    "OneColumnMap",
-    "TwoColumnMap",
+    "BhattacharyyaCoefficient",
     "CorrMatrix",
-    "DiffCorrMatrix",
     "CramersV",
+    "DiffCorrMatrix",
     "EarthMoversDistance",
-    "Mean",
-    "StandardDeviation",
-    "KendallTauCorrelation",
-    "Norm",
-    "TwoDataFrameMetric",
     "EarthMoversDistanceBinned",
+    "HellingerDistance",
     "JensenShannonDivergence",
+    "KendallTauCorrelation",
+    "KolmogorovSmirnovDistance",
     "KullbackLeiblerDivergence",
-    "HellingerDistance",
-    "BhattacharyyaCoefficient",
+    "Mean",
+    "Norm",
+    "OneColumnMap",
+    "OneColumnMetric",
+    "StandardDeviation",
     "TotalVariationDistance",
+    "TwoColumnMap",
+    "TwoColumnMetric",
+    "TwoDataFrameMetric",
 ]
diff --git a/src/insight/metrics/metrics.py b/src/insight/metrics/metrics.py
index 3de3a633..e68f1f57 100644
--- a/src/insight/metrics/metrics.py
+++ b/src/insight/metrics/metrics.py
@@ -4,7 +4,7 @@
 import numpy as np
 import pandas as pd
 from scipy.spatial.distance import jensenshannon
-from scipy.stats import entropy, wasserstein_distance
+from scipy.stats import entropy, ks_2samp, wasserstein_distance
 
 from ..check import Check, ColumnCheck
 from .base import OneColumnMetric, TwoColumnMetric
@@ -489,3 +489,35 @@ def check_column_types(
     def _compute_metric(self, sr_a: pd.Series, sr_b: pd.Series):
         (p, q) = zipped_hist((sr_a, sr_b), check=self.check)
         return np.linalg.norm(ty.cast(pd.Series, p) - ty.cast(pd.Series, q), ord=1) / 2
+
+
+class KolmogorovSmirnovDistance(TwoColumnMetric):
+    """Kolmogorov-Smirnov distance between two probability distributions.
+
+    The statistic ranges from 0 to 1, where a value of 0 indicates the two variables follow identical
+    distributions, and a value of 1 indicates their samples do not overlap at all.
+    """
+
+    name = "kolmogorov_smirnov_distance"
+
+    @classmethod
+    def check_column_types(
+        cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = ColumnCheck()
+    ) -> bool:
+        if check.continuous(sr_a) and check.continuous(sr_b):
+            return True
+        if check.categorical(sr_a) and check.categorical(sr_b):
+            return True
+        return False
+
+    def _compute_metric(self, sr_a: pd.Series, sr_b: pd.Series) -> float:
+        """Calculate the metric.
+        Args:
+            sr_a (pd.Series): values of a variable.
+            sr_b (pd.Series): values of another variable to compare.
+        Returns:
+            The Kolmogorov-Smirnov distance between sr_a and sr_b.
+ """ + if sr_a.empty or sr_b.empty: + return 1.0 + return ks_2samp(sr_a, sr_b)[0] # The first element is the KS statistic diff --git a/tests/test_metrics/test_metrics.py b/tests/test_metrics/test_metrics.py index 70fccfa1..4bedf2ab 100644 --- a/tests/test_metrics/test_metrics.py +++ b/tests/test_metrics/test_metrics.py @@ -13,6 +13,7 @@ HellingerDistance, JensenShannonDivergence, KendallTauCorrelation, + KolmogorovSmirnovDistance, KullbackLeiblerDivergence, Mean, Norm, @@ -27,6 +28,7 @@ emd = EarthMoversDistance() hellinger_distance = HellingerDistance() kl_divergence = KullbackLeiblerDivergence() +kolmogorov_smirnov_distance = KolmogorovSmirnovDistance() js_divergence = JensenShannonDivergence() norm = Norm() norm_ord1 = Norm(ord=1) @@ -286,6 +288,41 @@ def infer_dtype(self, sr: pd.Series) -> pd.Series: assert abs(kl_divergence_with_custom_check(sr_g, sr_h) - 0.3) < 0.01 +def test_kolmogorov_smirnov_distance(group1): + # Test with identical distributions + assert kolmogorov_smirnov_distance(pd.Series([1, 2, 3]), pd.Series([1, 2, 3])) == 0 + assert kolmogorov_smirnov_distance(group1, group1) == 0 + + # Test with distributions that are completely different + assert kolmogorov_smirnov_distance(pd.Series([1, 1, 1]), pd.Series([2, 2, 2])) == 1 + + # Test with distributions that are slightly different + assert 0 < kolmogorov_smirnov_distance(pd.Series([1, 2, 3]), pd.Series([1, 2, 4])) < 1 + + # Test with random distributions + np.random.seed(0) + group2 = pd.Series(np.random.normal(0, 1, 1000)) + group3 = pd.Series(np.random.normal(0.5, 1, 1000)) + assert 0 < kolmogorov_smirnov_distance(group2, group3) < 1 + + # Test with distributions of different lengths + assert 0 < kolmogorov_smirnov_distance(pd.Series([1, 2, 3]), pd.Series([1, 2, 3, 4])) < 1 + + # Test with categorical data + cat1 = pd.Series(["a", "b", "c", "a"]) + cat2 = pd.Series(["b", "c", "d"]) + assert 0 < kolmogorov_smirnov_distance(cat1, cat2) < 1 + + # Edge cases + # Test with one or both series empty + assert kolmogorov_smirnov_distance(pd.Series([]), pd.Series([1, 2, 3])) == 1 + assert kolmogorov_smirnov_distance(pd.Series([1, 2, 3]), pd.Series([])) == 1 + assert kolmogorov_smirnov_distance(pd.Series([]), pd.Series([])) == 1 + + # Test with series containing NaN values + assert 0 <= kolmogorov_smirnov_distance(pd.Series([1, np.nan, 3]), pd.Series([1, 2, 3])) <= 1 + + def test_js_divergence(group1, group2, group3): assert js_divergence(pd.Series([1, 0]), pd.Series([1, 0])) == 0 diff --git a/tests/test_metrics/test_metrics_usage.py b/tests/test_metrics/test_metrics_usage.py index ab695c38..fd02313a 100644 --- a/tests/test_metrics/test_metrics_usage.py +++ b/tests/test_metrics/test_metrics_usage.py @@ -3,7 +3,14 @@ import pytest from insight.check import ColumnCheck -from insight.metrics import CorrMatrix, CramersV, DiffCorrMatrix, EarthMoversDistance, TwoColumnMap +from insight.metrics import ( + CorrMatrix, + CramersV, + DiffCorrMatrix, + EarthMoversDistance, + KolmogorovSmirnovDistance, + TwoColumnMap, +) @pytest.fixture(scope="module") @@ -44,6 +51,22 @@ def test_two_column_map(data): assert all(np.isnan(emd_map_df["metric_val"][cont]) for cont in continuous_cols) +def test_two_column_map_with_ksd(data): + df, categorical_cols, continuous_cols = data[0], data[1], data[2] + df1 = df.sample(1000).reset_index(drop=True) + df2 = df.sample(1000).reset_index(drop=True) + + ksd = KolmogorovSmirnovDistance() + + col_map = TwoColumnMap(ksd) + ksd_map_df = col_map(df1, df2) + assert col_map.name == f"{str(ksd)}_map" + + assert 
+    assert all(not np.isnan(ksd_map_df["metric_val"][cat]) for cat in categorical_cols)
+    assert all(not np.isnan(ksd_map_df["metric_val"][cont]) for cont in continuous_cols)
+
+
 def test_metric_matrix(data):
     df, categorical_cols, continuous_cols = data[0], data[1], data[2]
     df1 = df.sample(1000).reset_index(drop=True)
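
Usage sketch: a minimal example of the new metric on continuous data, assuming the insight
package from this diff is importable and following the call convention used in the tests
above. The 0.5-sigma shift is an illustrative choice: for populations N(0, 1) and N(0.5, 1)
the population KS statistic is 2 * Phi(0.25) - 1, roughly 0.2, so the sample estimate
should land nearby.

    import numpy as np
    import pandas as pd

    from insight.metrics import KolmogorovSmirnovDistance

    ksd = KolmogorovSmirnovDistance()

    # Identical samples: the empirical CDFs coincide, so the distance is 0.
    sr = pd.Series([1.0, 2.0, 3.0])
    assert ksd(sr, sr) == 0

    # Shifted normals: the statistic is the largest vertical gap between the
    # two empirical CDFs, so it falls strictly between 0 and 1 here.
    rng = np.random.default_rng(0)
    sr_a = pd.Series(rng.normal(0.0, 1.0, 1000))
    sr_b = pd.Series(rng.normal(0.5, 1.0, 1000))
    print(ksd(sr_a, sr_b))  # expected to be close to 0.2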