Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added KSD #156

Merged
merged 2 commits into from
Nov 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 14 additions & 12 deletions src/insight/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
HellingerDistance,
JensenShannonDivergence,
KendallTauCorrelation,
KolmogorovSmirnovDistance,
KullbackLeiblerDivergence,
Mean,
Norm,
Expand All @@ -16,23 +17,24 @@
from .metrics_usage import CorrMatrix, DiffCorrMatrix, OneColumnMap, TwoColumnMap

__all__ = [
"OneColumnMetric",
"TwoColumnMetric",
"OneColumnMap",
"TwoColumnMap",
"BhattacharyyaCoefficient",
"CorrMatrix",
"DiffCorrMatrix",
"CramersV",
"DiffCorrMatrix",
"EarthMoversDistance",
"Mean",
"StandardDeviation",
"KendallTauCorrelation",
"Norm",
"TwoDataFrameMetric",
"EarthMoversDistanceBinned",
"HellingerDistance",
"JensenShannonDivergence",
"KendallTauCorrelation",
"KolmogorovSmirnovDistance",
"KullbackLeiblerDivergence",
"HellingerDistance",
"BhattacharyyaCoefficient",
"Mean",
"Norm",
"OneColumnMap",
"OneColumnMetric",
"StandardDeviation",
"TotalVariationDistance",
"TwoColumnMap",
"TwoColumnMetric",
"TwoDataFrameMetric",
]
34 changes: 33 additions & 1 deletion src/insight/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
import pandas as pd
from scipy.spatial.distance import jensenshannon
from scipy.stats import entropy, wasserstein_distance
from scipy.stats import entropy, ks_2samp, wasserstein_distance

from ..check import Check, ColumnCheck
from .base import OneColumnMetric, TwoColumnMetric
Expand Down Expand Up @@ -489,3 +489,35 @@ def check_column_types(
def _compute_metric(self, sr_a: pd.Series, sr_b: pd.Series):
    """Compute the distance between the two series' distributions.

    Args:
        sr_a (pd.Series): values of a variable.
        sr_b (pd.Series): values of another variable to compare.

    Returns:
        Half the L1 distance between the two histograms produced by
        ``zipped_hist`` (presumably normalized over a shared support —
        if so, this is the total variation distance; confirm against
        ``zipped_hist``'s contract).
    """
    # Histogram both series over a common binning so the vectors align.
    (p, q) = zipped_hist((sr_a, sr_b), check=self.check)
    # L1 norm of the difference, halved to land in [0, 1].
    return np.linalg.norm(ty.cast(pd.Series, p) - ty.cast(pd.Series, q), ord=1) / 2


class KolmogorovSmirnovDistance(TwoColumnMetric):
    """Kolmogorov-Smirnov Distance between two probability distributions.

    The statistic ranges from 0 to 1, where a value of 0 indicates the two variables follow identical distributions,
    and a value of 1 indicates they follow completely different distributions.
    """

    name = "kolmogorov_smirnov_distance"

    @classmethod
    def check_column_types(
        cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = ColumnCheck()
    ) -> bool:
        """Return True when both series are continuous or both are categorical."""
        if check.continuous(sr_a) and check.continuous(sr_b):
            return True
        if check.categorical(sr_a) and check.categorical(sr_b):
            return True
        return False

    def _compute_metric(self, sr_a: pd.Series, sr_b: pd.Series) -> float:
        """Calculate the metric.

        Args:
            sr_a (pd.Series): values of a variable.
            sr_b (pd.Series): values of another variable to compare.

        Returns:
            The Kolmogorov-Smirnov distance between sr_a and sr_b.
        """
        # ks_2samp's behaviour is undefined in the presence of NaNs, so drop
        # them first; an empty (or all-NaN) sample has no empirical CDF and is
        # treated as maximally distant.
        sr_a = sr_a.dropna()
        sr_b = sr_b.dropna()
        if sr_a.empty or sr_b.empty:
            return 1.0
        # ks_2samp returns (statistic, pvalue); coerce the statistic to a
        # plain float rather than leaking a numpy scalar to callers.
        return float(ks_2samp(sr_a, sr_b)[0])
37 changes: 37 additions & 0 deletions tests/test_metrics/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
HellingerDistance,
JensenShannonDivergence,
KendallTauCorrelation,
KolmogorovSmirnovDistance,
KullbackLeiblerDivergence,
Mean,
Norm,
Expand All @@ -27,6 +28,7 @@
emd = EarthMoversDistance()
hellinger_distance = HellingerDistance()
kl_divergence = KullbackLeiblerDivergence()
kolmogorov_smirnov_distance = KolmogorovSmirnovDistance()
js_divergence = JensenShannonDivergence()
norm = Norm()
norm_ord1 = Norm(ord=1)
Expand Down Expand Up @@ -286,6 +288,41 @@ def infer_dtype(self, sr: pd.Series) -> pd.Series:
assert abs(kl_divergence_with_custom_check(sr_g, sr_h) - 0.3) < 0.01


def test_kolmogorov_smirnov_distance(group1):
    """KS distance: 0 for identical samples, 1 for disjoint ones, in between otherwise."""
    # Identical distributions yield zero distance.
    assert kolmogorov_smirnov_distance(pd.Series([1, 2, 3]), pd.Series([1, 2, 3])) == 0
    assert kolmogorov_smirnov_distance(group1, group1) == 0

    # Disjoint supports yield the maximal distance.
    assert kolmogorov_smirnov_distance(pd.Series([1, 1, 1]), pd.Series([2, 2, 2])) == 1

    # A single differing value lands strictly between the extremes.
    assert 0 < kolmogorov_smirnov_distance(pd.Series([1, 2, 3]), pd.Series([1, 2, 4])) < 1

    # Overlapping normal samples with shifted means.
    np.random.seed(0)
    sample_a = pd.Series(np.random.normal(0, 1, 1000))
    sample_b = pd.Series(np.random.normal(0.5, 1, 1000))
    assert 0 < kolmogorov_smirnov_distance(sample_a, sample_b) < 1

    # Samples of unequal length are supported.
    assert 0 < kolmogorov_smirnov_distance(pd.Series([1, 2, 3]), pd.Series([1, 2, 3, 4])) < 1

    # Categorical (string) data is supported as well.
    letters_a = pd.Series(["a", "b", "c", "a"])
    letters_b = pd.Series(["b", "c", "d"])
    assert 0 < kolmogorov_smirnov_distance(letters_a, letters_b) < 1

    # Edge case: an empty series on either (or both) sides means maximal distance.
    assert kolmogorov_smirnov_distance(pd.Series([]), pd.Series([1, 2, 3])) == 1
    assert kolmogorov_smirnov_distance(pd.Series([1, 2, 3]), pd.Series([])) == 1
    assert kolmogorov_smirnov_distance(pd.Series([]), pd.Series([])) == 1

    # Edge case: NaNs must not push the statistic outside [0, 1].
    assert 0 <= kolmogorov_smirnov_distance(pd.Series([1, np.nan, 3]), pd.Series([1, 2, 3])) <= 1

def test_js_divergence(group1, group2, group3):
assert js_divergence(pd.Series([1, 0]), pd.Series([1, 0])) == 0

Expand Down
25 changes: 24 additions & 1 deletion tests/test_metrics/test_metrics_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,14 @@
import pytest

from insight.check import ColumnCheck
from insight.metrics import CorrMatrix, CramersV, DiffCorrMatrix, EarthMoversDistance, TwoColumnMap
from insight.metrics import (
CorrMatrix,
CramersV,
DiffCorrMatrix,
EarthMoversDistance,
KolmogorovSmirnovDistance,
TwoColumnMap,
)


@pytest.fixture(scope="module")
Expand Down Expand Up @@ -44,6 +51,22 @@ def test_two_column_map(data):
assert all(np.isnan(emd_map_df["metric_val"][cont]) for cont in continuous_cols)


def test_two_column_map_with_ksd(data):
    """A TwoColumnMap built on KS distance produces a value for every column type."""
    df, categorical_cols, continuous_cols = data[0], data[1], data[2]
    left = df.sample(1000).reset_index(drop=True)
    right = df.sample(1000).reset_index(drop=True)

    ksd = KolmogorovSmirnovDistance()
    ksd_map = TwoColumnMap(ksd)
    result = ksd_map(left, right)

    assert ksd_map.name == f"{str(ksd)}_map"
    assert set(result.columns.to_list()) == {"metric_val"}
    # KS distance applies to categorical and continuous columns alike,
    # so no column should come back NaN.
    assert all(not np.isnan(result["metric_val"][col]) for col in categorical_cols)
    assert all(not np.isnan(result["metric_val"][col]) for col in continuous_cols)


def test_metric_matrix(data):
df, categorical_cols, continuous_cols = data[0], data[1], data[2]
df1 = df.sample(1000).reset_index(drop=True)
Expand Down
Loading