Skip to content

Commit

Permalink
Add Kendall tau metric (#115)
Browse files Browse the repository at this point in the history
* add kendall tau

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* check that both series are ordinal and of equal length in check_column_types; use category codes for ordered categorical series

* replace print with assignment

* reverse order of categories to make test more obvious

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
tomcarter23 and pre-commit-ci[bot] authored Mar 24, 2023
1 parent 03fe2e9 commit d619081
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 2 deletions.
3 changes: 2 additions & 1 deletion src/insight/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
EarthMoversDistanceBinned,
HellingerDistance,
JensenShannonDivergence,
KendallTauCorrelation,
KullbackLeiblerDivergence,
Mean,
Norm,
Expand All @@ -15,6 +16,6 @@
from .metrics_usage import CorrMatrix, DiffCorrMatrix, OneColumnMap, TwoColumnMap

__all__ = ['OneColumnMetric', 'TwoColumnMetric', 'OneColumnMap', 'TwoColumnMap', 'CorrMatrix', 'DiffCorrMatrix',
'CramersV', 'EarthMoversDistance', 'Mean', 'StandardDeviation', 'Norm', 'TwoDataFrameMetric',
'CramersV', 'EarthMoversDistance', 'Mean', 'StandardDeviation', 'KendallTauCorrelation', 'Norm', 'TwoDataFrameMetric',
'EarthMoversDistanceBinned', 'JensenShannonDivergence', 'KullbackLeiblerDivergence', 'HellingerDistance',
'BhattacharyyaCoefficient', 'TotalVariationDistance']
41 changes: 40 additions & 1 deletion src/insight/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
import pandas as pd
from scipy.spatial.distance import jensenshannon
from scipy.stats import entropy, wasserstein_distance
from scipy.stats import entropy, kendalltau, wasserstein_distance

from ..check import Check, ColumnCheck
from .base import OneColumnMetric, TwoColumnMetric
Expand Down Expand Up @@ -55,6 +55,45 @@ def _compute_metric(self, sr: pd.Series):
return s * np.array(1, dtype=d.dtype)


class KendallTauCorrelation(TwoColumnMetric):
    """Kendall's Tau correlation coefficient between ordinal variables.

    The statistic ranges from -1 to 1, indicating the strength and direction of the relationship between the
    two variables.
    """

    name = "kendall_tau_correlation"

    @classmethod
    def check_column_types(cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = ColumnCheck()):
        """Tell whether the metric can be computed for the given pair of columns.

        Args:
            sr_a (pd.Series): first column.
            sr_b (pd.Series): second column.
            check (Check): object whose ``ordinal`` method decides whether a column is ordinal.

        Returns:
            True when both series have the same length and both are ordinal according to ``check``,
            False otherwise.
        """
        # Kendall's Tau is computed over paired observations, so the lengths must agree.
        if len(sr_a) != len(sr_b):
            return False
        if not check.ordinal(sr_a) or not check.ordinal(sr_b):
            return False
        return True

    def _compute_metric(self, sr_a: pd.Series, sr_b: pd.Series):
        """Calculate the metric.

        Ordered categorical series are replaced by their integer category codes so that the category
        order, rather than the raw labels, drives the ranking. Missing observations are omitted.

        Args:
            sr_a (pd.Series): values of an ordinal variable.
            sr_b (pd.Series): values of another ordinal variable to assess association.

        Returns:
            The Kendall Tau coefficient between sr_a and sr_b, as returned by ``scipy.stats.kendalltau``.
        """
        if hasattr(sr_a, "cat") and sr_a.cat.ordered:
            codes_a = sr_a.cat.codes
            # pandas encodes a missing categorical value as code -1. Mask it to NaN so that
            # nan_policy="omit" drops the pair instead of ranking it below every real category.
            sr_a = codes_a.where(codes_a >= 0)

        if hasattr(sr_b, "cat") and sr_b.cat.ordered:
            codes_b = sr_b.cat.codes
            # Same masking of the -1 "missing" code as for sr_a.
            sr_b = codes_b.where(codes_b >= 0)

        corr, _ = kendalltau(sr_a.values, sr_b.values, nan_policy="omit")

        return corr


class CramersV(TwoColumnMetric):
"""Cramér's V correlation coefficient between nominal variables.
Expand Down
25 changes: 25 additions & 0 deletions tests/test_metrics/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
EarthMoversDistanceBinned,
HellingerDistance,
JensenShannonDivergence,
KendallTauCorrelation,
KullbackLeiblerDivergence,
Mean,
Norm,
Expand All @@ -21,6 +22,7 @@

mean = Mean()
std_dev = StandardDeviation()
kendall_tau = KendallTauCorrelation()
cramers_v = CramersV()
emd = EarthMoversDistance()
hellinger_distance = HellingerDistance()
Expand Down Expand Up @@ -91,6 +93,9 @@ def test_base_to_dict():
dict_mean = mean.to_dict()
assert dict_mean['name'] == 'mean'

dict_kendalltau = kendall_tau.to_dict()
assert dict_kendalltau['name'] == 'kendall_tau_correlation'

dict_cramers_v = cramers_v.to_dict()
assert dict_cramers_v['name'] == 'cramers_v'

Expand Down Expand Up @@ -118,6 +123,10 @@ def test_base_from_dict():
new_mean = Mean.from_dict(dict_mean)
assert isinstance(new_mean, Mean)

dict_kendall_tau = {'name': 'kendall_tau_correlation'}
new_kendall_tau = KendallTauCorrelation.from_dict(dict_kendall_tau)
assert isinstance(new_kendall_tau, KendallTauCorrelation)

dict_cramers_v = {'name': 'cramers_v'}
new_cramers_v = CramersV.from_dict(dict_cramers_v)
assert isinstance(new_cramers_v, CramersV)
Expand Down Expand Up @@ -186,6 +195,22 @@ def test_em_distance():
assert emd(sr_b, sr_b) is not None


def test_kt_correlation():
    """Exercise KendallTauCorrelation on continuous, mismatched and ordered-categorical columns."""
    # Continuous samples: a long one, a short one, and a noisy copy of the short one.
    long_sample = pd.Series(np.random.normal(0, 1, 100), name='a')
    short_sample = pd.Series(np.random.normal(0, 1, 5), name='b')
    noisy_sample = pd.Series(short_sample.values + np.random.normal(0, 0.8, 5), name='c')

    # Four string labels: a different length from the five-element samples above.
    labels = pd.Series(['a', 'b', 'c', 'd'], name='d')

    # Two ordered categoricals whose labels differ but whose category order lines up
    # position by position (a<b<c versus f<e<d).
    ascending_dtype = pd.CategoricalDtype(categories=list("abc"), ordered=True)
    reversed_dtype = pd.CategoricalDtype(categories=list("fed"), ordered=True)
    ordered_abc = pd.Series(list("abbccc"), dtype=ascending_dtype)
    ordered_fed = pd.Series(list("feeddd"), dtype=reversed_dtype)

    metric = KendallTauCorrelation()

    # Valid pairs must yield a value.
    assert metric(long_sample, long_sample) is not None
    assert metric(short_sample, noisy_sample) is not None

    # The float/string pair is expected to be rejected.
    assert metric(noisy_sample, labels) is None

    # Identically ordered categoricals are perfectly concordant.
    assert metric(ordered_abc, ordered_fed) == 1.0


def test_cramers_v_basic():
sr_a = pd.Series([1, 2, 3, 1, 2, 3, 1, 2, 3] * 100, name='a')
sr_b = pd.Series([1, 2, 3, 2, 3, 1, 3, 1, 2] * 100, name='b')
Expand Down

0 comments on commit d619081

Please sign in to comment.