Skip to content

Commit

Permalink
Categorical PSI (#1040)
Browse files Browse the repository at this point in the history
* Categorical PSI  (#1039)

* fix bug

* reformatting pre-commit

* clean up and remove try/except

* pre-commit fix

* typo fix

* update version tag
  • Loading branch information
taylorfturner authored Sep 25, 2023
1 parent 02b7070 commit 9bca4e7
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 30 deletions.
56 changes: 40 additions & 16 deletions dataprofiler/profilers/categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,20 @@
from __future__ import annotations

import math
import warnings
from collections import defaultdict
from operator import itemgetter
from typing import cast

import datasketches
from pandas import DataFrame, Series

from .. import dp_logging
from . import profiler_utils
from .base_column_profilers import BaseColumnProfiler
from .profiler_options import CategoricalOptions

logger = dp_logging.get_child_logger(__name__)


class CategoricalColumn(BaseColumnProfiler["CategoricalColumn"]):
"""
Expand Down Expand Up @@ -306,24 +308,27 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
other_profile._categories.items(), key=itemgetter(1), reverse=True
)
)
if cat_count1.keys() == cat_count2.keys():
total_psi = 0.0
for key in cat_count1.keys():
perc_A = cat_count1[key] / self.sample_size
perc_B = cat_count2[key] / other_profile.sample_size
total_psi += (perc_B - perc_A) * math.log(perc_B / perc_A)
differences["statistics"]["psi"] = total_psi
else:
warnings.warn(
"psi was not calculated due to the differences in categories "
"of the profiles. Differences:\n"
f"{set(cat_count1.keys()) ^ set(cat_count2.keys())}",
RuntimeWarning,
)
(
self_cat_count,
other_cat_count,
) = self._preprocess_for_categorical_psi_calculation(
self_cat_count=cat_count1,
other_cat_count=cat_count2,
)

total_psi = 0.0
for iter_key in self_cat_count.keys():
percent_self = self_cat_count[iter_key] / self.sample_size
percent_other = other_cat_count[iter_key] / other_profile.sample_size
if (percent_other != 0) and (percent_self != 0):
total_psi += (percent_other - percent_self) * math.log(
percent_other / percent_self
)
differences["statistics"]["psi"] = total_psi

differences["statistics"][
"categorical_count"
] = profiler_utils.find_diff_of_dicts(cat_count1, cat_count2)
] = profiler_utils.find_diff_of_dicts(self_cat_count, other_cat_count)

return differences

Expand Down Expand Up @@ -431,6 +436,25 @@ def is_match(self) -> bool:
is_match = True
return is_match

def _preprocess_for_categorical_psi_calculation(
self, self_cat_count, other_cat_count
):
super_set_categories = set(self_cat_count.keys()) | set(other_cat_count.keys())
if self_cat_count.keys() != other_cat_count.keys():
logger.info(
f"""PSI data pre-processing found that categories between
the profiles were not equal. Both profiles do not contain
the following categories {super_set_categories}."""
)

for iter_key in super_set_categories:
for iter_dictionary in [self_cat_count, other_cat_count]:
try:
iter_dictionary[iter_key] = iter_dictionary[iter_key]
except KeyError:
iter_dictionary[iter_key] = 0
return self_cat_count, other_cat_count

def _check_stop_condition_is_met(self, sample_size: int, unqiue_ratio: float):
"""Return boolean given stop conditions.
Expand Down
17 changes: 5 additions & 12 deletions dataprofiler/tests/profilers/test_categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -701,6 +701,7 @@ def test_gini_impurity(self):
self.assertEqual(profile.gini_impurity, None)

def test_categorical_diff(self):
# test psi new category in another profile
df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n"])
profile = CategoricalColumn(df_categorical.name)
profile.update(df_categorical)
Expand All @@ -720,21 +721,17 @@ def test_categorical_diff(self):
"categories": [[], ["y", "n"], ["maybe"]],
"gini_impurity": -0.16326530612244894,
"unalikeability": -0.19047619047619047,
"categorical_count": {"y": 1, "n": 1, "maybe": [None, 2]},
"categorical_count": {"y": 1, "n": 1, "maybe": -2},
"chi2-test": {
"chi2-statistic": 82 / 35,
"df": 2,
"p-value": 0.3099238764710244,
},
"psi": 0.0990210257942779,
},
}
with self.assertWarnsRegex(
RuntimeWarning,
"psi was not calculated due to the differences in categories "
"of the profiles. Differences:\n{'maybe'}",
):
test_profile_diff = profile.diff(profile2)
self.assertDictEqual(expected_diff, test_profile_diff)
actual_diff = profile.diff(profile2)
self.assertDictEqual(expected_diff, actual_diff)

# Test with one categorical column matching
df_not_categorical = pd.Series(
Expand Down Expand Up @@ -770,10 +767,6 @@ def test_categorical_diff(self):
profile2 = CategoricalColumn(df_categorical.name)
profile2.update(df_categorical)

# chi2-statistic = sum((observed-expected)^2/expected for each category in each column)
# df = categories - 1
# psi = (% of records based on Sample (A) - % of records Sample (B)) * ln(A/ B)
# p-value found through using chi2 CDF
expected_diff = {
"categorical": "unchanged",
"statistics": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -500,12 +500,13 @@ def test_column_stats_profile_compiler_stats_diff(self):
"categories": [["1"], ["9"], ["10"]],
"gini_impurity": 0.06944444444444448,
"unalikeability": 0.16666666666666663,
"categorical_count": {"9": -1, "1": [1, None], "10": [None, 1]},
"categorical_count": {"9": -1, "1": 1, "10": -1},
"chi2-test": {
"chi2-statistic": 2.1,
"df": 2,
"p-value": 0.3499377491111554,
},
"psi": 0.009815252971365292,
},
}
self.assertDictEqual(expected_diff, compiler1.diff(compiler2))
Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

MAJOR = 0
MINOR = 10
MICRO = 4
MICRO = 5
POST = None # otherwise None

VERSION = "%d.%d.%d" % (MAJOR, MINOR, MICRO)
Expand Down

0 comments on commit 9bca4e7

Please sign in to comment.