Skip to content

Commit

Permalink
added psi calculation to categorical columns (#1027)
Browse files Browse the repository at this point in the history
* added psi calculation to categorical columns

* Changed test value to non-calculated assignment
  • Loading branch information
ksneab7 authored Sep 20, 2023
1 parent ff14ab3 commit de664f9
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 2 deletions.
16 changes: 16 additions & 0 deletions dataprofiler/profilers/categorical_column_profile.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""Contains class for categorical column profiler."""
from __future__ import annotations

import math
import warnings
from collections import defaultdict
from operator import itemgetter
from typing import cast
Expand Down Expand Up @@ -304,6 +306,20 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
other_profile._categories.items(), key=itemgetter(1), reverse=True
)
)
if cat_count1.keys() == cat_count2.keys():
total_psi = 0.0
for key in cat_count1.keys():
perc_A = cat_count1[key] / self.sample_size
perc_B = cat_count2[key] / other_profile.sample_size
total_psi += (perc_B - perc_A) * math.log(perc_B / perc_A)
differences["statistics"]["psi"] = total_psi
else:
warnings.warn(
"psi was not calculated due to the differences in categories "
"of the profiles. Differences:\n"
f"{set(cat_count1.keys()) ^ set(cat_count2.keys())}",
RuntimeWarning,
)

differences["statistics"][
"categorical_count"
Expand Down
41 changes: 39 additions & 2 deletions dataprofiler/tests/profilers/test_categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -728,8 +728,13 @@ def test_categorical_diff(self):
},
},
}

self.assertDictEqual(expected_diff, profile.diff(profile2))
with self.assertWarnsRegex(
RuntimeWarning,
"psi was not calculated due to the differences in categories "
"of the profiles. Differences:\n{'maybe'}",
):
test_profile_diff = profile.diff(profile2)
self.assertDictEqual(expected_diff, test_profile_diff)

# Test with one categorical column matching
df_not_categorical = pd.Series(
Expand All @@ -756,6 +761,38 @@ def test_categorical_diff(self):
}
self.assertDictEqual(expected_diff, profile.diff(profile2))

# Test diff with psi enabled
df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n", "maybe"])
profile = CategoricalColumn(df_categorical.name)
profile.update(df_categorical)

df_categorical = pd.Series(["y", "maybe", "y", "y", "n", "n", "maybe"])
profile2 = CategoricalColumn(df_categorical.name)
profile2.update(df_categorical)

# chi2-statistic = sum((observed-expected)^2/expected for each category in each column)
# df = categories - 1
# psi = sum over categories of
#       (% of records in sample A - % of records in sample B) * ln(% in A / % in B)
# p-value found using the chi2 CDF
expected_diff = {
"categorical": "unchanged",
"statistics": {
"unique_count": "unchanged",
"unique_ratio": -0.05357142857142855,
"chi2-test": {
"chi2-statistic": 0.6122448979591839,
"df": 2,
"p-value": 0.7362964551863367,
},
"categories": "unchanged",
"gini_impurity": -0.059311224489795866,
"unalikeability": -0.08333333333333326,
"psi": 0.16814961527477595,
"categorical_count": {"y": 1, "n": 1, "maybe": -1},
},
}
self.assertDictEqual(expected_diff, profile.diff(profile2))

def test_unalikeability(self):
df_categorical = pd.Series(["a", "a"])
profile = CategoricalColumn(df_categorical.name)
Expand Down

0 comments on commit de664f9

Please sign in to comment.