diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index 1376cc38e..c85b195a1 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -1,6 +1,8 @@ """Contains class for categorical column profiler.""" from __future__ import annotations +import math +import warnings from collections import defaultdict from operator import itemgetter from typing import cast @@ -304,6 +306,20 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: other_profile._categories.items(), key=itemgetter(1), reverse=True ) ) + if cat_count1.keys() == cat_count2.keys(): + total_psi = 0.0 + for key in cat_count1.keys(): + perc_A = cat_count1[key] / self.sample_size + perc_B = cat_count2[key] / other_profile.sample_size + total_psi += (perc_B - perc_A) * math.log(perc_B / perc_A) + differences["statistics"]["psi"] = total_psi + else: + warnings.warn( + "psi was not calculated due to the differences in categories " + "of the profiles. Differences:\n" + f"{set(cat_count1.keys()) ^ set(cat_count2.keys())}", + RuntimeWarning, + ) differences["statistics"][ "categorical_count" diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py index 10be10c58..5bdbbb83c 100644 --- a/dataprofiler/tests/profilers/test_categorical_column_profile.py +++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py @@ -728,8 +728,13 @@ def test_categorical_diff(self): }, }, } - - self.assertDictEqual(expected_diff, profile.diff(profile2)) + with self.assertWarnsRegex( + RuntimeWarning, + "psi was not calculated due to the differences in categories " + "of the profiles. Differences:\n{'maybe'}", + ): + test_profile_diff = profile.diff(profile2) + self.assertDictEqual(expected_diff, test_profile_diff) # Test with one categorical column matching df_not_categorical = pd.Series( @@ -756,6 +761,38 @@ def test_categorical_diff(self): } self.assertDictEqual(expected_diff, profile.diff(profile2)) + # Test diff with psi enabled + df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n", "maybe"]) + profile = CategoricalColumn(df_categorical.name) + profile.update(df_categorical) + + df_categorical = pd.Series(["y", "maybe", "y", "y", "n", "n", "maybe"]) + profile2 = CategoricalColumn(df_categorical.name) + profile2.update(df_categorical) + + # chi2-statistic = sum((observed-expected)^2/expected for each category in each column) + # df = categories - 1 + # psi = (% of records based on Sample (A) - % of records Sample (B)) * ln(A/ B) + # p-value found through using chi2 CDF + expected_diff = { + "categorical": "unchanged", + "statistics": { + "unique_count": "unchanged", + "unique_ratio": -0.05357142857142855, + "chi2-test": { + "chi2-statistic": 0.6122448979591839, + "df": 2, + "p-value": 0.7362964551863367, + }, + "categories": "unchanged", + "gini_impurity": -0.059311224489795866, + "unalikeability": -0.08333333333333326, + "psi": 0.16814961527477595, + "categorical_count": {"y": 1, "n": 1, "maybe": -1}, + }, + } + self.assertDictEqual(expected_diff, profile.diff(profile2)) + def test_unalikeability(self): df_categorical = pd.Series(["a", "a"]) profile = CategoricalColumn(df_categorical.name)