capitalone · taylorfturner · Sep 20, 2023 · Sep 18, 2023 · Sep 18, 2023 · Sep 20, 2023
@@ -1,6 +1,8 @@
 """Contains class for categorical column profiler."""
 from __future__ import annotations
 
+import math
+import warnings
 from collections import defaultdict
 from operator import itemgetter
 from typing import cast
@@ -304,6 +306,20 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
                     other_profile._categories.items(), key=itemgetter(1), reverse=True
                 )
             )
+            if cat_count1.keys() == cat_count2.keys():
+                total_psi = 0.0
+                for key in cat_count1.keys():
+                    perc_A = cat_count1[key] / self.sample_size
+                    perc_B = cat_count2[key] / other_profile.sample_size
+                    total_psi += (perc_B - perc_A) * math.log(perc_B / perc_A)
+                    differences["statistics"]["psi"] = total_psi
+            else:
+                warnings.warn(
+                    "psi was not calculated due to the differences in categories "
+                    "of the profiles. Differences:\n"
+                    f"{set(cat_count1.keys()) ^ set(cat_count2.keys())}",
+                    RuntimeWarning,
+                )
 
             differences["statistics"][
                 "categorical_count"

@@ -728,8 +728,13 @@ def test_categorical_diff(self):
                 },
             },
         }
-
-        self.assertDictEqual(expected_diff, profile.diff(profile2))
+        with self.assertWarnsRegex(
+            RuntimeWarning,
+            "psi was not calculated due to the differences in categories "
+            "of the profiles. Differences:\n{'maybe'}",
+        ):
+            test_profile_diff = profile.diff(profile2)
+        self.assertDictEqual(expected_diff, test_profile_diff)
 
         # Test with one categorical column matching
         df_not_categorical = pd.Series(
@@ -756,6 +761,38 @@ def test_categorical_diff(self):
         }
         self.assertDictEqual(expected_diff, profile.diff(profile2))
 
+        # Test diff with psi enabled
+        df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n", "maybe"])
+        profile = CategoricalColumn(df_categorical.name)
+        profile.update(df_categorical)
+
+        df_categorical = pd.Series(["y", "maybe", "y", "y", "n", "n", "maybe"])
+        profile2 = CategoricalColumn(df_categorical.name)
+        profile2.update(df_categorical)
+
+        # chi2-statistic = sum((observed-expected)^2/expected for each category in each column)
+        # df = categories - 1
+        # psi = sum((% of records in A - % of records in B) * ln(A / B) for each category)
+        # p-value found through using chi2 CDF
+        expected_diff = {
+            "categorical": "unchanged",
+            "statistics": {
+                "unique_count": "unchanged",
+                "unique_ratio": -0.05357142857142855,
+                "chi2-test": {
+                    "chi2-statistic": 0.6122448979591839,
+                    "df": 2,
+                    "p-value": 0.7362964551863367,
+                },
+                "categories": "unchanged",
+                "gini_impurity": -0.059311224489795866,
+                "unalikeability": -0.08333333333333326,
+                "psi": 0.16814961527477595,
+                "categorical_count": {"y": 1, "n": 1, "maybe": -1},
+            },
+        }
+        self.assertDictEqual(expected_diff, profile.diff(profile2))
+
     def test_unalikeability(self):
         df_categorical = pd.Series(["a", "a"])
         profile = CategoricalColumn(df_categorical.name)