From f0024215955234a289309502a23b5107cf653f5c Mon Sep 17 00:00:00 2001 From: Scott Garcia Date: Mon, 4 Dec 2023 14:24:05 -0500 Subject: [PATCH] feat: compute data type profile diff --- dataprofiler/profilers/column_profile_compilers.py | 2 ++ dataprofiler/profilers/profile_builder.py | 3 +++ dataprofiler/profilers/text_column_profile.py | 3 +-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/dataprofiler/profilers/column_profile_compilers.py b/dataprofiler/profilers/column_profile_compilers.py index 07edf13dc..35e3640f6 100644 --- a/dataprofiler/profilers/column_profile_compilers.py +++ b/dataprofiler/profilers/column_profile_compilers.py @@ -338,6 +338,8 @@ def diff( if all_profiles: for key in all_profiles: if key in self._profiles and key in other._profiles: + prof_diff = self._profiles[key].diff(other._profiles[key]) + diff_profile.update(prof_diff) diff = profiler_utils.find_diff_of_numbers( self._profiles[key].data_type_ratio, other._profiles[key].data_type_ratio, diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py index 6e512658f..aaabb6469 100644 --- a/dataprofiler/profilers/profile_builder.py +++ b/dataprofiler/profilers/profile_builder.py @@ -285,6 +285,9 @@ def diff(self, other_profile: StructuredColProfiler, options: dict = None) -> di } ) + if "statistics" not in unordered_profile: + unordered_profile["statistics"] = {} + unordered_profile["statistics"].update( { "sample_size": profiler_utils.find_diff_of_numbers( diff --git a/dataprofiler/profilers/text_column_profile.py b/dataprofiler/profilers/text_column_profile.py index bea8dbd68..01de13b9d 100644 --- a/dataprofiler/profilers/text_column_profile.py +++ b/dataprofiler/profilers/text_column_profile.py @@ -108,9 +108,8 @@ def diff(self, other_profile: TextColumn, options: dict = None) -> dict: :rtype: dict """ # Make sure other_profile's type matches this class - differences = NumericStatsMixin.diff(self, other_profile, options) + differences = BaseColumnProfiler.diff(self, other_profile, options) - del differences["psi"] vocab_diff = profiler_utils.find_diff_of_lists_and_sets( self.vocab, other_profile.vocab )