From cb84e9670bb1e7b50feafb64c881caa667c1c602 Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Fri, 11 Feb 2022 14:44:21 -0500 Subject: [PATCH 01/15] feat: add method stub --- great_expectations/data_context/data_context.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/great_expectations/data_context/data_context.py b/great_expectations/data_context/data_context.py index 5246f116ae19..eb881604ea37 100644 --- a/great_expectations/data_context/data_context.py +++ b/great_expectations/data_context/data_context.py @@ -3355,6 +3355,19 @@ def run_profiler_with_dynamic_arguments( include_citation=include_citation, ) + @usage_statistics_enabled_method( + event_name="data_context.run_profiler_on_data", + ) + def run_profiler_on_data( + self, + name: Optional[str] = None, + batch_request: Optional[Union[dict, BatchRequest, RuntimeBatchRequest]] = None, + ) -> ExpectationSuite: + if isinstance(batch_request, dict): + batch_request = get_batch_request_from_acceptable_arguments(**batch_request) + + # TODO(cdkini): Figure out what happens next! + def test_yaml_config( self, yaml_config: str, From 7a0718e1b6a8a70dbf53772e9f0a0fa6fc55e192 Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Mon, 14 Feb 2022 17:03:19 -0500 Subject: [PATCH 02/15] feat: start impl --- .../data_context/data_context.py | 12 ++++--- .../rule_based_profiler.py | 36 +++++++++++++++++++ 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/great_expectations/data_context/data_context.py b/great_expectations/data_context/data_context.py index d4ecbb2133be..755ea28fb96c 100644 --- a/great_expectations/data_context/data_context.py +++ b/great_expectations/data_context/data_context.py @@ -3354,12 +3354,16 @@ def run_profiler_with_dynamic_arguments( def run_profiler_on_data( self, name: Optional[str] = None, + ge_cloud_id: Optional[str] = None, batch_request: Optional[Union[dict, BatchRequest, RuntimeBatchRequest]] = None, ) -> ExpectationSuite: - if isinstance(batch_request, dict): - batch_request = get_batch_request_from_acceptable_arguments(**batch_request) - - # TODO(cdkini): Figure out what happens next! + return RuleBasedProfiler.run_profiler_on_data( + data_context=self, + profiler_store=self.profiler_store, + batch_request=batch_request, + name=name, + ge_cloud_id=ge_cloud_id, + ) def test_yaml_config( self, diff --git a/great_expectations/rule_based_profiler/rule_based_profiler.py b/great_expectations/rule_based_profiler/rule_based_profiler.py index 6d33c45caf0a..dda97d5d1a69 100644 --- a/great_expectations/rule_based_profiler/rule_based_profiler.py +++ b/great_expectations/rule_based_profiler/rule_based_profiler.py @@ -7,6 +7,7 @@ BatchRequest, RuntimeBatchRequest, batch_request_contains_batch_data, + get_batch_request_from_acceptable_arguments, ) from great_expectations.core.config_peer import ConfigPeer from great_expectations.core.expectation_configuration import ExpectationConfiguration @@ -309,6 +310,22 @@ def run( return expectation_suite + def reconcile_batch_requests_in_builders( + self, batch_request: Union[dict, BatchRequest, RuntimeBatchRequest] + ) -> None: + if isinstance(batch_request, dict): + batch_request = get_batch_request_from_acceptable_arguments(**batch_request) + + for rule in self.rules: + # TODO(cdkini): Should only work on Column and -1 index? + rule.domain_builder._batch_request = batch_request + + if rule.parameter_builders: + for parameter_builder in rule.parameter_builders: + parameter_builder._batch_request = batch_request + + # TODO(cdkini): Let's use setters + def reconcile_profiler_variables( self, variables: Optional[Dict[str, Any]] = None ) -> Optional[ParameterContainer]: @@ -816,6 +833,25 @@ def run_profiler( return result + @staticmethod + def run_profiler_on_data( + data_context: "DataContext", # noqa: F821 + profiler_store: ProfilerStore, + batch_request: Union[dict, BatchRequest, RuntimeBatchRequest], + name: Optional[str] = None, + ge_cloud_id: Optional[str] = None, + ) -> ExpectationSuite: + profiler: RuleBasedProfiler = RuleBasedProfiler.get_profiler( + data_context=data_context, + profiler_store=profiler_store, + name=name, + ge_cloud_id=ge_cloud_id, + ) + profiler.reconcile_batch_requests_in_builders(batch_request) + + result: ExpectationSuite = profiler.run() + return result + @staticmethod def add_profiler( config: RuleBasedProfilerConfig, From 89e3b0d67658a59a149a5af45657b6c08b352f75 Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Tue, 15 Feb 2022 08:39:46 -0500 Subject: [PATCH 03/15] chore: write docstr --- .../rule_based_profiler.py | 34 +++++++++++++++---- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/great_expectations/rule_based_profiler/rule_based_profiler.py b/great_expectations/rule_based_profiler/rule_based_profiler.py index dda97d5d1a69..59c6c0ec9ea7 100644 --- a/great_expectations/rule_based_profiler/rule_based_profiler.py +++ b/great_expectations/rule_based_profiler/rule_based_profiler.py @@ -26,6 +26,9 @@ expectationConfigurationBuilderConfigSchema, parameterBuilderConfigSchema, ) +from great_expectations.rule_based_profiler.domain_builder.column_domain_builder import ( + ColumnDomainBuilder, +) from great_expectations.rule_based_profiler.domain_builder.domain_builder import ( DomainBuilder, ) @@ -313,19 +316,36 @@ def run( def reconcile_batch_requests_in_builders( self, batch_request: Union[dict, BatchRequest, RuntimeBatchRequest] ) -> None: + """ + Profiler "batch_request" reconciliation involves combining existing Profiler state, instantiated from Profiler configuration + (e.g., stored in a YAML file managed by the Profiler store), with the batch request overrides, provided at run time. + + The provided batch request is propagated to the following relevant Builders attributes (as applicable): + - ParameterBuilders + - ColumnDomainBuilder + - We default to the latest value as a sensible default + + The reconciliation logic for "batch_request" is of the "replace" nature: the provided data is consistently applied, regardless + of existing Builder state. + + Args: + batch_request: Data provided at runtime used to hydrate nested builder attributes + """ if isinstance(batch_request, dict): batch_request = get_batch_request_from_acceptable_arguments(**batch_request) + # TODO(cdkini): Let's use setters for rule in self.rules: - # TODO(cdkini): Should only work on Column and -1 index? - rule.domain_builder._batch_request = batch_request - - if rule.parameter_builders: - for parameter_builder in rule.parameter_builders: + domain_builder = rule.domain_builder + if isinstance(domain_builder, ColumnDomainBuilder): + domain_builder._batch_request = batch_request + domain_builder._batch_request.data_connector_query = {"index": -1} + + parameter_builders = rule.parameter_builders + if parameter_builders: + for parameter_builder in parameter_builders: parameter_builder._batch_request = batch_request - # TODO(cdkini): Let's use setters - def reconcile_profiler_variables( self, variables: Optional[Dict[str, Any]] = None ) -> Optional[ParameterContainer]: From f567035339120608de07efec16972615771ba5ca Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Tue, 15 Feb 2022 08:40:04 -0500 Subject: [PATCH 04/15] test: write test stubs --- tests/data_context/test_data_context_profilers.py | 4 ++++ .../rule_based_profiler/test_rule_based_profiler.py | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/tests/data_context/test_data_context_profilers.py b/tests/data_context/test_data_context_profilers.py index 128f158eeae7..a01355680ca5 100644 --- a/tests/data_context/test_data_context_profilers.py +++ b/tests/data_context/test_data_context_profilers.py @@ -79,3 +79,7 @@ def test_run_profiler_with_dynamic_arguments_emits_proper_usage_stats( } ) ] + + +def test_run_profiler_on_data_emits_proper_usage_stats(): + pass # TBD diff --git a/tests/rule_based_profiler/test_rule_based_profiler.py b/tests/rule_based_profiler/test_rule_based_profiler.py index 0ba63ce01899..beb6656bd1cc 100644 --- a/tests/rule_based_profiler/test_rule_based_profiler.py +++ b/tests/rule_based_profiler/test_rule_based_profiler.py @@ -701,3 +701,15 @@ def test_list_profilers_in_cloud_mode(mock_profiler_store: mock.MagicMock): assert res == keys assert store.list_keys.called + + +def test_reconcile_batch_requests_in_builders_replaces_batch_requests(): + pass # TBD + + +def test_reconcile_batch_requests_in_builders_does_not_replace_batch_requests(): + pass # TBD + + +def test_run_profiler_on_data(): + pass # TBD From 24a07cf1db333b4186ecc551913d8b8d4e77c327 Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Tue, 15 Feb 2022 08:42:35 -0500 Subject: [PATCH 05/15] feat: copy batch requests each time --- great_expectations/rule_based_profiler/rule_based_profiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/great_expectations/rule_based_profiler/rule_based_profiler.py b/great_expectations/rule_based_profiler/rule_based_profiler.py index 59c6c0ec9ea7..6c5bc84f3d16 100644 --- a/great_expectations/rule_based_profiler/rule_based_profiler.py +++ b/great_expectations/rule_based_profiler/rule_based_profiler.py @@ -338,13 +338,13 @@ def reconcile_batch_requests_in_builders( for rule in self.rules: domain_builder = rule.domain_builder if isinstance(domain_builder, ColumnDomainBuilder): - domain_builder._batch_request = batch_request + domain_builder._batch_request = copy.deepcopy(batch_request) domain_builder._batch_request.data_connector_query = {"index": -1} parameter_builders = rule.parameter_builders if parameter_builders: for parameter_builder in parameter_builders: - parameter_builder._batch_request = batch_request + parameter_builder._batch_request = copy.deepcopy(batch_request) def reconcile_profiler_variables( self, variables: Optional[Dict[str, Any]] = None From 0bfc783bd5819a2f9678ebbda53713b8a0295372 Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Tue, 15 Feb 2022 14:13:45 -0500 Subject: [PATCH 06/15] refactor: make reconciliation method private --- great_expectations/data_context/data_context.py | 2 +- great_expectations/rule_based_profiler/rule_based_profiler.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/great_expectations/data_context/data_context.py b/great_expectations/data_context/data_context.py index 755ea28fb96c..c9b380045b25 100644 --- a/great_expectations/data_context/data_context.py +++ b/great_expectations/data_context/data_context.py @@ -3353,9 +3353,9 @@ def run_profiler_with_dynamic_arguments( ) def run_profiler_on_data( self, + batch_request: Union[dict, BatchRequest, RuntimeBatchRequest], name: Optional[str] = None, ge_cloud_id: Optional[str] = None, - batch_request: Optional[Union[dict, BatchRequest, RuntimeBatchRequest]] = None, ) -> ExpectationSuite: return RuleBasedProfiler.run_profiler_on_data( data_context=self, diff --git a/great_expectations/rule_based_profiler/rule_based_profiler.py b/great_expectations/rule_based_profiler/rule_based_profiler.py index 6c5bc84f3d16..e842cf96241c 100644 --- a/great_expectations/rule_based_profiler/rule_based_profiler.py +++ b/great_expectations/rule_based_profiler/rule_based_profiler.py @@ -313,7 +313,7 @@ def run( return expectation_suite - def reconcile_batch_requests_in_builders( + def _reconcile_batch_requests_in_builders( self, batch_request: Union[dict, BatchRequest, RuntimeBatchRequest] ) -> None: """ @@ -867,7 +867,7 @@ def run_profiler_on_data( name=name, ge_cloud_id=ge_cloud_id, ) - profiler.reconcile_batch_requests_in_builders(batch_request) + profiler._reconcile_batch_requests_in_builders(batch_request) result: ExpectationSuite = profiler.run() return result From 376b80bf6573ae9b4083e35f7933f2e58d6f482a Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Tue, 15 Feb 2022 17:54:45 -0500 Subject: [PATCH 07/15] feat: finish initial version --- .../domain_builder/domain_builder.py | 6 ++ .../parameter_builder/parameter_builder.py | 6 ++ .../rule_based_profiler.py | 19 +++- .../test_data_context_profilers.py | 36 +++++++- .../test_rule_based_profiler.py | 92 ++++++++++++++++--- 5 files changed, 140 insertions(+), 19 deletions(-) diff --git a/great_expectations/rule_based_profiler/domain_builder/domain_builder.py b/great_expectations/rule_based_profiler/domain_builder/domain_builder.py index 4df29db55421..986fba0739a2 100644 --- a/great_expectations/rule_based_profiler/domain_builder/domain_builder.py +++ b/great_expectations/rule_based_profiler/domain_builder/domain_builder.py @@ -112,6 +112,12 @@ def get_batch_id( def batch_request(self) -> Optional[Union[BatchRequest, RuntimeBatchRequest, dict]]: return self._batch_request + @batch_request.setter + def batch_request( + self, batch_request: Union[BatchRequest, RuntimeBatchRequest, dict] + ) -> None: + self._batch_request = batch_request + @property def data_context(self) -> "DataContext": # noqa: F821 return self._data_context diff --git a/great_expectations/rule_based_profiler/parameter_builder/parameter_builder.py b/great_expectations/rule_based_profiler/parameter_builder/parameter_builder.py index d38bcbaacce5..a73115c113cd 100644 --- a/great_expectations/rule_based_profiler/parameter_builder/parameter_builder.py +++ b/great_expectations/rule_based_profiler/parameter_builder/parameter_builder.py @@ -98,6 +98,12 @@ def name(self) -> str: def batch_request(self) -> Optional[Union[BatchRequest, RuntimeBatchRequest, dict]]: return self._batch_request + @batch_request.setter + def batch_request( + self, batch_request: Union[BatchRequest, RuntimeBatchRequest, dict] + ) -> None: + self._batch_request = batch_request + @property def data_context(self) -> "DataContext": # noqa: F821 return self._data_context diff --git a/great_expectations/rule_based_profiler/rule_based_profiler.py b/great_expectations/rule_based_profiler/rule_based_profiler.py index e842cf96241c..b8183d037b2e 100644 --- a/great_expectations/rule_based_profiler/rule_based_profiler.py +++ b/great_expectations/rule_based_profiler/rule_based_profiler.py @@ -1,4 +1,5 @@ import copy +import logging import uuid from typing import Any, Dict, List, Optional, Union @@ -45,6 +46,8 @@ ) from great_expectations.util import filter_properties_dict +logger = logging.getLogger(__name__) + def _validate_builder_override_config(builder_config: dict): """ @@ -323,7 +326,7 @@ def _reconcile_batch_requests_in_builders( The provided batch request is propagated to the following relevant Builders attributes (as applicable): - ParameterBuilders - ColumnDomainBuilder - - We default to the latest value as a sensible default + - We default to the latest value as a sensible default (using index: -1) The reconciliation logic for "batch_request" is of the "replace" nature: the provided data is consistently applied, regardless of existing Builder state. @@ -334,17 +337,23 @@ def _reconcile_batch_requests_in_builders( if isinstance(batch_request, dict): batch_request = get_batch_request_from_acceptable_arguments(**batch_request) - # TODO(cdkini): Let's use setters for rule in self.rules: domain_builder = rule.domain_builder if isinstance(domain_builder, ColumnDomainBuilder): - domain_builder._batch_request = copy.deepcopy(batch_request) - domain_builder._batch_request.data_connector_query = {"index": -1} + domain_builder.batch_request = copy.deepcopy(batch_request) + domain_builder.batch_request.data_connector_query = {"index": -1} + logger.info( + "Overwrote Rule %s's DomainBuilder batch_request attr", rule.name + ) parameter_builders = rule.parameter_builders if parameter_builders: for parameter_builder in parameter_builders: - parameter_builder._batch_request = copy.deepcopy(batch_request) + parameter_builder.batch_request = copy.deepcopy(batch_request) + logger.info( + "Overwrote ParameterBuilder %s's batch_request attr", + parameter_builder.name, + ) def reconcile_profiler_variables( self, variables: Optional[Dict[str, Any]] = None diff --git a/tests/data_context/test_data_context_profilers.py b/tests/data_context/test_data_context_profilers.py index a01355680ca5..cf4eee5235d8 100644 --- a/tests/data_context/test_data_context_profilers.py +++ b/tests/data_context/test_data_context_profilers.py @@ -81,5 +81,37 @@ def test_run_profiler_with_dynamic_arguments_emits_proper_usage_stats( ] -def test_run_profiler_on_data_emits_proper_usage_stats(): - pass # TBD +@mock.patch("great_expectations.rule_based_profiler.RuleBasedProfiler.run") +@mock.patch( + "great_expectations.core.usage_statistics.usage_statistics.UsageStatisticsHandler.emit" +) +def test_run_profiler_on_data_emits_proper_usage_stats( + mock_emit: mock.MagicMock, + mock_profiler_run: mock.MagicMock, + empty_data_context_stats_enabled: DataContext, + populated_profiler_store: ProfilerStore, + profiler_name: str, +): + with mock.patch( + "great_expectations.data_context.DataContext.profiler_store" + ) as mock_profiler_store: + mock_profiler_store.__get__ = mock.Mock(return_value=populated_profiler_store) + empty_data_context_stats_enabled.run_profiler_on_data( + name=profiler_name, + batch_request={ + "datasource_name": "my_datasource", + "data_connector_name": "my_data_connector", + "data_asset_name": "my_data_asset", + }, + ) + + assert mock_emit.call_count == 1 + assert mock_emit.call_args_list == [ + mock.call( + { + "event_payload": {}, + "event": "data_context.run_profiler_on_data", + "success": True, + } + ) + ] diff --git a/tests/rule_based_profiler/test_rule_based_profiler.py b/tests/rule_based_profiler/test_rule_based_profiler.py index 5e3423d99d4f..5dccc65c059f 100644 --- a/tests/rule_based_profiler/test_rule_based_profiler.py +++ b/tests/rule_based_profiler/test_rule_based_profiler.py @@ -1,3 +1,4 @@ +import logging from typing import Any, Dict, List, Optional from unittest import mock @@ -5,6 +6,7 @@ import pytest import great_expectations.exceptions as ge_exceptions +from great_expectations.core.batch import BatchRequest from great_expectations.data_context.store.profiler_store import ProfilerStore from great_expectations.data_context.types.resource_identifiers import ( ConfigurationIdentifier, @@ -509,6 +511,84 @@ def test_run_profiler_with_dynamic_args( ) +@mock.patch("great_expectations.rule_based_profiler.RuleBasedProfiler.run") +@mock.patch("great_expectations.data_context.data_context.DataContext") +def test_run_profiler_on_data_emits_appropriate_logging( + mock_data_context: mock.MagicMock, + mock_profiler_run: mock.MagicMock, + populated_profiler_store: ProfilerStore, + profiler_name: str, + caplog: Any, +): + batch_request: Dict[str, str] = { + "datasource_name": "my_datasource", + "data_connector_name": "my_data_connector", + "data_asset_name": "my_data_asset", + } + + with caplog.at_level(logging.INFO): + RuleBasedProfiler.run_profiler_on_data( + data_context=mock_data_context, + profiler_store=populated_profiler_store, + name=profiler_name, + batch_request=batch_request, + ) + + assert "Overwrote ParameterBuilder" in caplog.text + + +@mock.patch("great_expectations.rule_based_profiler.RuleBasedProfiler.run") +@mock.patch("great_expectations.data_context.data_context.DataContext") +def test_run_profiler_on_data_creates_suite_with_dict_arg( + mock_data_context: mock.MagicMock, + mock_profiler_run: mock.MagicMock, + populated_profiler_store: ProfilerStore, + profiler_name: str, +): + batch_request: Dict[str, str] = { + "datasource_name": "my_datasource", + "data_connector_name": "my_data_connector", + "data_asset_name": "my_data_asset", + } + + RuleBasedProfiler.run_profiler_on_data( + data_context=mock_data_context, + profiler_store=populated_profiler_store, + name=profiler_name, + batch_request=batch_request, + ) + + # run() is invoked but without any runtime args + assert mock_profiler_run.called + assert mock_profiler_run.call_args == [] + + +@mock.patch("great_expectations.rule_based_profiler.RuleBasedProfiler.run") +@mock.patch("great_expectations.data_context.data_context.DataContext") +def test_run_profiler_on_data_creates_suite_with_batch_request_arg( + mock_data_context: mock.MagicMock, + mock_profiler_run: mock.MagicMock, + populated_profiler_store: ProfilerStore, + profiler_name: str, +): + batch_request: BatchRequest = BatchRequest( + datasource_name="my_datasource", + data_connector_name="my_data_connector", + data_asset_name="my_data_asset", + ) + + RuleBasedProfiler.run_profiler_on_data( + data_context=mock_data_context, + profiler_store=populated_profiler_store, + name=profiler_name, + batch_request=batch_request, + ) + + # run() is invoked but without any runtime args + assert mock_profiler_run.called + assert mock_profiler_run.call_args == [] + + @mock.patch("great_expectations.data_context.data_context.DataContext") def test_get_profiler_with_too_many_args_raises_error( mock_data_context: mock.MagicMock, @@ -712,15 +792,3 @@ def test_list_profilers_in_cloud_mode(mock_profiler_store: mock.MagicMock): assert res == keys assert store.list_keys.called - - -def test_reconcile_batch_requests_in_builders_replaces_batch_requests(): - pass # TBD - - -def test_reconcile_batch_requests_in_builders_does_not_replace_batch_requests(): - pass # TBD - - -def test_run_profiler_on_data(): - pass # TBD From 159ffcfd5fe6ad7cc68f1b4d3ceec953d7e9946b Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Tue, 15 Feb 2022 19:03:01 -0500 Subject: [PATCH 08/15] refactor: convert batch request to dict to ensure proper serializability --- .../rule_based_profiler/domain_builder/domain_builder.py | 4 +--- .../parameter_builder/parameter_builder.py | 4 +--- .../rule_based_profiler/rule_based_profiler.py | 5 +++-- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/great_expectations/rule_based_profiler/domain_builder/domain_builder.py b/great_expectations/rule_based_profiler/domain_builder/domain_builder.py index 986fba0739a2..30f75f2f38d2 100644 --- a/great_expectations/rule_based_profiler/domain_builder/domain_builder.py +++ b/great_expectations/rule_based_profiler/domain_builder/domain_builder.py @@ -113,9 +113,7 @@ def batch_request(self) -> Optional[Union[BatchRequest, RuntimeBatchRequest, dic return self._batch_request @batch_request.setter - def batch_request( - self, batch_request: Union[BatchRequest, RuntimeBatchRequest, dict] - ) -> None: + def batch_request(self, batch_request: dict) -> None: self._batch_request = batch_request @property diff --git a/great_expectations/rule_based_profiler/parameter_builder/parameter_builder.py b/great_expectations/rule_based_profiler/parameter_builder/parameter_builder.py index 81fc8634cd72..3d4b0e6aac99 100644 --- a/great_expectations/rule_based_profiler/parameter_builder/parameter_builder.py +++ b/great_expectations/rule_based_profiler/parameter_builder/parameter_builder.py @@ -101,9 +101,7 @@ def batch_request(self) -> Optional[Union[BatchRequest, RuntimeBatchRequest, dic return self._batch_request @batch_request.setter - def batch_request( - self, batch_request: Union[BatchRequest, RuntimeBatchRequest, dict] - ) -> None: + def batch_request(self, batch_request: dict) -> None: self._batch_request = batch_request @property diff --git a/great_expectations/rule_based_profiler/rule_based_profiler.py b/great_expectations/rule_based_profiler/rule_based_profiler.py index 2ee359db2720..2fefb64918aa 100644 --- a/great_expectations/rule_based_profiler/rule_based_profiler.py +++ b/great_expectations/rule_based_profiler/rule_based_profiler.py @@ -8,6 +8,7 @@ BatchRequest, RuntimeBatchRequest, batch_request_contains_batch_data, + get_batch_request_as_dict, get_batch_request_from_acceptable_arguments, ) from great_expectations.core.config_peer import ConfigPeer @@ -333,8 +334,8 @@ def _reconcile_batch_requests_in_builders( Args: batch_request: Data provided at runtime used to hydrate nested builder attributes """ - if isinstance(batch_request, dict): - batch_request = get_batch_request_from_acceptable_arguments(**batch_request) + if not isinstance(batch_request, dict): + batch_request = get_batch_request_as_dict(batch_request) for rule in self.rules: domain_builder = rule.domain_builder From 859e03cf1b93970c92b4cd6383e821eed1f4d787 Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Wed, 16 Feb 2022 13:00:43 -0500 Subject: [PATCH 09/15] feat: lay out plan after talking with Alex --- .../rule_based_profiler/rule_based_profiler.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/great_expectations/rule_based_profiler/rule_based_profiler.py b/great_expectations/rule_based_profiler/rule_based_profiler.py index 2fefb64918aa..ce50c0318f33 100644 --- a/great_expectations/rule_based_profiler/rule_based_profiler.py +++ b/great_expectations/rule_based_profiler/rule_based_profiler.py @@ -876,9 +876,19 @@ def run_profiler_on_data( name=name, ge_cloud_id=ge_cloud_id, ) - profiler._reconcile_batch_requests_in_builders(batch_request) - result: ExpectationSuite = profiler.run() + # Parsing the input batch_request + rules = profiler.get_rules_as_dict() + variables = {} + + # profiler._reconcile_batch_requests_in_builders(batch_request) + + result: ExpectationSuite = profiler.run( + variables=variables, + rules=rules, + expectation_suite_name=expectation_suite_name, + include_citation=include_citation, + ) return result @staticmethod From f90e4d55ca2ef768aea1d3c122284f4f64015d66 Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Wed, 16 Feb 2022 16:45:02 -0500 Subject: [PATCH 10/15] feat: convert batch request to dict --- .../rule_based_profiler.py | 100 ++++++++++-------- 1 file changed, 55 insertions(+), 45 deletions(-) diff --git a/great_expectations/rule_based_profiler/rule_based_profiler.py b/great_expectations/rule_based_profiler/rule_based_profiler.py index ce50c0318f33..b4633a2acac6 100644 --- a/great_expectations/rule_based_profiler/rule_based_profiler.py +++ b/great_expectations/rule_based_profiler/rule_based_profiler.py @@ -9,7 +9,6 @@ RuntimeBatchRequest, batch_request_contains_batch_data, get_batch_request_as_dict, - get_batch_request_from_acceptable_arguments, ) from great_expectations.core.config_peer import ConfigPeer from great_expectations.core.expectation_configuration import ExpectationConfiguration @@ -316,45 +315,6 @@ def run( return expectation_suite - def _reconcile_batch_requests_in_builders( - self, batch_request: Union[dict, BatchRequest, RuntimeBatchRequest] - ) -> None: - """ - Profiler "batch_request" reconciliation involves combining existing Profiler state, instantiated from Profiler configuration - (e.g., stored in a YAML file managed by the Profiler store), with the batch request overrides, provided at run time. - - The provided batch request is propagated to the following relevant Builders attributes (as applicable): - - ParameterBuilders - - ColumnDomainBuilder - - We default to the latest value as a sensible default (using index: -1) - - The reconciliation logic for "batch_request" is of the "replace" nature: the provided data is consistently applied, regardless - of existing Builder state. - - Args: - batch_request: Data provided at runtime used to hydrate nested builder attributes - """ - if not isinstance(batch_request, dict): - batch_request = get_batch_request_as_dict(batch_request) - - for rule in self.rules: - domain_builder = rule.domain_builder - if isinstance(domain_builder, ColumnDomainBuilder): - domain_builder.batch_request = copy.deepcopy(batch_request) - domain_builder.batch_request.data_connector_query = {"index": -1} - logger.info( - "Overwrote Rule %s's DomainBuilder batch_request attr", rule.name - ) - - parameter_builders = rule.parameter_builders - if parameter_builders: - for parameter_builder in parameter_builders: - parameter_builder.batch_request = copy.deepcopy(batch_request) - logger.info( - "Overwrote ParameterBuilder %s's batch_request attr", - parameter_builder.name, - ) - def reconcile_profiler_variables( self, variables: Optional[Dict[str, Any]] = None ) -> Optional[ParameterContainer]: @@ -869,6 +829,8 @@ def run_profiler_on_data( batch_request: Union[dict, BatchRequest, RuntimeBatchRequest], name: Optional[str] = None, ge_cloud_id: Optional[str] = None, + expectation_suite_name: Optional[str] = None, + include_citation: bool = True, ) -> ExpectationSuite: profiler: RuleBasedProfiler = RuleBasedProfiler.get_profiler( data_context=data_context, @@ -877,20 +839,68 @@ def run_profiler_on_data( ge_cloud_id=ge_cloud_id, ) - # Parsing the input batch_request - rules = profiler.get_rules_as_dict() - variables = {} + # Convert Rules into dictionaries and then hydrate with batch request elements + rules = profiler._overwrite_rules_with_batch_request(batch_request) - # profiler._reconcile_batch_requests_in_builders(batch_request) + # NOTE: Slightly blocked at the moment + # Is "rules" a list or a dict of dicts? This is very important to figure out. + # Do we only need to overwrite rules? What about variables? (don't think so?) result: ExpectationSuite = profiler.run( - variables=variables, rules=rules, expectation_suite_name=expectation_suite_name, include_citation=include_citation, ) return result + def _overwrite_rules_with_batch_request( + self, batch_request: Union[dict, BatchRequest, RuntimeBatchRequest] + ) -> List[dict]: + """ + FIXME(cdkini): Clean this up! + + Profiler "batch_request" reconciliation involves combining existing Profiler state, instantiated from Profiler configuration + (e.g., stored in a YAML file managed by the Profiler store), with the batch request overrides, provided at run time. + + The provided batch request is propagated to the following relevant Builders attributes (as applicable): + - ParameterBuilders + - ColumnDomainBuilder + - We default to the latest value as a sensible default (using index: -1) + + The reconciliation logic for "batch_request" is of the "replace" nature: the provided data is consistently applied, regardless + of existing Builder state. + + Args: + batch_request: Data provided at runtime used to hydrate nested builder attributes + """ + rules: List[Rule] = copy.deepcopy(self.rules) + if not isinstance(batch_request, dict): + batch_request = get_batch_request_as_dict(batch_request) + + resulting_rules: List[Dict[str, Any]] = [] + + for rule in rules: + domain_builder = rule.domain_builder + if isinstance(domain_builder, ColumnDomainBuilder): + domain_builder.batch_request = copy.deepcopy(batch_request) + domain_builder.batch_request["data_connector_query"] = {"index": -1} + logger.info( + "Overwrote Rule %s's DomainBuilder batch_request attr", rule.name + ) + + parameter_builders = rule.parameter_builders + if parameter_builders: + for parameter_builder in parameter_builders: + parameter_builder.batch_request = copy.deepcopy(batch_request) + logger.info( + "Overwrote ParameterBuilder %s's batch_request attr", + parameter_builder.name, + ) + + resulting_rules.append(rule.to_dict()) + + return resulting_rules + @staticmethod def add_profiler( config: RuleBasedProfilerConfig, From dd522bb26f5e2318a35d04b2ace55beb625bd764 Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Thu, 17 Feb 2022 16:33:50 -0500 Subject: [PATCH 11/15] feat: finish impl --- .../data_context/data_context.py | 4 ++ .../rule_based_profiler.py | 44 ++++--------------- .../test_rule_based_profiler.py | 24 +++++----- 3 files changed, 27 insertions(+), 45 deletions(-) diff --git a/great_expectations/data_context/data_context.py b/great_expectations/data_context/data_context.py index c9b380045b25..bb6c6f5ba555 100644 --- a/great_expectations/data_context/data_context.py +++ b/great_expectations/data_context/data_context.py @@ -3356,6 +3356,8 @@ def run_profiler_on_data( batch_request: Union[dict, BatchRequest, RuntimeBatchRequest], name: Optional[str] = None, ge_cloud_id: Optional[str] = None, + expectation_suite_name: Optional[str] = None, + include_citation: bool = True, ) -> ExpectationSuite: return RuleBasedProfiler.run_profiler_on_data( data_context=self, @@ -3363,6 +3365,8 @@ def run_profiler_on_data( batch_request=batch_request, name=name, ge_cloud_id=ge_cloud_id, + expectation_suite_name=expectation_suite_name, + include_citation=include_citation, ) def test_yaml_config( diff --git a/great_expectations/rule_based_profiler/rule_based_profiler.py b/great_expectations/rule_based_profiler/rule_based_profiler.py index b4633a2acac6..2b31ee79fa95 100644 --- a/great_expectations/rule_based_profiler/rule_based_profiler.py +++ b/great_expectations/rule_based_profiler/rule_based_profiler.py @@ -839,12 +839,9 @@ def run_profiler_on_data( ge_cloud_id=ge_cloud_id, ) - # Convert Rules into dictionaries and then hydrate with batch request elements - rules = profiler._overwrite_rules_with_batch_request(batch_request) - - # NOTE: Slightly blocked at the moment - # Is "rules" a list or a dict of dicts? This is very important to figure out. - # Do we only need to overwrite rules? What about variables? (don't think so?) + rules: Dict[ + str, Dict[str, Any] + ] = profiler._generate_rule_overrides_from_batch_request(batch_request) result: ExpectationSuite = profiler.run( rules=rules, @@ -853,51 +850,28 @@ def run_profiler_on_data( ) return result - def _overwrite_rules_with_batch_request( + def _generate_rule_overrides_from_batch_request( self, batch_request: Union[dict, BatchRequest, RuntimeBatchRequest] - ) -> List[dict]: - """ - FIXME(cdkini): Clean this up! - - Profiler "batch_request" reconciliation involves combining existing Profiler state, instantiated from Profiler configuration - (e.g., stored in a YAML file managed by the Profiler store), with the batch request overrides, provided at run time. - - The provided batch request is propagated to the following relevant Builders attributes (as applicable): - - ParameterBuilders - - ColumnDomainBuilder - - We default to the latest value as a sensible default (using index: -1) - - The reconciliation logic for "batch_request" is of the "replace" nature: the provided data is consistently applied, regardless - of existing Builder state. - - Args: - batch_request: Data provided at runtime used to hydrate nested builder attributes - """ - rules: List[Rule] = copy.deepcopy(self.rules) + ) -> Dict[str, Dict[str, Any]]: + rules: List[Rule] = self.rules if not isinstance(batch_request, dict): batch_request = get_batch_request_as_dict(batch_request) + logger.info("Converted batch request to dictionary: %s", batch_request) - resulting_rules: List[Dict[str, Any]] = [] + resulting_rules: Dict[str, Dict[str, Any]] = {} for rule in rules: domain_builder = rule.domain_builder if isinstance(domain_builder, ColumnDomainBuilder): domain_builder.batch_request = copy.deepcopy(batch_request) domain_builder.batch_request["data_connector_query"] = {"index": -1} - logger.info( - "Overwrote Rule %s's DomainBuilder batch_request attr", rule.name - ) parameter_builders = rule.parameter_builders if parameter_builders: for parameter_builder in parameter_builders: parameter_builder.batch_request = copy.deepcopy(batch_request) - logger.info( - "Overwrote ParameterBuilder %s's batch_request attr", - parameter_builder.name, - ) - resulting_rules.append(rule.to_dict()) + resulting_rules[rule.name] = rule.to_dict() return resulting_rules diff --git a/tests/rule_based_profiler/test_rule_based_profiler.py b/tests/rule_based_profiler/test_rule_based_profiler.py index 5dccc65c059f..73618f7bef42 100644 --- a/tests/rule_based_profiler/test_rule_based_profiler.py +++ b/tests/rule_based_profiler/test_rule_based_profiler.py @@ -520,11 +520,11 @@ def test_run_profiler_on_data_emits_appropriate_logging( profiler_name: str, caplog: Any, ): - batch_request: Dict[str, str] = { - "datasource_name": "my_datasource", - "data_connector_name": "my_data_connector", - "data_asset_name": "my_data_asset", - } + batch_request: BatchRequest = BatchRequest( + datasource_name="my_datasource", + data_connector_name="my_data_connector", + data_asset_name="my_data_asset", + ) with caplog.at_level(logging.INFO): RuleBasedProfiler.run_profiler_on_data( @@ -534,7 +534,7 @@ def test_run_profiler_on_data_emits_appropriate_logging( batch_request=batch_request, ) - assert "Overwrote ParameterBuilder" in caplog.text + assert "Converted batch request" in caplog.text @mock.patch("great_expectations.rule_based_profiler.RuleBasedProfiler.run") @@ -558,9 +558,11 @@ def test_run_profiler_on_data_creates_suite_with_dict_arg( batch_request=batch_request, ) - # run() is invoked but without any runtime args assert mock_profiler_run.called - assert mock_profiler_run.call_args == [] + + rule = mock_profiler_run.call_args[1]["rules"]["rule_1"] + resulting_batch_request = rule["parameter_builders"][0]["batch_request"] + assert resulting_batch_request == batch_request @mock.patch("great_expectations.rule_based_profiler.RuleBasedProfiler.run") @@ -584,9 +586,11 @@ def test_run_profiler_on_data_creates_suite_with_batch_request_arg( batch_request=batch_request, ) - # run() is invoked but without any runtime args assert mock_profiler_run.called - assert mock_profiler_run.call_args == [] + + rule = mock_profiler_run.call_args[1]["rules"]["rule_1"] + resulting_batch_request = rule["parameter_builders"][0]["batch_request"] + assert resulting_batch_request == batch_request.to_dict() @mock.patch("great_expectations.data_context.data_context.DataContext") From 9e7100e5b5332e84437a531ef50cb56eb80156bb Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Fri, 18 Feb 2022 07:53:24 -0500 Subject: [PATCH 12/15] feat: leverage enum check instead of isinstance --- .../rule_based_profiler/rule_based_profiler.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/great_expectations/rule_based_profiler/rule_based_profiler.py b/great_expectations/rule_based_profiler/rule_based_profiler.py index ea46f1247aa0..bbfe8258e0e4 100644 --- a/great_expectations/rule_based_profiler/rule_based_profiler.py +++ b/great_expectations/rule_based_profiler/rule_based_profiler.py @@ -20,6 +20,7 @@ GeCloudIdentifier, ) from great_expectations.data_context.util import instantiate_class_from_config +from great_expectations.execution_engine.execution_engine import MetricDomainTypes from great_expectations.rule_based_profiler.config.base import ( DomainBuilderConfig, ExpectationConfigurationBuilderConfig, @@ -29,9 +30,6 @@ expectationConfigurationBuilderConfigSchema, parameterBuilderConfigSchema, ) -from great_expectations.rule_based_profiler.domain_builder.column_domain_builder import ( - ColumnDomainBuilder, -) from great_expectations.rule_based_profiler.domain_builder.domain_builder import ( DomainBuilder, ) @@ -861,7 +859,7 @@ def _generate_rule_overrides_from_batch_request( for rule in rules: domain_builder = rule.domain_builder - if isinstance(domain_builder, ColumnDomainBuilder): + if domain_builder.domain_type == MetricDomainTypes.COLUMN: domain_builder.batch_request = copy.deepcopy(batch_request) domain_builder.batch_request["data_connector_query"] = {"index": -1} From cabcf815dd094e564a463ef6173732bd2e78e266 Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Fri, 18 Feb 2022 08:06:07 -0500 Subject: [PATCH 13/15] feat: add docstring and use safe_deep_copy --- .../rule_based_profiler.py | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/great_expectations/rule_based_profiler/rule_based_profiler.py b/great_expectations/rule_based_profiler/rule_based_profiler.py index bbfe8258e0e4..b50b8bd1c2aa 100644 --- a/great_expectations/rule_based_profiler/rule_based_profiler.py +++ b/great_expectations/rule_based_profiler/rule_based_profiler.py @@ -44,6 +44,7 @@ ParameterContainer, build_parameter_container_for_variables, ) +from great_expectations.types import safe_deep_copy from great_expectations.util import filter_properties_dict logger = logging.getLogger(__name__) @@ -850,6 +851,22 @@ def run_profiler_on_data( def _generate_rule_overrides_from_batch_request( self, batch_request: Union[dict, BatchRequest, RuntimeBatchRequest] ) -> Dict[str, Dict[str, Any]]: + """Iterates through the profiler's builder attributes and generates a set of + Rules that contain overrides from the input batch request. This only applies to + ParameterBuilder and any DomainBuilder with a COLUMN MetricDomainType. + + Note that we are passing ALL batches to the parameter builder. If not used carefully, + a bias may creep in to the resulting estimates computed by these objects. + + Users of this override should be aware that a batch request should either have no + notion of "current/active" batch or it is excluded. + + Args: + batch_request: Data used to override builder attributes + + Returns: + The dictionary representation of the Rules used as runtime arguments to `run()` + """ rules: List[Rule] = self.rules if not isinstance(batch_request, dict): batch_request = get_batch_request_as_dict(batch_request) @@ -860,13 +877,13 @@ def _generate_rule_overrides_from_batch_request( for rule in rules: domain_builder = rule.domain_builder if domain_builder.domain_type == MetricDomainTypes.COLUMN: - domain_builder.batch_request = copy.deepcopy(batch_request) + domain_builder.batch_request = safe_deep_copy(batch_request) domain_builder.batch_request["data_connector_query"] = {"index": -1} parameter_builders = rule.parameter_builders if parameter_builders: for parameter_builder in parameter_builders: - parameter_builder.batch_request = copy.deepcopy(batch_request) + parameter_builder.batch_request = safe_deep_copy(batch_request) resulting_rules[rule.name] = rule.to_dict() From a710eab5ff52c54d0e43a98f4419a31382df77b5 Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Fri, 18 Feb 2022 11:12:03 -0500 Subject: [PATCH 14/15] chore: use deepcopy instead of safe_deecopy --- .../rule_based_profiler/rule_based_profiler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/great_expectations/rule_based_profiler/rule_based_profiler.py b/great_expectations/rule_based_profiler/rule_based_profiler.py index b50b8bd1c2aa..468c09c1586f 100644 --- a/great_expectations/rule_based_profiler/rule_based_profiler.py +++ b/great_expectations/rule_based_profiler/rule_based_profiler.py @@ -44,7 +44,6 @@ ParameterContainer, build_parameter_container_for_variables, ) -from great_expectations.types import safe_deep_copy from great_expectations.util import filter_properties_dict logger = logging.getLogger(__name__) @@ -877,13 +876,13 @@ def _generate_rule_overrides_from_batch_request( for rule in rules: domain_builder = rule.domain_builder if domain_builder.domain_type == MetricDomainTypes.COLUMN: - domain_builder.batch_request = safe_deep_copy(batch_request) + domain_builder.batch_request = copy.deepcopy(batch_request) domain_builder.batch_request["data_connector_query"] = {"index": -1} parameter_builders = rule.parameter_builders if parameter_builders: for parameter_builder in parameter_builders: - parameter_builder.batch_request = safe_deep_copy(batch_request) + parameter_builder.batch_request = copy.deepcopy(batch_request) resulting_rules[rule.name] = rule.to_dict() From 163deff389751e22cd7f616f4d7584e02e3223fd Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Fri, 18 Feb 2022 11:47:42 -0500 Subject: [PATCH 15/15] feat: remove copy --- great_expectations/rule_based_profiler/rule_based_profiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/great_expectations/rule_based_profiler/rule_based_profiler.py b/great_expectations/rule_based_profiler/rule_based_profiler.py index 468c09c1586f..ac5d3118b5a1 100644 --- a/great_expectations/rule_based_profiler/rule_based_profiler.py +++ b/great_expectations/rule_based_profiler/rule_based_profiler.py @@ -876,13 +876,13 @@ def _generate_rule_overrides_from_batch_request( for rule in rules: domain_builder = rule.domain_builder if domain_builder.domain_type == MetricDomainTypes.COLUMN: - domain_builder.batch_request = copy.deepcopy(batch_request) + domain_builder.batch_request = batch_request domain_builder.batch_request["data_connector_query"] = {"index": -1} parameter_builders = rule.parameter_builders if parameter_builders: for parameter_builder in parameter_builders: - parameter_builder.batch_request = copy.deepcopy(batch_request) + parameter_builder.batch_request = batch_request resulting_rules[rule.name] = rule.to_dict()