Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEATURE] Add run_profiler_on_data method to DataContext #4190

Merged
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
cb84e96
feat: add method stub
cdkini Feb 11, 2022
c84dba1
Merge branch 'develop' into feature/great-464/great-490/run-profiler-…
cdkini Feb 14, 2022
29b9b19
Merge branch 'develop' of https://github.com/great-expectations/great…
cdkini Feb 14, 2022
7a0718e
feat: start impl
cdkini Feb 14, 2022
f6c244f
Merge branch 'develop' of https://github.com/great-expectations/great…
cdkini Feb 14, 2022
e6b5c30
Merge branch 'feature/great-464/great-490/run-profiler-on-data-method…
cdkini Feb 14, 2022
89e3b0d
chore: write docstr
cdkini Feb 15, 2022
f567035
test: write test stubs
cdkini Feb 15, 2022
24a07cf
feat: copy batch requests each time
cdkini Feb 15, 2022
d1be8fd
Merge branch 'develop' of https://github.com/great-expectations/great…
cdkini Feb 15, 2022
0bfc783
refactor: make reconciliation method private
cdkini Feb 15, 2022
becd7f2
Merge branch 'develop' of https://github.com/great-expectations/great…
cdkini Feb 15, 2022
376b80b
feat: finish initial version
cdkini Feb 15, 2022
58d4cf2
Merge branch 'develop' of https://github.com/great-expectations/great…
cdkini Feb 15, 2022
c50e425
Merge branch 'develop' of https://github.com/great-expectations/great…
cdkini Feb 15, 2022
159ffcf
refactor: convert batch request to dict to ensure proper serializability
cdkini Feb 16, 2022
859e03c
feat: lay out plan after talking with Alex
cdkini Feb 16, 2022
3372e83
Merge branch 'develop' of https://github.com/great-expectations/great…
cdkini Feb 16, 2022
f90e4d5
feat: convert batch request to dict
cdkini Feb 16, 2022
dd522bb
feat: finish impl
cdkini Feb 17, 2022
e268230
Merge branch 'develop' of https://github.com/great-expectations/great…
cdkini Feb 17, 2022
b7aea71
Merge branch 'develop' of https://github.com/great-expectations/great…
cdkini Feb 18, 2022
9e7100e
feat: leverage enum check instead of isinstance
cdkini Feb 18, 2022
cabcf81
feat: add docstring and use safe_deep_copy
cdkini Feb 18, 2022
a710eab
chore: use deepcopy instead of safe_deecopy
cdkini Feb 18, 2022
163deff
feat: remove copy
cdkini Feb 18, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions great_expectations/data_context/data_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -3348,6 +3348,23 @@ def run_profiler_with_dynamic_arguments(
include_citation=include_citation,
)

@usage_statistics_enabled_method(
event_name="data_context.run_profiler_on_data",
)
def run_profiler_on_data(
self,
name: Optional[str] = None,
ge_cloud_id: Optional[str] = None,
batch_request: Optional[Union[dict, BatchRequest, RuntimeBatchRequest]] = None,
cdkini marked this conversation as resolved.
Show resolved Hide resolved
) -> ExpectationSuite:
return RuleBasedProfiler.run_profiler_on_data(
data_context=self,
profiler_store=self.profiler_store,
batch_request=batch_request,
name=name,
ge_cloud_id=ge_cloud_id,
)

def test_yaml_config(
self,
yaml_config: str,
Expand Down
56 changes: 56 additions & 0 deletions great_expectations/rule_based_profiler/rule_based_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
BatchRequest,
RuntimeBatchRequest,
batch_request_contains_batch_data,
get_batch_request_from_acceptable_arguments,
)
from great_expectations.core.config_peer import ConfigPeer
from great_expectations.core.expectation_configuration import ExpectationConfiguration
Expand All @@ -25,6 +26,9 @@
expectationConfigurationBuilderConfigSchema,
parameterBuilderConfigSchema,
)
from great_expectations.rule_based_profiler.domain_builder.column_domain_builder import (
ColumnDomainBuilder,
)
from great_expectations.rule_based_profiler.domain_builder.domain_builder import (
DomainBuilder,
)
Expand Down Expand Up @@ -309,6 +313,39 @@ def run(

return expectation_suite

def reconcile_batch_requests_in_builders(
self, batch_request: Union[dict, BatchRequest, RuntimeBatchRequest]
) -> None:
cdkini marked this conversation as resolved.
Show resolved Hide resolved
"""
Profiler "batch_request" reconciliation involves combining existing Profiler state, instantiated from Profiler configuration
(e.g., stored in a YAML file managed by the Profiler store), with the batch request overrides, provided at run time.

The provided batch request is propagated to the following relevant Builders attributes (as applicable):
- ParameterBuilders
- ColumnDomainBuilder
- We default to the latest value as a sensible default

The reconciliation logic for "batch_request" is of the "replace" nature: the provided data is consistently applied, regardless
of existing Builder state.

Args:
batch_request: Data provided at runtime used to hydrate nested builder attributes
"""
if isinstance(batch_request, dict):
cdkini marked this conversation as resolved.
Show resolved Hide resolved
batch_request = get_batch_request_from_acceptable_arguments(**batch_request)

# TODO(cdkini): Let's use setters
for rule in self.rules:
domain_builder = rule.domain_builder
if isinstance(domain_builder, ColumnDomainBuilder):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cdkini I feel that it makes sense to do something here that will make this method applicable to any and all column domain builders. The question that arises would be: what do we do for custom-built column type domain builders, which users might develop (and we have one example of it in tests, which @anthonyburdi developed). Might it make sense to set up a top-level column domain builder class and require any column domain builder to extend it? Happy to brainstorm. Thanks!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're referring to MyCustomSemanticTypeColumnDomainBuilder?

I think there are two reasonable solutions:

  • Require the user to subclass the ColumnDomainBuilder - I think not doing so is a configuration error. How do we enforce that?
  • Always apply the batch request overwriting (regardless of DomainBuilder subclass type) and only use it in the applicable cases - this may be confusing but would be easier to implement.

domain_builder._batch_request = copy.deepcopy(batch_request)
domain_builder._batch_request.data_connector_query = {"index": -1}
cdkini marked this conversation as resolved.
Show resolved Hide resolved

parameter_builders = rule.parameter_builders
if parameter_builders:
for parameter_builder in parameter_builders:
parameter_builder._batch_request = copy.deepcopy(batch_request)
cdkini marked this conversation as resolved.
Show resolved Hide resolved
cdkini marked this conversation as resolved.
Show resolved Hide resolved

def reconcile_profiler_variables(
self, variables: Optional[Dict[str, Any]] = None
) -> Optional[ParameterContainer]:
Expand Down Expand Up @@ -816,6 +853,25 @@ def run_profiler(

return result

@staticmethod
def run_profiler_on_data(
data_context: "DataContext", # noqa: F821
profiler_store: ProfilerStore,
batch_request: Union[dict, BatchRequest, RuntimeBatchRequest],
cdkini marked this conversation as resolved.
Show resolved Hide resolved
Comment on lines +822 to +826
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same note about downstream arguments - should this be streamlined?

name: Optional[str] = None,
ge_cloud_id: Optional[str] = None,
) -> ExpectationSuite:
profiler: RuleBasedProfiler = RuleBasedProfiler.get_profiler(
data_context=data_context,
profiler_store=profiler_store,
name=name,
ge_cloud_id=ge_cloud_id,
)
profiler.reconcile_batch_requests_in_builders(batch_request)
cdkini marked this conversation as resolved.
Show resolved Hide resolved

result: ExpectationSuite = profiler.run()
return result

@staticmethod
def add_profiler(
config: RuleBasedProfilerConfig,
Expand Down
4 changes: 4 additions & 0 deletions tests/data_context/test_data_context_profilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,7 @@ def test_run_profiler_with_dynamic_arguments_emits_proper_usage_stats(
}
)
]


def test_run_profiler_on_data_emits_proper_usage_stats():
pass # TBD
cdkini marked this conversation as resolved.
Show resolved Hide resolved
12 changes: 12 additions & 0 deletions tests/rule_based_profiler/test_rule_based_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -701,3 +701,15 @@ def test_list_profilers_in_cloud_mode(mock_profiler_store: mock.MagicMock):

assert res == keys
assert store.list_keys.called


def test_reconcile_batch_requests_in_builders_replaces_batch_requests():
pass # TBD


def test_reconcile_batch_requests_in_builders_does_not_replace_batch_requests():
pass # TBD


def test_run_profiler_on_data():
pass # TBD