Skip to content

Commit

Permalink
[MAINTENANCE] Instrument test_yaml_config() (great-expectations#2981)
Browse files Browse the repository at this point in the history
Instrument test_yaml_config() and update Anonymizers
  • Loading branch information
anthonyburdi authored and gipaetusb committed Jul 13, 2021
1 parent e86e0e1 commit e274f29
Show file tree
Hide file tree
Showing 26 changed files with 2,753 additions and 172 deletions.
2 changes: 1 addition & 1 deletion docs_rtd/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Develop
- Addition of the "bootstrap" mode of parameter estimation (default) to NumericMetricRangeMultiBatchParameterBuilder
- Initial documentation
* [BUGFIX] Modify read_excel() to handle new optional-dependency openpyxl for pandas >= 1.3.0 #2989

* [MAINTENANCE] Instrumented BaseDataContext.test_yaml_config() and updated Anonymizers

0.13.21
-----------------
Expand Down
34 changes: 34 additions & 0 deletions great_expectations/core/usage_statistics/anonymizers/anonymizer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
from hashlib import md5
from typing import Optional

from great_expectations.util import load_class

Expand Down Expand Up @@ -67,3 +68,36 @@ def anonymize_object_info(
anonymized_info_dict["anonymized_class"] = self.anonymize(object_class_name)

return anonymized_info_dict

def _is_parent_class_recognized(
    self,
    classes_to_check,
    object_=None,
    object_class=None,
    object_config=None,
) -> Optional[str]:
    """
    Check if the parent class is a subclass of any core GE class.

    This private method is intended to be used by anonymizers in a public
    `is_parent_class_recognized()` method. These anonymizers define and provide
    the core GE classes_to_check.

    The class under test is resolved in this order: `object_class` if given,
    otherwise the class of `object_`, otherwise the class loaded from the
    "class_name" / "module_name" entries of `object_config`.

    Args:
        classes_to_check: classes to test against, in order; the first match wins.
        object_: an instance whose class should be checked.
        object_class: the class to check directly.
        object_config: a config exposing `.get("class_name")` and `.get("module_name")`.

    Returns:
        The name of the parent class found, or None if no parent class was found
        (also None if an AttributeError is raised while resolving or checking the class).

    Raises:
        AssertionError: if none of object_, object_class, object_config is truthy.
    """
    # Raised explicitly instead of via `assert` so the validation is not stripped
    # under `python -O`; the exception type and message are unchanged for callers.
    if not (object_ or object_class or object_config):
        raise AssertionError(
            "Must pass either object_ or object_class or object_config."
        )

    try:
        if object_class is None and object_ is not None:
            object_class = object_.__class__
        elif object_class is None and object_config is not None:
            object_class_name = object_config.get("class_name")
            object_module_name = object_config.get("module_name")
            object_class = load_class(object_class_name, object_module_name)

        for class_to_check in classes_to_check:
            if issubclass(object_class, class_to_check):
                return class_to_check.__name__

        return None

    except AttributeError:
        # e.g. object_config has no `.get`; treated as "not recognized".
        return None
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from great_expectations.checkpoint import Checkpoint, SimpleCheckpoint
from great_expectations.core.usage_statistics.anonymizers.anonymizer import Anonymizer


class CheckpointAnonymizer(Anonymizer):
    """Anonymizer for Checkpoint names and configurations."""

    def __init__(self, salt=None):
        super().__init__(salt=salt)

        # Ordered bottom up in terms of inheritance order (most-derived class first).
        self._ge_classes = [SimpleCheckpoint, Checkpoint]

    def anonymize_checkpoint_info(self, name, config):
        """
        Build the anonymized info dict for a checkpoint.

        Args:
            name: the checkpoint name; only its anonymized form is stored.
            config: the checkpoint config, passed to `anonymize_object_info()`.

        Returns:
            A dict holding "anonymized_name" plus whatever `anonymize_object_info()`
            adds to it in place.
        """
        checkpoint_info = {"anonymized_name": self.anonymize(name)}

        self.anonymize_object_info(
            anonymized_info_dict=checkpoint_info,
            ge_classes=self._ge_classes,
            object_config=config,
        )

        return checkpoint_info

    def is_parent_class_recognized(self, config):
        """
        Check `config` against the known Checkpoint classes.

        Returns:
            Whatever `_is_parent_class_recognized()` reports for this config.
        """
        known_checkpoint_classes = self._ge_classes
        return self._is_parent_class_recognized(
            classes_to_check=known_checkpoint_classes,
            object_config=config,
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from great_expectations.core.usage_statistics.anonymizers.anonymizer import Anonymizer
from great_expectations.datasource.data_connector import (
ConfiguredAssetFilePathDataConnector,
ConfiguredAssetFilesystemDataConnector,
ConfiguredAssetS3DataConnector,
ConfiguredAssetSqlDataConnector,
DataConnector,
FilePathDataConnector,
InferredAssetFilePathDataConnector,
InferredAssetFilesystemDataConnector,
InferredAssetS3DataConnector,
InferredAssetSqlDataConnector,
RuntimeDataConnector,
)


class DataConnectorAnonymizer(Anonymizer):
    """Anonymizer for DataConnector names and configurations."""

    def __init__(self, salt=None):
        super().__init__(salt=salt)

        # This list should contain all DataConnector types. When new DataConnector
        # types are created, please make sure to add them ordered bottom up in terms
        # of inheritance order (most-derived class first).
        self._ge_classes = [
            InferredAssetS3DataConnector,
            InferredAssetFilesystemDataConnector,
            InferredAssetFilePathDataConnector,
            InferredAssetSqlDataConnector,
            ConfiguredAssetS3DataConnector,
            ConfiguredAssetFilesystemDataConnector,
            ConfiguredAssetFilePathDataConnector,
            ConfiguredAssetSqlDataConnector,
            RuntimeDataConnector,
            FilePathDataConnector,
            DataConnector,
        ]

    def anonymize_data_connector_info(self, name, config):
        """
        Build the anonymized info dict for a data connector.

        Args:
            name: the data connector name; only its anonymized form is stored.
            config: the data connector config, passed to `anonymize_object_info()`.

        Returns:
            A dict holding "anonymized_name" plus whatever `anonymize_object_info()`
            adds to it in place.
        """
        connector_info = {"anonymized_name": self.anonymize(name)}

        self.anonymize_object_info(
            anonymized_info_dict=connector_info,
            ge_classes=self._ge_classes,
            object_config=config,
        )

        return connector_info

    def is_parent_class_recognized(self, config):
        """
        Check `config` against the known DataConnector classes.

        Returns:
            Whatever `_is_parent_class_recognized()` reports for this config.
        """
        known_connector_classes = self._ge_classes
        return self._is_parent_class_recognized(
            classes_to_check=known_connector_classes,
            object_config=config,
        )
Original file line number Diff line number Diff line change
@@ -1,7 +1,18 @@
from typing import Optional

from great_expectations.core.usage_statistics.anonymizers.anonymizer import Anonymizer
from great_expectations.core.usage_statistics.anonymizers.data_connector_anonymizer import (
DataConnectorAnonymizer,
)
from great_expectations.core.usage_statistics.anonymizers.execution_engine_anonymizer import (
ExecutionEngineAnonymizer,
)
from great_expectations.datasource import (
BaseDatasource,
Datasource,
LegacyDatasource,
PandasDatasource,
SimpleSqlalchemyDatasource,
SparkDFDatasource,
SqlAlchemyDatasource,
)
Expand All @@ -12,21 +23,142 @@ def __init__(self, salt=None):
super().__init__(salt=salt)

# ordered bottom up in terms of inheritance order
self._ge_classes = [
self._legacy_ge_classes = [
PandasDatasource,
SqlAlchemyDatasource,
SparkDFDatasource,
LegacyDatasource,
]

# ordered bottom up in terms of inheritance order
self._ge_classes = [
SimpleSqlalchemyDatasource,
Datasource,
BaseDatasource,
]

self._execution_engine_anonymizer = ExecutionEngineAnonymizer(salt=salt)
self._data_connector_anonymizer = DataConnectorAnonymizer(salt=salt)

def anonymize_datasource_info(self, name, config):
    """
    Build the anonymized info dict for a datasource.

    Legacy (v2 API) datasources only get their object info anonymized. Datasources
    recognized as v3 API additionally get an "anonymized_execution_engine" entry and
    an "anonymized_data_connectors" list. Configs recognized as neither only carry
    "anonymized_name".

    Args:
        name: the datasource name; only its anonymized form is stored.
        config: the datasource config.

    Returns:
        The anonymized info dict.
    """
    datasource_info = {"anonymized_name": self.anonymize(name)}

    # Legacy Datasources (<= v0.12 v2 BatchKwargs API)
    if self.is_parent_class_recognized_v2_api(config=config) is not None:
        self.anonymize_object_info(
            anonymized_info_dict=datasource_info,
            ge_classes=self._legacy_ge_classes,
            object_config=config,
        )
        return datasource_info

    # Anything not recognized as a v3 datasource either is returned with the name only.
    if self.is_parent_class_recognized_v3_api(config=config) is None:
        return datasource_info

    # Datasources (>= v0.13 v3 BatchRequest API), and custom v2 BatchKwargs API
    self.anonymize_object_info(
        anonymized_info_dict=datasource_info,
        ge_classes=self._ge_classes,
        object_config=config,
    )

    # NOTE(review): assumes the config always carries "execution_engine" and
    # "data_connectors"; a missing key raises AttributeError below -- confirm callers.
    engine_config = config.get("execution_engine")
    datasource_info[
        "anonymized_execution_engine"
    ] = self._execution_engine_anonymizer.anonymize_execution_engine_info(
        name=engine_config.get("name", ""),
        config=engine_config,
    )

    connector_configs = config.get("data_connectors")
    anonymized_connectors = []
    for connector_name, connector_config in connector_configs.items():
        anonymized_connectors.append(
            self._data_connector_anonymizer.anonymize_data_connector_info(
                name=connector_name, config=connector_config
            )
        )
    datasource_info["anonymized_data_connectors"] = anonymized_connectors

    return datasource_info

def anonymize_simple_sqlalchemy_datasource(self, name, config):
    """
    SimpleSqlalchemyDatasource requires a separate anonymization scheme.

    Default `module_name` / `class_name` values are filled in on shallow copies of
    the configs, so the caller's `config` (and its nested data connector configs)
    are never modified.

    Args:
        name: the datasource name; only its anonymized form is stored.
        config: the datasource config. Its optional "introspection" and "tables"
            entries map data connector names to data connector configs.

    Returns:
        A dict with "anonymized_name", the entries added by `anonymize_object_info()`,
        a fixed "anonymized_execution_engine" entry, and "anonymized_data_connectors"
        (introspection connectors first, then tables connectors).
    """
    # Shallow copy: defaults added below must not leak into the caller's config.
    config = dict(config)
    if config.get("module_name") is None:
        config["module_name"] = "great_expectations.datasource"

    anonymized_info_dict = {"anonymized_name": self.anonymize(name)}
    self.anonymize_object_info(
        anonymized_info_dict=anonymized_info_dict,
        ge_classes=self._ge_classes,
        object_config=config,
    )

    # Only and directly provide parent_class of execution engine
    anonymized_info_dict["anonymized_execution_engine"] = {
        "parent_class": "SqlAlchemyExecutionEngine"
    }

    # Use the `introspection` and `tables` keys to find data_connectors in
    # SimpleSqlalchemyDatasources; each key implies a default data connector class.
    anonymized_data_connectors = []
    for config_key, default_class_name in (
        ("introspection", "InferredAssetSqlDataConnector"),
        ("tables", "ConfiguredAssetSqlDataConnector"),
    ):
        data_connector_configs = config.get(config_key)
        if data_connector_configs is None:
            continue

        for (
            data_connector_name,
            data_connector_config,
        ) in data_connector_configs.items():
            # Copy before filling in defaults so the nested config stays untouched.
            data_connector_config = dict(data_connector_config)
            if data_connector_config.get("class_name") is None:
                data_connector_config["class_name"] = default_class_name
            if data_connector_config.get("module_name") is None:
                data_connector_config[
                    "module_name"
                ] = "great_expectations.datasource.data_connector"
            anonymized_data_connectors.append(
                self._data_connector_anonymizer.anonymize_data_connector_info(
                    name=data_connector_name, config=data_connector_config
                )
            )

    anonymized_info_dict["anonymized_data_connectors"] = anonymized_data_connectors

    return anonymized_info_dict

def is_parent_class_recognized(self, config) -> Optional[str]:
    """
    Check `config` against every known Datasource class, v3 API classes first,
    then the legacy ones.

    Returns:
        Whatever `_is_parent_class_recognized()` reports for this config.
    """
    all_datasource_classes = self._ge_classes + self._legacy_ge_classes
    return self._is_parent_class_recognized(
        classes_to_check=all_datasource_classes,
        object_config=config,
    )

def is_parent_class_recognized_v2_api(self, config) -> Optional[str]:
    """
    Check `config` against the legacy Datasource classes only.

    Returns:
        Whatever `_is_parent_class_recognized()` reports for this config.
    """
    legacy_classes = self._legacy_ge_classes
    return self._is_parent_class_recognized(
        classes_to_check=legacy_classes,
        object_config=config,
    )

def is_parent_class_recognized_v3_api(self, config) -> Optional[str]:
    """
    Check `config` against the non-legacy Datasource classes only.

    Returns:
        Whatever `_is_parent_class_recognized()` reports for this config.
    """
    current_classes = self._ge_classes
    return self._is_parent_class_recognized(
        classes_to_check=current_classes,
        object_config=config,
    )
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
StoreBackendAnonymizer,
)
from great_expectations.data_context.store import (
CheckpointStore,
ConfigurationStore,
EvaluationParameterStore,
ExpectationsStore,
HtmlSiteStore,
Expand All @@ -17,10 +19,12 @@ def __init__(self, salt=None):
super().__init__(salt=salt)
# ordered bottom up in terms of inheritance order
self._ge_classes = [
CheckpointStore,
ValidationsStore,
ExpectationsStore,
EvaluationParameterStore,
MetricStore,
ConfigurationStore,
Store,
HtmlSiteStore,
]
Expand All @@ -44,3 +48,8 @@ def anonymize_store_info(self, store_name, store_obj):
)

return anonymized_info_dict

def is_parent_class_recognized(self, store_obj):
    """
    Check the class of a store instance against the known Store classes.

    Args:
        store_obj: the store instance to check.

    Returns:
        Whatever `_is_parent_class_recognized()` reports for this object.
    """
    known_store_classes = self._ge_classes
    return self._is_parent_class_recognized(
        classes_to_check=known_store_classes,
        object_=store_obj,
    )
Loading

0 comments on commit e274f29

Please sign in to comment.