-
Notifications
You must be signed in to change notification settings - Fork 1.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[FEATURE] RegexPatternStringParameterBuilder
for RuleBasedProfiler
#4167
Changes from 30 commits
10ef10d
f5cddbb
61c6395
349d057
48ae15d
3a254e8
008d700
155ce85
5ca0a80
e79fdc3
fa13803
437826f
b35fa97
8113c66
5f5b9f3
352578d
11af351
c350925
a4896a2
8d779dc
0e8b8c3
756a5bb
1f65cab
2e0a785
acf0c1e
860180b
5056284
302756b
cf32939
6e2e7c5
8c842a0
270058f
8307fd0
2bec72b
6ba7efc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,16 @@ | ||
from .parameter_builder import ParameterBuilder # isort:skip | ||
from .metric_multi_batch_parameter_builder import ( # isort:skip | ||
MetricMultiBatchParameterBuilder, | ||
from great_expectations.rule_based_profiler.parameter_builder.parameter_builder import ( # isort:skip | ||
ParameterBuilder, | ||
) | ||
from .numeric_metric_range_multi_batch_parameter_builder import ( # isort:skip | ||
NumericMetricRangeMultiBatchParameterBuilder, | ||
from great_expectations.rule_based_profiler.parameter_builder.regex_pattern_string_parameter_builder import ( | ||
RegexPatternStringParameterBuilder, | ||
) | ||
from .simple_date_format_string_parameter_builder import ( | ||
from great_expectations.rule_based_profiler.parameter_builder.simple_date_format_string_parameter_builder import ( | ||
SimpleDateFormatStringParameterBuilder, | ||
) | ||
|
||
from great_expectations.rule_based_profiler.parameter_builder.metric_multi_batch_parameter_builder import ( # isort:skip | ||
MetricMultiBatchParameterBuilder, | ||
) | ||
from great_expectations.rule_based_profiler.parameter_builder.numeric_metric_range_multi_batch_parameter_builder import ( # isort:skip | ||
NumericMetricRangeMultiBatchParameterBuilder, | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,213 @@ | ||
import logging | ||
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union | ||
|
||
import numpy as np | ||
|
||
import great_expectations.exceptions as ge_exceptions | ||
from great_expectations.core.batch import BatchRequest, RuntimeBatchRequest | ||
from great_expectations.rule_based_profiler.parameter_builder.parameter_builder import ( | ||
MetricComputationResult, | ||
ParameterBuilder, | ||
) | ||
from great_expectations.rule_based_profiler.types import ( | ||
Domain, | ||
ParameterContainer, | ||
build_parameter_container, | ||
) | ||
from great_expectations.rule_based_profiler.util import ( | ||
get_parameter_value_and_validate_return_type, | ||
) | ||
from great_expectations.validator.validator import Validator | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class RegexPatternStringParameterBuilder(ParameterBuilder): | ||
""" | ||
Detects the domain REGEX from a set of candidate REGEX strings by computing the | ||
column_values.match_regex.unexpected_count metric for each candidate REGEX and returning the REGEX that | ||
has the lowest unexpected_count ratio (provided that its success ratio meets the configured threshold). | ||
""" | ||
|
||
# set of candidate regex strings that are most commonly used | ||
# source: https://regexland.com/most-common-regular-expressions/ | ||
# source for UUID: https://stackoverflow.com/questions/7905929/how-to-test-valid-uuid-guid/13653180#13653180 | ||
CANDIDATE_REGEX: Set[str] = { | ||
r"/\d+/", # whole number with 1 or more digits; ExpectValuesToBeNumeric? (i.e., would you want to emit that expectation?) | ||
r"/-?\d+/", # negative whole numbers | ||
r"/-?\d+(\.\d*)?/", # decimal numbers with . (period) separator | ||
r"/[A-Za-z0-9\.,;:!?()\"'%\-]+/", # general text | ||
r"^\s+/", # leading space | ||
r"\s+/$", # trailing space | ||
r"/https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#()?&//=]*)/", # Matching URL (including http(s) protocol) | ||
r"/<\/?(?:p|a|b|img)(?: \/)?>/", # HTML tags | ||
r"/(?:25[0-5]|2[0-4]\d|[01]\d{2}|\d{1,2})(?:.(?:25[0-5]|2[0-4]\d|[01]\d{2}|\d{1,2})){3}/", # IPv4 IP address | ||
r"/(?:[A-Fa-f0-9]){0,4}(?: ?:? ?(?:[A-Fa-f0-9]){0,4}){0,7}/", # IPv6 IP address, | ||
r"\b[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}-[0-5][0-9a-fA-F]{3}-[089ab][0-9a-fA-F]{3}-\b[0-9a-fA-F]{12}\b ", # UUID | ||
} | ||
|
||
def __init__( | ||
self, | ||
name: str, | ||
metric_domain_kwargs: Optional[Union[str, dict]] = None, | ||
metric_value_kwargs: Optional[Union[str, dict]] = None, | ||
threshold: Union[float, str] = 1.0, | ||
candidate_regexes: Optional[Union[Iterable[str], str]] = None, | ||
data_context: Optional["DataContext"] = None, | ||
batch_request: Optional[Union[BatchRequest, RuntimeBatchRequest, dict]] = None, | ||
): | ||
""" | ||
Configure this RegexPatternStringParameterBuilder | ||
Shinnnyshinshin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Args: | ||
name: the name of this parameter -- this is user-specified parameter name (from configuration); | ||
it is not the fully-qualified parameter name; a fully-qualified parameter name must start with "$parameter." | ||
and may contain one or more subsequent parts (e.g., "$parameter.<my_param_from_config>.<metric_name>"). | ||
threshold: the ratio of values that must match a REGEX string for it to be accepted | ||
candidate_regexes: a list of candidate regex strings that will REPLACE the default CANDIDATE_REGEX set | ||
data_context: DataContext | ||
batch_request: specified in ParameterBuilder configuration to get Batch objects for parameter computation. | ||
""" | ||
super().__init__( | ||
name=name, | ||
data_context=data_context, | ||
batch_request=batch_request, | ||
) | ||
|
||
self._metric_domain_kwargs = metric_domain_kwargs | ||
Shinnnyshinshin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
self._metric_value_kwargs = metric_value_kwargs | ||
|
||
self._threshold = threshold | ||
Shinnnyshinshin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
self._candidate_regexes = candidate_regexes | ||
|
||
@property | ||
def metric_domain_kwargs(self) -> Optional[Union[str, dict]]: | ||
return self._metric_domain_kwargs | ||
|
||
@property | ||
def metric_value_kwargs(self) -> Optional[Union[str, dict]]: | ||
return self._metric_value_kwargs | ||
|
||
@property | ||
def threshold(self) -> Union[str, float]: | ||
return self._threshold | ||
|
||
@property | ||
def candidate_regexes( | ||
self, | ||
) -> Union[ | ||
str, | ||
Union[ | ||
Set[str], List[str], "RegexPatternStringParameterBuilder.CANDIDATE_REGEX" | ||
], | ||
]: # noqa: F821 | ||
return self._candidate_regexes | ||
|
||
def _build_parameters( | ||
self, | ||
parameter_container: ParameterContainer, | ||
domain: Domain, | ||
variables: Optional[ParameterContainer] = None, | ||
parameters: Optional[Dict[str, ParameterContainer]] = None, | ||
) -> ParameterContainer: | ||
""" | ||
Check the percentage of values matching each candidate REGEX string, and return the best fit, or None if no | ||
REGEX string meets the configured threshold. | ||
|
||
:return: ParameterContainer object that holds ParameterNode objects with attribute name-value pairs and optional details | ||
""" | ||
metric_computation_result: MetricComputationResult | ||
|
||
metric_values: np.ndarray | ||
|
||
metric_computation_result: MetricComputationResult = self.get_metrics( | ||
metric_name="column_values.nonnull.count", | ||
metric_domain_kwargs=self.metric_domain_kwargs, | ||
metric_value_kwargs=self.metric_value_kwargs, | ||
domain=domain, | ||
variables=variables, | ||
parameters=parameters, | ||
) | ||
metric_values = metric_computation_result.metric_values | ||
# Now obtain 1-dimensional vector of values of computed metric (each element corresponds to a Batch ID). | ||
metric_values = metric_values[:, 0] | ||
|
||
nonnull_count: int = sum(metric_values) | ||
|
||
regex_string_success_ratios: dict = {} | ||
|
||
# Obtain candidate_regexes from "rule state" (i.e., variables and parameters); from instance variable otherwise. | ||
candidate_regexes: Union[ | ||
Set[str], | ||
List[str], | ||
"RegexPatternStringParameterBuilder.CANDIDATE_REGEX", # noqa: F821 | ||
] = get_parameter_value_and_validate_return_type( | ||
domain=domain, | ||
parameter_reference=self.candidate_regexes, | ||
expected_return_type=None, | ||
variables=variables, | ||
parameters=parameters, | ||
) | ||
if candidate_regexes is not None and isinstance(candidate_regexes, list): | ||
candidate_regexes = set(candidate_regexes) | ||
else: | ||
candidate_regexes = RegexPatternStringParameterBuilder.CANDIDATE_REGEX | ||
|
||
regex_string: str | ||
match_regex_metric_value_kwargs: dict | ||
for regex_string in candidate_regexes: | ||
if self.metric_value_kwargs: | ||
match_regex_metric_value_kwargs: dict = { | ||
**self._metric_value_kwargs, | ||
**{"regex": regex_string}, | ||
} | ||
else: | ||
match_regex_metric_value_kwargs: dict = {"regex": regex_string} | ||
|
||
metric_computation_result: MetricComputationResult = self.get_metrics( | ||
metric_name="column_values.match_regex.unexpected_count", | ||
metric_domain_kwargs=self.metric_domain_kwargs, | ||
metric_value_kwargs=match_regex_metric_value_kwargs, | ||
domain=domain, | ||
variables=variables, | ||
parameters=parameters, | ||
) | ||
metric_values = metric_computation_result.metric_values | ||
# Now obtain 1-dimensional vector of values of computed metric (each element corresponds to a Batch ID). | ||
|
||
metric_values = metric_values[:, 0] | ||
match_regex_unexpected_count: int = sum(metric_values) | ||
success_ratio: float = ( | ||
nonnull_count - match_regex_unexpected_count | ||
) / nonnull_count | ||
regex_string_success_ratios[regex_string] = success_ratio | ||
Comment on lines
+167
to
+183
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As with all parameterbuilders, we need to be sure we're not computing metrics one-at-a-time like this. Get the list of all the metrics you want, then ask the validator for all of them at the same time. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jcampbell good point, I know we talked about this during our Arch Review meeting. This change actually requires a change across all There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you explain that change more? I'm not seeing anything here that could require an upstream change There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be a change in Since this is a change at the base There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @alexsherstinsky thank you for the synchronous discussion on this |
||
|
||
best_regex_string: Optional[str] = None | ||
best_ratio: float = 0.0 | ||
# Obtain threshold from "rule state" (i.e., variables and parameters); from instance variable otherwise. | ||
threshold: float = get_parameter_value_and_validate_return_type( | ||
domain=domain, | ||
parameter_reference=self._threshold, | ||
expected_return_type=float, | ||
variables=variables, | ||
parameters=parameters, | ||
) | ||
|
||
regex_string: str | ||
ratio: float | ||
for regex_string, ratio in regex_string_success_ratios.items(): | ||
if ratio > best_ratio and ratio >= threshold: | ||
best_regex_string = regex_string | ||
best_ratio = ratio | ||
|
||
parameter_values: Dict[str, Any] = { | ||
Shinnnyshinshin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
f"$parameter.{self.name}": { | ||
"value": best_regex_string, | ||
"details": {"success_ratio": best_ratio}, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @jcampbell to clarify: would 2 include all regexes above the threshold? Or the full list? (I'm understanding 1 and 2 to be describing slightly different things) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Updated the PR to include all REGEX values and success ratios that pass the threshold. Added a test that covers multiple matches too. |
||
}, | ||
} | ||
|
||
build_parameter_container( | ||
Shinnnyshinshin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
parameter_container=parameter_container, parameter_values=parameter_values | ||
) | ||
return parameter_container |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
non-relative imports added here