Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEATURE] RegexPatternStringParameterBuilder for RuleBasedProfiler #4167

Merged
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
10ef10d
first push
Feb 10, 2022
f5cddbb
push EOD
Feb 10, 2022
61c6395
push of proposed refactor
Feb 11, 2022
349d057
Merge branch 'develop' into feature/GREAT-494/refactor-rbp-tests
Feb 11, 2022
48ae15d
Merge branch 'develop' into feature/GREAT-494/refactor-rbp-tests
Feb 11, 2022
3a254e8
tests pass
Feb 11, 2022
008d700
Update test_user_workflow_fixtures.py
Feb 11, 2022
155ce85
Update conftest.py
Feb 11, 2022
5ca0a80
bugfix
Feb 11, 2022
e79fdc3
Merge branch 'develop' into feature/GREAT-494/refactor-rbp-tests
Feb 11, 2022
fa13803
pushing other branch changes
Feb 11, 2022
437826f
push EOD
Feb 12, 2022
b35fa97
pushing fix
Feb 12, 2022
8113c66
bugfix
Feb 13, 2022
5f5b9f3
final fix
Feb 13, 2022
352578d
final clean up before writing final test
Feb 13, 2022
11af351
final clean up
Feb 13, 2022
c350925
update fixture
Feb 13, 2022
a4896a2
Merge branch 'develop' into feature/GREAT-494/regular-expression-para…
Feb 16, 2022
8d779dc
Delete conftest.py
Feb 16, 2022
0e8b8c3
before adding avengers tests
Feb 17, 2022
756a5bb
Merge branch 'develop' into feature/GREAT-494/regular-expression-para…
Feb 17, 2022
1f65cab
update for final review
Feb 17, 2022
2e0a785
last bit of clean up
Feb 17, 2022
acf0c1e
Update tests/rule_based_profiler/parameter_builder/test_regex_pattern…
Feb 17, 2022
860180b
after review
Feb 17, 2022
5056284
Merge branch 'develop' into feature/GREAT-494/regular-expression-para…
Feb 18, 2022
302756b
after review by Sherstinsky
Feb 18, 2022
cf32939
added period
Feb 18, 2022
6e2e7c5
Update regex_pattern_string_parameter_builder.py
Feb 18, 2022
8c842a0
update test
Feb 18, 2022
270058f
Merge branch 'develop' into feature/GREAT-494/regular-expression-para…
Feb 18, 2022
8307fd0
updated output to handle multiple matches
Feb 18, 2022
2bec72b
pushing after final review
Feb 18, 2022
6ba7efc
Update test_regex_pattern_string_parameter_builder.py
Feb 19, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
from .parameter_builder import ParameterBuilder # isort:skip
from .metric_multi_batch_parameter_builder import ( # isort:skip
MetricMultiBatchParameterBuilder,
from great_expectations.rule_based_profiler.parameter_builder.parameter_builder import ( # isort:skip
ParameterBuilder,
)
from .numeric_metric_range_multi_batch_parameter_builder import ( # isort:skip
NumericMetricRangeMultiBatchParameterBuilder,
from great_expectations.rule_based_profiler.parameter_builder.regex_pattern_string_parameter_builder import (
RegexPatternStringParameterBuilder,
Comment on lines +4 to +5
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

non-relative imports added here

)
from .simple_date_format_string_parameter_builder import (
from great_expectations.rule_based_profiler.parameter_builder.simple_date_format_string_parameter_builder import (
SimpleDateFormatStringParameterBuilder,
)

from great_expectations.rule_based_profiler.parameter_builder.metric_multi_batch_parameter_builder import ( # isort:skip
MetricMultiBatchParameterBuilder,
)
from great_expectations.rule_based_profiler.parameter_builder.numeric_metric_range_multi_batch_parameter_builder import ( # isort:skip
NumericMetricRangeMultiBatchParameterBuilder,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
import logging
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union

import numpy as np

import great_expectations.exceptions as ge_exceptions
from great_expectations.core.batch import BatchRequest, RuntimeBatchRequest
from great_expectations.rule_based_profiler.parameter_builder.parameter_builder import (
MetricComputationResult,
ParameterBuilder,
)
from great_expectations.rule_based_profiler.types import (
Domain,
ParameterContainer,
build_parameter_container,
)
from great_expectations.rule_based_profiler.util import (
get_parameter_value_and_validate_return_type,
)
from great_expectations.validator.validator import Validator

logger = logging.getLogger(__name__)


class RegexPatternStringParameterBuilder(ParameterBuilder):
"""
Detects the domain REGEX from a set of candidate REGEX strings by computing the
column_values.match_regex_format.unexpected_count metric for each candidate format and returning the format that
has the lowest unexpected_count ratio.
"""

# list of candidate strings that are most commonly used
# source: https://regexland.com/most-common-regular-expressions/
# source for UUID: https://stackoverflow.com/questions/7905929/how-to-test-valid-uuid-guid/13653180#13653180
CANDIDATE_REGEX: Set[str] = {
r"/\d+/", # whole number with 1 or more digits
r"/-?\d+/", # negative whole numbers
r"/-?\d+(\.\d*)?/", # decimal numbers with . (period) separator
r"/[A-Za-z0-9\.,;:!?()\"'%\-]+/", # general text
r"^\s+/", # leading space
r"\s+/$", # trailing space
r"/https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#()?&//=]*)/", # Matching URL (including http(s) protocol)
r"/<\/?(?:p|a|b|img)(?: \/)?>/", # HTML tags
r"/(?:25[0-5]|2[0-4]\d|[01]\d{2}|\d{1,2})(?:.(?:25[0-5]|2[0-4]\d|[01]\d{2}|\d{1,2})){3}/", # IPv4 IP address
r"/(?:[A-Fa-f0-9]){0,4}(?: ?:? ?(?:[A-Fa-f0-9]){0,4}){0,7}/", # IPv6 IP address,
r"\b[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}-[0-5][0-9a-fA-F]{3}-[089ab][0-9a-fA-F]{3}-\b[0-9a-fA-F]{12}\b ", # UUID
}

def __init__(
self,
name: str,
metric_domain_kwargs: Optional[Union[str, dict]] = None,
metric_value_kwargs: Optional[Union[str, dict]] = None,
threshold: Union[float, str] = 1.0,
candidate_regexes: Optional[Union[Iterable[str], str]] = None,
data_context: Optional["DataContext"] = None,
batch_request: Optional[Union[BatchRequest, RuntimeBatchRequest, dict]] = None,
):
"""
Configure this RegexPatternStringParameterBuilder
Shinnnyshinshin marked this conversation as resolved.
Show resolved Hide resolved
Args:
name: the name of this parameter -- this is user-specified parameter name (from configuration);
it is not the fully-qualified parameter name; a fully-qualified parameter name must start with "$parameter."
and may contain one or more subsequent parts (e.g., "$parameter.<my_param_from_config>.<metric_name>").
threshold: the ratio of values that must match a format string for it to be accepted
candidate_regexes: a list of candidate regex strings that will REPLACE the default
data_context: DataContext
batch_request: specified in ParameterBuilder configuration to get Batch objects for parameter computation.
"""
super().__init__(
name=name,
data_context=data_context,
batch_request=batch_request,
)

self._metric_domain_kwargs = metric_domain_kwargs
Shinnnyshinshin marked this conversation as resolved.
Show resolved Hide resolved
self._metric_value_kwargs = metric_value_kwargs

self._threshold = threshold
Shinnnyshinshin marked this conversation as resolved.
Show resolved Hide resolved

self._candidate_regexes = candidate_regexes

@property
def metric_domain_kwargs(self) -> Optional[Union[str, dict]]:
return self._metric_domain_kwargs

@property
def metric_value_kwargs(self) -> Optional[Union[str, dict]]:
return self._metric_value_kwargs

@property
def threshold(self) -> Union[str, float]:
return self._threshold

@property
def candidate_regexes(
self,
) -> Union[
str,
Union[
Set[str], List[str], "RegexPatternStringParameterBuilder.CANDIDATE_REGEX"
],
]: # noqa: F821
return self._candidate_regexes

def _build_parameters(
self,
parameter_container: ParameterContainer,
domain: Domain,
variables: Optional[ParameterContainer] = None,
parameters: Optional[Dict[str, ParameterContainer]] = None,
) -> ParameterContainer:
"""
Check the percentage of values matching the REGEX string, and return the best fit, or None if no
string exceeds the configured threshold.

:return: ParameterContainer object that holds ParameterNode objects with attribute name-value pairs and optional details
"""
metric_computation_result: MetricComputationResult

metric_values: np.ndarray

metric_computation_result: MetricComputationResult = self.get_metrics(
metric_name="column_values.nonnull.count",
metric_domain_kwargs=self.metric_domain_kwargs,
metric_value_kwargs=self.metric_value_kwargs,
domain=domain,
variables=variables,
parameters=parameters,
)
metric_values = metric_computation_result.metric_values
# Now obtain 1-dimensional vector of values of computed metric (each element corresponds to a Batch ID).
metric_values = metric_values[:, 0]

nonnull_count: int = sum(metric_values)

regex_string_success_ratios: dict = {}

# Obtain candidate_regexes from "rule state" (i.e, variables and parameters); from instance variable otherwise.
candidate_regexes: Union[
Set[str],
List[str],
"RegexPatternStringParameterBuilder.CANDIDATE_REGEX", # noqa: F821
] = get_parameter_value_and_validate_return_type(
domain=domain,
parameter_reference=self.candidate_regexes,
expected_return_type=None,
variables=variables,
parameters=parameters,
)
if candidate_regexes is not None and isinstance(candidate_regexes, list):
candidate_regexes = set(candidate_regexes)
else:
candidate_regexes = RegexPatternStringParameterBuilder.CANDIDATE_REGEX

regex_string: str
match_regex_metric_value_kwargs: dict
for regex_string in candidate_regexes:
if self.metric_value_kwargs:
match_regex_metric_value_kwargs: dict = {
**self._metric_value_kwargs,
**{"regex": regex_string},
}
else:
match_regex_metric_value_kwargs: dict = {"regex": regex_string}

metric_computation_result: MetricComputationResult = self.get_metrics(
metric_name="column_values.match_regex.unexpected_count",
metric_domain_kwargs=self.metric_domain_kwargs,
metric_value_kwargs=match_regex_metric_value_kwargs,
domain=domain,
variables=variables,
parameters=parameters,
)
metric_values = metric_computation_result.metric_values
# Now obtain 1-dimensional vector of values of computed metric (each element corresponds to a Batch ID).

metric_values = metric_values[:, 0]
match_regex_unexpected_count: int = sum(metric_values)
success_ratio: float = (
nonnull_count - match_regex_unexpected_count
) / nonnull_count
regex_string_success_ratios[regex_string] = success_ratio
Comment on lines +167 to +183
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As with all parameterbuilders, we need to be sure we're not computing metrics one-at-a-time like this. Get the list of all the metrics you want, then ask the validator for all of them at the same time.

Copy link
Contributor Author

@Shinnnyshinshin Shinnnyshinshin Feb 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jcampbell good point, I know we talked about this during our Arch Review meeting. This change actually requires a change across all ParameterBuilders, that we are tracking as part of GREAT-584, and would require a refactor/optimization that would be better-suited for a separate PR. Would that be ok?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain that change more? I'm not seeing anything here that could require an upstream change

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be a change in def get_metrics() in the ParameterBuilders that calls get_metrics() in Validator. It requires building and parsing a dict (currently a single value is passed in) that will be passed from the ParameterBuilder to the Validator.

Since this is a change at the base ParameterBuilders, it is something that would affect to both Regex and SimpleDataFormatString ParameterBuilders.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alexsherstinsky thank you for the synchronous discussion on this


best_regex_string: Optional[str] = None
best_ratio: float = 0.0
# Obtain threshold from "rule state" (i.e., variables and parameters); from instance variable otherwise.
threshold: float = get_parameter_value_and_validate_return_type(
domain=domain,
parameter_reference=self._threshold,
expected_return_type=float,
variables=variables,
parameters=parameters,
)

regex_string: str
ratio: float
for regex_string, ratio in regex_string_success_ratios.items():
if ratio > best_ratio and ratio >= threshold:
best_regex_string = regex_string
best_ratio = ratio

parameter_values: Dict[str, Any] = {
Shinnnyshinshin marked this conversation as resolved.
Show resolved Hide resolved
f"$parameter.{self.name}": {
"value": best_regex_string,
"details": {"success_ratio": best_ratio},
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. For value instead of the single best, let's include all of the ones that are above the target threshold, sorted by ratio.
  2. Let's include the success ratios for all of the regexes that we tried.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jcampbell to clarify : would 2 include all regexes above the threshold? Or the full list? (I'm understanding 1 and 2 to be describing slightly different things)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated PR to include all REGEX values and success ratios that pass threshold. Added a test that tests multiple matches too

},
}

build_parameter_container(
Shinnnyshinshin marked this conversation as resolved.
Show resolved Hide resolved
parameter_container=parameter_container, parameter_values=parameter_values
)
return parameter_container
Original file line number Diff line number Diff line change
Expand Up @@ -151,13 +151,12 @@ def _build_parameters(
domain: Domain,
variables: Optional[ParameterContainer] = None,
parameters: Optional[Dict[str, ParameterContainer]] = None,
):
) -> ParameterContainer:
"""
Check the percentage of values matching each string, and return the best fit, or None if no
string exceeds the configured threshold.

:return: ParameterContainer object that holds ParameterNode objects with attribute name-value pairs and
ptional details
:return: ParameterContainer object that holds ParameterNode objects with attribute name-value pairs and optional details
"""
metric_computation_result: MetricComputationResult

Expand Down Expand Up @@ -200,16 +199,16 @@ def _build_parameters(
match_strftime_metric_value_kwargs: dict
for fmt_string in candidate_strings:
if self.metric_value_kwargs:
match_strftime_metric_value_kwargs = {
match_strftime_metric_value_kwargs: dict = {
**self.metric_value_kwargs,
**{"strftime_format": fmt_string},
}
else:
match_strftime_metric_value_kwargs = {
match_strftime_metric_value_kwargs: dict = {
"strftime_format": fmt_string,
}

metric_computation_result = self.get_metrics(
metric_computation_result: MetricComputationResult = self.get_metrics(
metric_name="column_values.match_strftime_format.unexpected_count",
metric_domain_kwargs=self.metric_domain_kwargs,
metric_value_kwargs=match_strftime_metric_value_kwargs,
Expand All @@ -222,9 +221,10 @@ def _build_parameters(
metric_values = metric_values[:, 0]

match_strftime_unexpected_count: int = sum(metric_values)
format_string_success_ratios[fmt_string] = (
success_ratio: float = (
nonnull_count - match_strftime_unexpected_count
) / nonnull_count
format_string_success_ratios[fmt_string] = success_ratio

best_fmt_string: Optional[str] = None
best_ratio: float = 0.0
Expand Down Expand Up @@ -255,3 +255,4 @@ def _build_parameters(
build_parameter_container(
parameter_container=parameter_container, parameter_values=parameter_values
)
return parameter_container
79 changes: 78 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6173,7 +6173,81 @@ def bobby_columnar_table_multi_batch(empty_data_context):
}
),
]

my_column_regex_rule_expectation_configurations_oneshot_sampling_method: List[
ExpectationConfiguration
] = [
ExpectationConfiguration(
**{
"expectation_type": "expect_column_values_to_match_regex",
"kwargs": {
"column": "VendorID",
"regex": {"value": r"^\d{1}$", "details": {"success_ratio": 1.0}},
},
"meta": {
"notes": {
"format": "markdown",
"content": [
"### This expectation confirms that fields ending in ID are of the format detected by parameter builder RegexPatternStringParameterBuilder"
],
}
},
}
),
ExpectationConfiguration(
**{
"expectation_type": "expect_column_values_to_match_regex",
"meta": {"notes": {"format": "markdown", "content": None}},
"kwargs": {
"column": "RatecodeID",
"regex": {"value": r"^\d{1}$", "details": {"success_ratio": 1.0}},
},
"meta": {
"notes": {
"format": "markdown",
"content": [
"### This expectation confirms that fields ending in ID are of the format detected by parameter builder RegexPatternStringParameterBuilder"
],
}
},
}
),
ExpectationConfiguration(
**{
"expectation_type": "expect_column_values_to_match_regex",
"meta": {"notes": {"format": "markdown", "content": None}},
"kwargs": {
"column": "PULocationID",
"regex": {"value": r"^\d{1}$", "details": {"success_ratio": 1.0}},
},
"meta": {
"notes": {
"format": "markdown",
"content": [
"### This expectation confirms that fields ending in ID are of the format detected by parameter builder RegexPatternStringParameterBuilder"
],
}
},
}
),
ExpectationConfiguration(
**{
"expectation_type": "expect_column_values_to_match_regex",
"meta": {"notes": {"format": "markdown", "content": None}},
"kwargs": {
"column": "DOLocationID",
"regex": {"value": r"^\d{1}$", "details": {"success_ratio": 1.0}},
},
"meta": {
"notes": {
"format": "markdown",
"content": [
"### This expectation confirms that fields ending in ID are of the format detected by parameter builder RegexPatternStringParameterBuilder"
],
}
},
}
),
]
expectation_configurations: List[ExpectationConfiguration] = []

expectation_configurations.extend(
Expand All @@ -6186,6 +6260,9 @@ def bobby_columnar_table_multi_batch(empty_data_context):
my_column_timestamps_rule_expectation_configurations_oneshot_sampling_method
)

expectation_configurations.extend(
my_column_regex_rule_expectation_configurations_oneshot_sampling_method
)
expectation_suite_name_oneshot_sampling_method: str = (
"bobby_columnar_table_multi_batch_oneshot_sampling_method"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,6 @@ def test_alice_profiler_user_workflow_single_batch(
],
include_citation=True,
)

assert (
expectation_suite
== alice_columnar_table_single_batch["expected_expectation_suite"]
Expand Down
Loading