This repository has been archived by the owner on Jul 3, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 37
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This is the first take at the initial data quality decorator. A few components: 1. check_outputs decorator -- this enables us to run a few default decorators 2. the DataValidator base class -- this allows us to have extensible data validators 3. the DefaultDataValidator base class -- this allows us to have a few default validators that map to args of check_outputs 4. some basic default data validators All is tested so far. Upcoming is: 1. round out the list of default data validators 2. Add documentation for check_output 3. Add end-to-end tests 4. Configure log/warn levels 5. Add documentation for extending validators
- Loading branch information
1 parent
80c55c1
commit 5eb2e98
Showing
7 changed files
with
554 additions
and
1 deletion.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
import abc | ||
from typing import Type, Any, List, Dict | ||
|
||
import dataclasses | ||
|
||
|
||
class DataValidationError(Exception):
    """Exception type for data validation failures.

    Derives from ``Exception`` so that instances can be used with ``raise`` and
    ``except``; a plain class that does not inherit from ``BaseException``
    cannot be raised.
    """
|
||
|
||
@dataclasses.dataclass
class ValidationResult:
    """Outcome of running a validator against a piece of data."""

    # Whether or not the data passed the validation.
    passes: bool
    # Error message or success message.
    message: str
    # Any extra diagnostics information needed, free-form. A fresh dict is
    # created for each instance when none is supplied.
    diagnostics: Dict[str, Any] = dataclasses.field(default_factory=dict)
|
||
|
||
class DataValidator(abc.ABC):
    """Abstract base class for a data quality validator.

    Concrete validators implement ``applies_to``, ``description``, ``name`` and
    ``validate``; ``required_config`` and ``dependencies`` default to empty lists.
    """

    # Importance levels
    WARN = 'warn'
    FAIL = 'fail'

    VALID_IMPORTANCES = {WARN, FAIL}  # TODO -- think through the API

    def __init__(self, importance: str):
        """Store the importance level of this validator.

        The value is stored as given; it is not checked against
        ``VALID_IMPORTANCES`` here (see ``validate_importance_level``).

        :param importance: The importance level for this validator.
        """
        self._importance = importance

    @property
    def importance(self) -> str:
        """The importance level passed at construction time."""
        return self._importance

    @staticmethod
    def validate_importance_level(importance: str):
        """Check that ``importance`` is one of the valid importance levels.

        :param importance: The importance level to check.
        :raises ValueError: If the level is not in ``VALID_IMPORTANCES``.
        """
        if importance in DataValidator.VALID_IMPORTANCES:
            return
        raise ValueError(f'Importance level must be one of: {DataValidator.VALID_IMPORTANCES}')

    @abc.abstractmethod
    def applies_to(self, datatype: Type[Type]) -> bool:
        """Whether or not this data validator can apply to the specified datatype.

        :param datatype: The type to check.
        :return: True if it can be run on the specified type, False otherwise.
        """

    @abc.abstractmethod
    def description(self) -> str:
        """Gives a description of this validator. E.G.
        `Checks whether the entire dataset lies between 0 and 1.`
        Note it should be able to access internal state (E.G. constructor arguments).

        :return: The description of the validator as a string.
        """

    @abc.abstractmethod
    def name(self) -> str:
        """Returns the name for this validator."""

    @abc.abstractmethod
    def validate(self, dataset: Any) -> ValidationResult:
        """Performs the validation on the given dataset.

        :param dataset: The data to validate.
        :return: A ValidationResult describing the outcome.
        """

    def required_config(self) -> List[str]:
        """Gets the required configuration items. These are likely passed in at
        construction (E.G. in the constructor parameters).

        :return: A list of required configurations; empty by default.
        """
        return []

    def dependencies(self) -> List[str]:
        """Nodes upon which this depends. For example, this might depend on a
        node that provides the output from the last run of this DAG to execute
        an auto-correlation.

        :return: The list of node-name dependencies; empty by default.
        """
        return []
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
import abc | ||
import numbers | ||
from typing import Any, Type, List, Optional, Tuple | ||
|
||
from hamilton.data_quality.base import DataValidator, ValidationResult | ||
import pandas as pd | ||
|
||
|
||
class BaseDefaultValidator(DataValidator, abc.ABC):
    """Base class for a default validator.

    These are all validators that utilize a single argument to be passed to the
    decorator check_output. check_output can thus delegate to multiple of these.
    This is an internal abstraction to allow for easy creation of validators.
    """

    @classmethod
    @abc.abstractmethod
    def applies_to(cls, datatype: Type[Type]) -> bool:
        """Class-level check of whether this validator can run on ``datatype``."""

    @abc.abstractmethod
    def description(self) -> str:
        """Return a description of this validator as a string."""

    @abc.abstractmethod
    def validate(self, data: Any) -> ValidationResult:
        """Run the validation against ``data`` and return a ValidationResult."""

    @classmethod
    @abc.abstractmethod
    def arg(cls) -> str:
        """Yields a string that represents this validator's argument.

        @check_output() will be passed a series of kwargs, each one of which
        will correspond to one of these default validators. Note that we have
        the limitation of allowing just a single argument.

        :return: The argument that this needs.
        """
|
||
|
||
class DataInRangeValidatorPandas(BaseDefaultValidator):
    """Validator that checks the values of a pandas Series against an inclusive range."""

    def name(self) -> str:
        """Returns the name for this validator."""
        return 'data_in_range_validator'

    def __init__(self, range: Tuple[float, float], importance: str):
        """Data validator that tells if data is in a range. This applies to pandas Series.

        :param range: Inclusive (min, max) range the values must fall within.
        :param importance: Importance level, forwarded to the base class constructor.
        """
        # Zero-argument super() is bound to this instance. The one-argument form
        # ``super(Cls)`` yields an unbound proxy whose ``__init__`` is super's own
        # initializer, which rejects keyword arguments with a TypeError.
        super().__init__(importance=importance)
        self.range = range

    @classmethod
    def arg(cls) -> str:
        """The check_output keyword argument this validator is configured by."""
        return 'range'

    @classmethod
    def applies_to(cls, datatype: Type[Type]) -> bool:
        """True when ``datatype`` is ``pd.Series`` or a subclass of it."""
        return issubclass(datatype, pd.Series)  # TODO -- handle dataframes?

    def description(self) -> str:
        """Describes the range check using the configured bounds."""
        return f'Validates that the datapoint falls within the range ({self.range[0]}, {self.range[1]})'

    def validate(self, data: pd.Series) -> ValidationResult:
        """Counts the values inside and outside the inclusive range.

        The result passes when no value lies outside the range; an empty series
        therefore passes with both counts equal to zero.

        :param data: The series to validate.
        :return: A ValidationResult whose diagnostics hold the range, the
            in-range and out-of-range counts and the size of the data.
        """
        min_, max_ = self.range
        # Explicit comparisons give the inclusive-on-both-ends check without
        # relying on the ``inclusive`` argument of ``Series.between``, whose
        # accepted values differ across pandas versions.
        in_range_mask = (data >= min_) & (data <= max_)
        # Derive the counts from the mask rather than indexing value_counts():
        # a True/False key is absent when no value falls in that bucket, which
        # raised KeyError for fully valid (or fully invalid, or empty) series.
        in_range = int(in_range_mask.sum())
        out_range = len(data) - in_range
        passes = out_range == 0
        message = f'Series contains {in_range} values in range ({min_},{max_}), and {out_range} outside.'
        return ValidationResult(
            passes=passes,
            message=message,
            diagnostics={
                'range': self.range,
                'in_range': in_range,
                'out_range': out_range,
                'data_size': len(data),
            },
        )
|
||
|
||
class DataInRangeValidatorPrimitives(BaseDefaultValidator):
    """Validator that checks a single real number against an inclusive range."""

    def __init__(self, range: Tuple[float, float], importance: str):
        """Data validator that tells if data is in a range. This applies to primitives (ints, floats).

        :param range: Inclusive (min, max) range the value must fall within.
        :param importance: Importance level, forwarded to the base class constructor.
        """
        # Zero-argument super() is bound to this instance. The one-argument form
        # ``super(Cls)`` yields an unbound proxy whose ``__init__`` is super's own
        # initializer, which rejects keyword arguments with a TypeError.
        super().__init__(importance=importance)
        self.range = range

    @classmethod
    def applies_to(cls, datatype: Type[Type]) -> bool:
        """True when ``datatype`` is a subclass of ``numbers.Real``."""
        return issubclass(datatype, numbers.Real)

    def description(self) -> str:
        """Describes the range check using the configured bounds."""
        return f'Validates that the datapoint falls within the range ({self.range[0]}, {self.range[1]})'

    def validate(self, data: numbers.Real) -> ValidationResult:
        """Checks that ``data`` lies within the inclusive range.

        :param data: The value to validate.
        :return: A ValidationResult whose diagnostics hold the range and the value.
        """
        min_, max_ = self.range
        passes = min_ <= data <= max_
        if passes:
            message = f'Data point {data} falls within acceptable range: ({min_}, {max_})'
        else:
            message = f'Data point {data} does not fall within acceptable range: ({min_}, {max_})'
        return ValidationResult(
            passes=passes,
            message=message,
            diagnostics={
                'range': self.range,
                'value': data,
            },
        )

    @classmethod
    def arg(cls) -> str:
        """The check_output keyword argument this validator is configured by."""
        return 'range'

    def name(self) -> str:
        """Returns the name for this validator."""
        return 'data_in_range_validator'
|
||
|
||
# Default validator classes, searched in this order when resolving a
# (kwarg, type) combination; the first class that matches is used.
AVAILABLE_DEFAULT_VALIDATORS = [
    DataInRangeValidatorPandas,
    DataInRangeValidatorPrimitives,
]
|
||
|
||
def resolve_default_validators(
        output_type: Type[Type],
        importance: str,
        available_validators: List[Type[BaseDefaultValidator]] = None,
        **default_validator_kwargs) -> List[BaseDefaultValidator]:
    """Resolves default validators given a set of parameters and the type to which they apply.

    Each (kwarg, type) combination should map to a validator: for every kwarg,
    the first available validator class whose ``arg()`` equals the kwarg name
    and which applies to ``output_type`` is instantiated with that kwarg and
    the importance level.

    @param output_type: The type to which the validator should apply
    @param importance: importance level of the validator to instantiate
    @param available_validators: The available validators to choose from;
        defaults to AVAILABLE_DEFAULT_VALIDATORS when None
    @param default_validator_kwargs: Kwargs to use
    @return: A list of validators to use, one per kwarg, in kwarg order
    @raise ValueError: If no available validator matches a kwarg/type combination
    """
    candidates = AVAILABLE_DEFAULT_VALIDATORS if available_validators is None else available_validators
    resolved = []
    for arg_name, arg_value in default_validator_kwargs.items():
        # First match wins; arg() is compared before applies_to() is consulted.
        matching_cls = next(
            (cls for cls in candidates if arg_name == cls.arg() and cls.applies_to(output_type)),
            None,
        )
        if matching_cls is None:
            raise ValueError(f'No registered subclass of BaseDefaultValidator is available '
                             f'for arg: {arg_name} and type {output_type}. This either means (a) this arg-type '
                             f"contribution isn't supported or (b) this has not been added yet (but should be). "
                             f'In the case of (b), we welcome contributions. Get started at github.com/stitchfix/hamilton')
        resolved.append(matching_cls(**{arg_name: arg_value, 'importance': importance}))
    return resolved
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.