Skip to content

Commit

Permalink
SAT: stricter integer validation (#19820)
Browse files Browse the repository at this point in the history
  • Loading branch information
alafanechere authored Nov 29, 2022
1 parent f262bef commit 9cb4fbf
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 9 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Changelog

## 0.2.20
Stricter integer field schema validation. [#19820](https://github.com/airbytehq/airbyte/pull/19820).

## 0.2.19
Test for exposed secrets: const values can not hold secrets. [#19465](https://github.com/airbytehq/airbyte/pull/19465).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ COPY pytest.ini setup.py ./
COPY source_acceptance_test ./source_acceptance_test
RUN pip install .

LABEL io.airbyte.version=0.2.19
LABEL io.airbyte.version=0.2.20
LABEL io.airbyte.name=airbyte/source-acceptance-test

ENTRYPOINT ["python", "-m", "pytest", "-p", "source_acceptance_test.plugin", "-r", "fEsx"]
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import pendulum
from airbyte_cdk.models import AirbyteRecordMessage, ConfiguredAirbyteCatalog
from jsonschema import Draft7Validator, FormatChecker, FormatError, ValidationError
from jsonschema import Draft7Validator, FormatChecker, FormatError, ValidationError, validators

# fmt: off
timestamp_regex = re.compile((r"^\d{4}-\d?\d-\d?\d" # date
Expand All @@ -18,6 +18,11 @@
r".*$")) # timezone
# fmt: on

# In Json schema, numbers with a zero fractional part are considered integers. E.G. 1.0 is considered a valid integer
# For stricter type validation we don't want to keep this behavior. We want to consider integers in the Pythonic way.
strict_integer_type_checker = Draft7Validator.TYPE_CHECKER.redefine("integer", lambda _, value: isinstance(value, int))
Draft7ValidatorWithStrictInteger = validators.extend(Draft7Validator, type_checker=strict_integer_type_checker)


class CustomFormatChecker(FormatChecker):
@staticmethod
Expand Down Expand Up @@ -45,14 +50,14 @@ def verify_records_schema(
"""Check records against their schemas from the catalog, yield error messages.
Only first record with error will be yielded for each stream.
"""
validators = {}
stream_validators = {}
for stream in catalog.streams:
validators[stream.stream.name] = Draft7Validator(stream.stream.json_schema, format_checker=CustomFormatChecker())

stream_validators[stream.stream.name] = Draft7ValidatorWithStrictInteger(
stream.stream.json_schema, format_checker=CustomFormatChecker()
)
stream_errors = defaultdict(dict)

for record in records:
validator = validators.get(record.stream)
validator = stream_validators.get(record.stream)
if not validator:
logging.error(f"Record from the {record.stream} stream that is not in the catalog.")
continue
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def record_schema_fixture():
"properties": {
"text_or_null": {"type": ["null", "string"]},
"number_or_null": {"type": ["null", "number"]},
"integer_or_null": {"type": ["null", "integer"]},
"text": {"type": ["string"]},
"number": {"type": ["number"]},
},
Expand Down Expand Up @@ -66,6 +67,8 @@ def test_verify_records_schema(configured_catalog: ConfiguredAirbyteCatalog):
"text": "text",
"number": "text", # wrong format
},
{"text_or_null": None, "number_or_null": None, "text": "text", "number": 10.3, "integer": 1},
{"text_or_null": None, "number_or_null": None, "text": "text", "number": 10.3, "integer_or_null": 1.0}, # wrong format
]

records = [AirbyteRecordMessage(stream="my_stream", data=record, emitted_at=0) for record in records]
Expand All @@ -75,8 +78,13 @@ def test_verify_records_schema(configured_catalog: ConfiguredAirbyteCatalog):

assert "my_stream" in streams_with_errors
assert len(streams_with_errors) == 1, "only one stream"
assert len(streams_with_errors["my_stream"]) == 3, "only first error for each field"
assert errors == ["123 is not of type 'null', 'string'", "'text' is not of type 'number'", "None is not of type 'string'"]
assert len(streams_with_errors["my_stream"]) == 4, "only first error for each field"
assert errors == [
"123 is not of type 'null', 'string'",
"'text' is not of type 'number'",
"None is not of type 'string'",
"1.0 is not of type 'null', 'integer'",
]


@pytest.mark.parametrize(
Expand Down

0 comments on commit 9cb4fbf

Please sign in to comment.