Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix SAT failing on similar nested structures with different order. #7491

Merged
merged 4 commits into from
Nov 1, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from jsonschema import validate
from source_acceptance_test.base import BaseTest
from source_acceptance_test.config import BasicReadTestConfig, ConnectionTestConfig
from source_acceptance_test.utils import ConnectorRunner, SecretDict, filter_output, serialize, verify_records_schema
avida marked this conversation as resolved.
Show resolved Hide resolved
from source_acceptance_test.utils import ConnectorRunner, SecretDict, filter_output, make_hashible, verify_records_schema
from source_acceptance_test.utils.json_schema_helper import JsonSchemaHelper, get_expected_schema_structure, get_object_structure


Expand Down Expand Up @@ -308,8 +308,8 @@ def compare_records(
r2 = TestBasicRead.remove_extra_fields(r2, r1)
assert r1 == r2, f"Stream {stream_name}: Mismatch of record order or values"
else:
expected = set(map(serialize, expected))
actual = set(map(serialize, actual))
expected = set(map(make_hashible, expected))
actual = set(map(make_hashible, actual))
missing_expected = set(expected) - set(actual)

if missing_expected:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pytest
from airbyte_cdk.models import Type
from source_acceptance_test.base import BaseTest
from source_acceptance_test.utils import ConnectorRunner, full_refresh_only_catalog, serialize
from source_acceptance_test.utils import ConnectorRunner, full_refresh_only_catalog, make_hashible


@pytest.mark.default_timeout(20 * 60)
Expand All @@ -19,7 +19,7 @@ def test_sequential_reads(self, connector_config, configured_catalog, docker_run
output = docker_runner.call_read(connector_config, configured_catalog)
records_2 = [message.record.data for message in output if message.type == Type.RECORD]

output_diff = set(map(serialize, records_1)) - set(map(serialize, records_2))
output_diff = set(map(make_hashible, records_1)).symmetric_difference(set(map(make_hashible, records_2)))
if output_diff:
msg = "The two sequential reads should produce either equal set of records or one of them is a strict subset of the other"
detailed_logger.info(msg)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#
# Copyright (c) 2021 Airbyte, Inc., all rights reserved.
#
from .asserts import verify_records_schema
from .common import SecretDict, filter_output, full_refresh_only_catalog, incremental_only_catalog, load_config
from .compare import diff_dicts, serialize
from .compare import diff_dicts, make_hashible
from .connector_runner import ConnectorRunner
from .json_schema_helper import JsonSchemaHelper

Expand All @@ -13,6 +16,6 @@
"SecretDict",
"ConnectorRunner",
"diff_dicts",
"serialize",
"make_hashible",
"verify_records_schema",
]
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@


import functools
import json
from typing import List, Mapping, Optional

import icdiff
Expand Down Expand Up @@ -52,13 +51,20 @@ def diff_dicts(left, right, use_markup) -> Optional[List[str]]:


@functools.total_ordering
class HashMixin:
    """Mixin that makes mutable containers (dict/list) hashable by content.

    Hashing is order-insensitive for both mappings (keys are sorted) and
    lists (element hashes are sorted), so two nested structures that differ
    only in ordering hash — and therefore compare — equal.

    The hash is deliberately NOT cached on the instance: this mixin is
    applied to mutable containers, and a cached hash would go stale after a
    mutation (e.g. adding a key to a DictWithHashMixin would leave it
    "equal" to its pre-mutation twin).
    """

    @staticmethod
    def get_hash(obj):
        """Recursively compute an order-insensitive hash for ``obj``.

        Mappings hash by their sorted (key, value-hash) pairs; lists hash by
        their sorted element hashes (list order intentionally ignored);
        anything else falls back to the builtin ``hash``.
        """
        if isinstance(obj, Mapping):
            # Sort by key so {"a": 1, "b": 2} and {"b": 2, "a": 1} hash alike.
            return hash(tuple(sorted((key, HashMixin.get_hash(value)) for key, value in obj.items())))
        if isinstance(obj, List):
            # Sort element hashes so list ordering does not affect the result.
            return hash(tuple(sorted(HashMixin.get_hash(item) for item in obj)))
        return hash(obj)

    def __hash__(self):
        # Recompute every time: self is mutable, so the hash must always
        # reflect the current content (see class docstring).
        return HashMixin.get_hash(self)

    def __lt__(self, other):
        # Ordering by hash is arbitrary but total; total_ordering fills in
        # the remaining comparison operators.
        return hash(self) < hash(other)

    def __eq__(self, other):
        return hash(self) == hash(other)


class DictWithHashMixin(HashMixin, dict):
    # A dict whose hash/equality ignore key order and nested ordering.
    pass


class ListWithHashMixin(HashMixin, list):
    # A list whose hash/equality ignore element order.
    pass


def make_hashible(obj):
    """Wrap ``obj`` in a hashable variant so records can be placed in sets.

    Mappings become :class:`DictWithHashMixin`, lists become
    :class:`ListWithHashMixin`, and everything else is returned unchanged.
    (The name is a historical misspelling of "hashable", kept for backward
    compatibility with existing callers.)
    """
    if isinstance(obj, Mapping):
        return DictWithHashMixin(obj)
    if isinstance(obj, List):
        return ListWithHashMixin(obj)
    return obj
Original file line number Diff line number Diff line change
Expand Up @@ -3,86 +3,148 @@
#

import pytest
from source_acceptance_test.utils.compare import serialize
from source_acceptance_test.utils.compare import make_hashible


def not_sorted_data():
    """Return a sample record whose nested "permissions" list is unsorted.

    Plain function (not a pytest fixture) so it can be called directly when
    building parametrize cases.
    """
    return {
        "date_created": "0001-01-01T00:00:00",
        "date_updated": "0001-01-01T00:00:00",
        "editable": False,
        "id": "superuser",
        "name": "Super User",
        "organization_id": "orga_ya3w9oMjeLtWe7zFGZr63Dz8ruBbjybG0EIUdUXaESi",
        "permissions": [
            "bulk_edit",
            "delete_own_opportunities",
            "export",
            "manage_group_numbers",
            "manage_email_sequences",
            "delete_leads",
            "call_coach_listen",
            "call_coach_barge",
            "manage_others_tasks",
            "manage_others_activities",
            "delete_own_tasks",
            "manage_customizations",
            "manage_team_smart_views",
            "bulk_delete",
            "manage_team_email_templates",
            "bulk_email",
            "merge_leads",
            "calling",
            "bulk_sequence_subscriptions",
            "bulk_import",
            "delete_own_activities",
            "manage_others_opportunities",
        ],
    }


def sorted_data():
    """Return the same sample record as ``not_sorted_data`` but with the
    nested "permissions" list in sorted order.

    Plain function (not a pytest fixture) so it can be called directly when
    building parametrize cases.
    """
    return {
        "date_created": "0001-01-01T00:00:00",
        "date_updated": "0001-01-01T00:00:00",
        "editable": False,
        "id": "superuser",
        "name": "Super User",
        "organization_id": "orga_ya3w9oMjeLtWe7zFGZr63Dz8ruBbjybG0EIUdUXaESi",
        "permissions": [
            "bulk_delete",
            "bulk_edit",
            "bulk_email",
            "bulk_import",
            "bulk_sequence_subscriptions",
            "call_coach_barge",
            "call_coach_listen",
            "calling",
            "delete_leads",
            "delete_own_activities",
            "delete_own_opportunities",
            "delete_own_tasks",
            "export",
            "manage_customizations",
            "manage_email_sequences",
            "manage_group_numbers",
            "manage_others_activities",
            "manage_others_opportunities",
            "manage_others_tasks",
            "manage_team_email_templates",
            "manage_team_smart_views",
            "merge_leads",
        ],
    }


@pytest.mark.parametrize(
    "obj1,obj2,is_same",
    [
        # Same record, nested list in a different order -> equal.
        (sorted_data(), not_sorted_data(), True),
        (
            {
                "organization": {
                    "features": [
                        "issue-percent-filters",
                        "performance-tag-page",
                    ]
                }
            },
            {
                "organization": {
                    "features": [
                        "performance-tag-page",
                        "issue-percent-filters",
                    ]
                }
            },
            True,
        ),
        # One nested value differs ("performance-tag-pag") -> not equal.
        (
            {
                "organization": {
                    "features": [
                        "issue-percent-filters",
                        "performance-tag-page",
                    ]
                }
            },
            {
                "organization": {
                    "features": [
                        "performance-tag-pag",
                        "issue-percent-filters",
                    ]
                }
            },
            False,
        ),
        # Nested list is a strict subset -> not equal.
        (
            {
                "organization": {
                    "features": [
                        "issue-percent-filters",
                        "performance-tag-page",
                    ]
                }
            },
            {
                "organization": {
                    "features": [
                        "performance-tag-page",
                    ]
                }
            },
            False,
        ),
        ({"a": 1, "b": 2}, {"b": 2, "a": 1}, True),
        ({"a": 1, "b": 2, "c": {"d": [1, 2]}}, {"b": 2, "a": 1, "c": {"d": [2, 1]}}, True),
        ({"a": 1, "b": 2, "c": {"d": [1, 2]}}, {"b": 2, "a": 1, "c": {"d": [3, 4]}}, False),
    ],
)
def test_compare_two_records_nested_with_different_orders(obj1, obj2, is_same):
    """Records that differ only in nested ordering must compare equal.

    Mirrors the production comparison: wrap each record with make_hashible,
    put them in sets, and take the symmetric difference (empty iff equal).
    """
    output_diff = set(map(make_hashible, [obj1])).symmetric_difference(set(map(make_hashible, [obj2])))
    if is_same:
        assert not output_diff, f"{obj1} should be equal to {obj2}"
    else:
        assert output_diff, f"{obj1} shouldn't be equal to {obj2}"