🏗️ Python CDK: add schema transformer class (#6139)

* Python CDK: add schema transformer class
airbytehq · Sep 27, 2021 · 1c5ac5b · 1c5ac5b
1 parent d386ed7
commit 1c5ac5b
Show file tree

Hide file tree

Showing 8 changed files with 597 additions and 4 deletions.
diff --git a/airbyte-cdk/python/CHANGELOG.md b/airbyte-cdk/python/CHANGELOG.md
@@ -1,5 +1,8 @@
 # Changelog
 
+## 0.1.24
+Added TypeTransformer class for casting record values so that they adhere to the stream's jsonschema definition.
+
 ## 0.1.23
 Added the ability to use caching for efficient synchronization of nested streams.
 

diff --git a/airbyte-cdk/python/airbyte_cdk/sources/abstract_source.py b/airbyte-cdk/python/airbyte_cdk/sources/abstract_source.py
@@ -26,7 +26,8 @@
 import copy
 from abc import ABC, abstractmethod
 from datetime import datetime
-from typing import Any, Iterator, List, Mapping, MutableMapping, Optional, Tuple
+from functools import lru_cache
+from typing import Any, Dict, Iterator, List, Mapping, MutableMapping, Optional, Tuple
 
 from airbyte_cdk.logger import AirbyteLogger
 from airbyte_cdk.models import (
@@ -35,6 +36,7 @@
     AirbyteMessage,
     AirbyteRecordMessage,
     AirbyteStateMessage,
+    AirbyteStream,
     ConfiguredAirbyteCatalog,
     ConfiguredAirbyteStream,
     Status,
@@ -45,6 +47,7 @@
 from airbyte_cdk.sources.streams import Stream
 from airbyte_cdk.sources.streams.http.http import HttpStream
 from airbyte_cdk.sources.utils.schema_helpers import InternalConfig, split_config
+from airbyte_cdk.sources.utils.transform import TypeTransformer
 
 
 class AbstractSource(Source, ABC):
@@ -70,6 +73,9 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]:
         :return: A list of the streams in this source connector.
         """
 
+    # Stream name to Stream instance map; read() fills it and it is used to look up the transformer and json schema applied to output records
+    _stream_to_instance_map: Dict[str, Stream] = {}
+
     @property
     def name(self) -> str:
         """Source name"""
@@ -101,6 +107,7 @@ def read(
         # TODO assert all streams exist in the connector
         # get the streams once in case the connector needs to make any queries to generate them
         stream_instances = {s.name: s for s in self.streams(config)}
+        self._stream_to_instance_map = stream_instances
         for configured_stream in catalog.streams:
             stream_instance = stream_instances.get(configured_stream.stream.name)
             if not stream_instance:
@@ -227,7 +234,25 @@ def _checkpoint_state(self, stream_name, stream_state, connector_state, logger):
         connector_state[stream_name] = stream_state
         return AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=connector_state))
 
+    def _get_stream_transformer_and_schema(self, stream_name: str) -> Tuple[TypeTransformer, dict]:
+        """
+        Lookup stream's transform object and jsonschema based on stream name.
+        This function is called for every emitted record, so the result of
+        the costly get_json_schema operation is cached. The cache lives on the
+        source instance (functools.lru_cache on a method would key on self and
+        keep it alive) and is bound to the stream object, so it is refreshed
+        once read() installs new stream instances.
+        :param stream_name name of stream from catalog.
+        :return tuple with stream transformer object and discover json schema.
+        :raises KeyError if stream_name is not present in the stream map.
+        """
+        stream_instance = self._stream_to_instance_map.get(stream_name)
+        if stream_instance is None:
+            raise KeyError(f"Stream {stream_name} is not registered in the source, cannot look up its transformer and schema")
+        # Created lazily so that subclasses overriding __init__ without calling super() keep working.
+        schema_cache = self.__dict__.setdefault("_stream_schema_cache", {})
+        cached = schema_cache.get(stream_name)
+        if cached is None or cached[0] is not stream_instance:
+            # First lookup for this stream object: pay for get_json_schema() once.
+            cached = (stream_instance, stream_instance.get_json_schema())
+            schema_cache[stream_name] = cached
+        # The transformer is a plain attribute, so it is read live rather than cached.
+        return stream_instance.transformer, cached[1]
+
     def _as_airbyte_record(self, stream_name: str, data: Mapping[str, Any]):
+        # NOTE(review): the timestamp is truncated to whole seconds before it is scaled, so
+        # emitted_at is always a multiple of 1000 ms - confirm this is intended.
         now_millis = int(datetime.now().timestamp()) * 1000
+        transformer, schema = self._get_stream_transformer_and_schema(stream_name)
+        # Transform record fields according to the stream's transformer config,
+        # typically to cast values to the types declared in the json schema.
+        # The default Stream.transformer is configured with NoTransform, so no
+        # action is taken unless a stream overrides it. transform() returns
+        # nothing, the record is passed on as the same data object. See
+        # docs/connector-development/cdk-python/schemas.md for details.
+        transformer.transform(data, schema)
         message = AirbyteRecordMessage(stream=stream_name, data=data, emitted_at=now_millis)
         return AirbyteMessage(type=MessageType.RECORD, record=message)
diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/core.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/core.py
@@ -31,6 +31,7 @@
 from airbyte_cdk.logger import AirbyteLogger
 from airbyte_cdk.models import AirbyteStream, SyncMode
 from airbyte_cdk.sources.utils.schema_helpers import ResourceSchemaLoader
+from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer
 
 
 def package_name_from_class(cls: object) -> str:
@@ -47,6 +48,9 @@ class Stream(ABC):
     # Use self.logger in subclasses to log any messages
     logger = AirbyteLogger()  # TODO use native "logging" loggers with custom handlers
 
+    # TypeTransformer object to perform output data transformation
+    transformer: TypeTransformer = TypeTransformer(TransformConfig.NoTransform)
+
     @property
     def name(self) -> str:
         """

diff --git a/airbyte-cdk/python/airbyte_cdk/sources/utils/transform.py b/airbyte-cdk/python/airbyte_cdk/sources/utils/transform.py
@@ -0,0 +1,196 @@
+#
+# MIT License
+#
+# Copyright (c) 2020 Airbyte
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from distutils.util import strtobool
+from enum import Flag, auto
+from typing import Any, Callable, Dict
+
+from airbyte_cdk.logger import AirbyteLogger
+from jsonschema import Draft7Validator, validators
+
+logger = AirbyteLogger()
+
+
+class TransformConfig(Flag):
+    """
+    Flags selecting which transformations TypeTransformer applies to a record.
+    Flags are combined with the bitwise or operator, e.g.
+        ```
+        TransformConfig.DefaultSchemaNormalization | TransformConfig.CustomSchemaNormalization
+        ```
+    """
+
+    # Default behaviour: the record is left untouched. This flag has to be
+    # used alone, TypeTransformer rejects any combination that includes it.
+    NoTransform = auto()
+    # Cast values to the type declared in the jsonschema using
+    # TypeTransformer.default_convert.
+    DefaultSchemaNormalization = auto()
+    # Allow a custom transformation callback to be registered. May be
+    # combined with DefaultSchemaNormalization, in which case the default
+    # cast runs first and the callback receives its result.
+    CustomSchemaNormalization = auto()
+
+
+class TypeTransformer:
+    """
+    Class for transforming object before output.
+    Record fields are cast in place while a jsonschema validator walks the
+    record; schema violations that remain after casting are only logged.
+    """
+
+    # Callback set by registerCustomTransform; None until one is registered.
+    _custom_normalizer: Callable[[Any, Dict[str, Any]], Any] = None
+
+    def __init__(self, config: TransformConfig):
+        """
+        Initialize TypeTransformer instance.
+        :param config Transform config that would be applied to object
+        :raises Exception if NoTransform is combined with any other flag.
+        """
+        if TransformConfig.NoTransform in config and config != TransformConfig.NoTransform:
+            raise Exception("NoTransform option cannot be combined with other flags.")
+        self._config = config
+        all_validators = {
+            key: self.__get_normalizer(key, orig_validator)
+            for key, orig_validator in Draft7Validator.VALIDATORS.items()
+            # Do not validate field we do not transform for maximum performance.
+            if key in ["type", "array", "$ref", "properties", "items"]
+        }
+        self._normalizer = validators.create(meta_schema=Draft7Validator.META_SCHEMA, validators=all_validators)
+
+    def registerCustomTransform(self, normalization_callback: Callable[[Any, Dict[str, Any]], Any]) -> Callable:
+        """
+        Register custom normalization callback.
+        :param normalization_callback function to be used for value
+        normalization. Takes original value and part type schema. Should return
+        normalized value. See docs/connector-development/cdk-python/schemas.md
+        for details.
+        :return Same callback, this is useful for using registerCustomTransform function as decorator.
+        :raises Exception if CustomSchemaNormalization flag is not set in config.
+        """
+        if TransformConfig.CustomSchemaNormalization not in self._config:
+            raise Exception("Please set TransformConfig.CustomSchemaNormalization config before registering custom normalizer")
+        self._custom_normalizer = normalization_callback
+        return normalization_callback
+
+    def __normalize(self, original_item: Any, subschema: Dict[str, Any]) -> Any:
+        """
+        Applies different transform function to object's field according to config.
+        Default conversion (if enabled) runs first, its result is then passed
+        to the custom callback (if registered).
+        :param original_item original value of field.
+        :param subschema part of the jsonschema containing field type/format data.
+        :return Final field value.
+        """
+        if TransformConfig.DefaultSchemaNormalization in self._config:
+            original_item = self.default_convert(original_item, subschema)
+
+        if self._custom_normalizer:
+            original_item = self._custom_normalizer(original_item, subschema)
+        return original_item
+
+    @staticmethod
+    def default_convert(original_item: Any, subschema: Dict[str, Any]) -> Any:
+        """
+        Default transform function that is used when TransformConfig.DefaultSchemaNormalization flag set.
+        Values that cannot be cast to the target type are returned unchanged.
+        :param original_item original value of field.
+        :param subschema part of the jsonschema containing field type/format data.
+        :return transformed field value.
+        """
+        if original_item is None:
+            # There is nothing to cast. Returning None as is (whether or not
+            # the type is nullable) avoids turning a missing value into the
+            # string "None" or into False; the type validator that runs after
+            # normalization reports the mismatch if null is not allowed.
+            return None
+        # Subschema may have no "type" at all (e.g. empty schema or anyOf).
+        target_type = subschema.get("type", [])
+        if isinstance(target_type, list):
+            # jsonschema type could either be a single string or array of type
+            # strings. In case there is ambiguity and more than one
+            # type (except null) do not do any conversion and return original
+            # value. If type array has one type and null i.e. {"type":
+            # ["integer", "null"]}, convert value to specified type.
+            target_type = [t for t in target_type if t != "null"]
+            if len(target_type) != 1:
+                return original_item
+            target_type = target_type[0]
+        try:
+            if target_type == "string":
+                return str(original_item)
+            elif target_type == "number":
+                return float(original_item)
+            elif target_type == "integer":
+                return int(original_item)
+            elif target_type == "boolean":
+                if isinstance(original_item, str):
+                    return strtobool(original_item) == 1
+                return bool(original_item)
+        except (ValueError, TypeError, OverflowError):
+            # ValueError: unparsable string; TypeError: value of a kind that
+            # cannot be cast (list, dict); OverflowError: int() of infinity.
+            return original_item
+        return original_item
+
+    def __get_normalizer(self, schema_key: str, original_validator: Callable):
+        """
+        Traverse through object fields using native jsonschema validator and apply normalization function.
+        :param schema_key related json schema key that currently being validated/normalized.
+        :param original_validator native jsonschema validator callback.
+        :return validator callable that normalizes values and then delegates to original_validator.
+        """
+
+        def normalizator(validator_instance: Callable, val: Any, instance: Any, schema: Dict[str, Any]):
+            """
+            Jsonschema validator callable it uses for validating instance. We
+            override default Draft7Validator to perform value transformation
+            before validation take place. We do not take any action except
+            logging warn if object does not conform to json schema, just using
+            jsonschema algorithm to traverse through object fields.
+            Look
+            https://python-jsonschema.readthedocs.io/en/stable/creating/?highlight=validators.create#jsonschema.validators.create
+            validators parameter for detailed description.
+            """
+
+            def resolve(subschema):
+                # Follow "$ref" if present. Returns None for subschemas that
+                # are not objects (boolean schemas, list form of "items"),
+                # as there is no single type to cast a value to.
+                if isinstance(subschema, dict) and "$ref" in subschema:
+                    _, subschema = validator_instance.resolver.resolve(subschema["$ref"])
+                return subschema if isinstance(subschema, dict) else None
+
+            if schema_key == "type" and instance is not None:
+                if "object" in val and isinstance(instance, dict):
+                    for k, subschema in schema.get("properties", {}).items():
+                        if k not in instance:
+                            continue
+                        subschema = resolve(subschema)
+                        if subschema is not None:
+                            instance[k] = self.__normalize(instance[k], subschema)
+                elif "array" in val and isinstance(instance, list):
+                    subschema = resolve(schema.get("items", {}))
+                    if subschema is not None:
+                        for index, item in enumerate(instance):
+                            instance[index] = self.__normalize(item, subschema)
+            # Running native jsonschema traverse algorithm after field normalization is done.
+            yield from original_validator(validator_instance, val, instance, schema)
+
+        return normalizator
+
+    def transform(self, record: Dict[str, Any], schema: Dict[str, Any]):
+        """
+        Normalize and validate according to config.
+        :param record record instance for normalization/transformation. All modifications are done by modifying the existing object.
+        :param schema object's jsonschema for normalization.
+        """
+        if TransformConfig.NoTransform in self._config:
+            return
+        normalizer = self._normalizer(schema)
+        # Just calling normalizer.validate() would throw an exception on the
+        # first validation error and stop processing the rest of the schema,
+        # so all errors are iterated and logged instead.
+        for e in normalizer.iter_errors(record):
+            logger.warn(e.message)
diff --git a/airbyte-cdk/python/setup.py b/airbyte-cdk/python/setup.py
@@ -35,7 +35,7 @@
 
 setup(
     name="airbyte-cdk",
-    version="0.1.23",
+    version="0.1.24",
     description="A framework for writing Airbyte Connectors.",
     long_description=README,
     long_description_content_type="text/markdown",

diff --git a/airbyte-cdk/python/unit_tests/sources/test_source.py b/airbyte-cdk/python/unit_tests/sources/test_source.py
@@ -34,6 +34,7 @@
 from airbyte_cdk.sources import AbstractSource, Source
 from airbyte_cdk.sources.streams.core import Stream
 from airbyte_cdk.sources.streams.http.http import HttpStream
+from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer
 
 
 class MockSource(Source):
@@ -81,6 +82,7 @@ def abstract_source(mocker):
     class MockHttpStream(MagicMock, HttpStream):
         url_base = "http://example.com"
         path = "/dummy/path"
+        get_json_schema = MagicMock()
 
         def supports_incremental(self):
             return True
@@ -92,6 +94,7 @@ def __init__(self, *args, **kvargs):
 
     class MockStream(MagicMock, Stream):
         page_size = None
+        get_json_schema = MagicMock()
 
         def __init__(self, *args, **kvargs):
             MagicMock.__init__(self)
@@ -145,8 +148,7 @@ def test_read_catalog(source):
 def test_internal_config(abstract_source, catalog):
     streams = abstract_source.streams(None)
     assert len(streams) == 2
-    http_stream = streams[0]
-    non_http_stream = streams[1]
+    http_stream, non_http_stream = streams
     assert isinstance(http_stream, HttpStream)
     assert not isinstance(non_http_stream, HttpStream)
     http_stream.read_records.return_value = [{}] * 3
@@ -216,3 +218,44 @@ def test_internal_config_limit(abstract_source, catalog):
     logger_info_args = [call[0][0] for call in logger_mock.info.call_args_list]
     read_log_record = [_l for _l in logger_info_args if _l.startswith("Read")]
     assert read_log_record[0].startswith(f"Read {STREAM_LIMIT} ")
+
+
+# Stream schema returned by the mocked get_json_schema in the transform tests below: an object with one string-typed "value" field
+SCHEMA = {"type": "object", "properties": {"value": {"type": "string"}}}
+
+
+def test_source_config_no_transform(abstract_source, catalog):
+    """Records are emitted unchanged when no transformer is configured and each stream schema is requested once."""
+    http_stream, non_http_stream = abstract_source.streams(None)
+    for stream in (http_stream, non_http_stream):
+        stream.get_json_schema.return_value = SCHEMA
+    # Integer values deliberately do not match the "string" type declared in SCHEMA.
+    raw_records = [{"value": 23}] * 5
+    http_stream.read_records.return_value = raw_records
+    non_http_stream.read_records.return_value = raw_records
+    records = list(abstract_source.read(logger=MagicMock(), config={}, catalog=catalog, state={}))
+    assert len(records) == 2 * 5
+    assert [r.record.data for r in records] == [{"value": 23}] * 2 * 5
+    # The schema lookup is cached, so it must not be repeated per record.
+    assert http_stream.get_json_schema.call_count == 1
+    assert non_http_stream.get_json_schema.call_count == 1
+
+
+def test_source_config_transform(abstract_source, catalog):
+    """Both streams use DefaultSchemaNormalization, so integer values are cast to the string type from SCHEMA."""
+    http_stream, non_http_stream = abstract_source.streams(None)
+    for stream in (http_stream, non_http_stream):
+        stream.transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)
+        stream.get_json_schema.return_value = SCHEMA
+        # Each stream gets its own record object since transformation happens in place.
+        stream.read_records.return_value = [{"value": 23}]
+    records = list(abstract_source.read(logger=MagicMock(), config={}, catalog=catalog, state={}))
+    assert len(records) == 2
+    assert [r.record.data for r in records] == [{"value": "23"}] * 2
+
+
+def test_source_config_transform_and_no_transform(abstract_source, catalog):
+    """Only the stream with DefaultSchemaNormalization gets its value cast; the other stream's record stays as is."""
+    http_stream, non_http_stream = abstract_source.streams(None)
+    # Transformer is overridden for the http stream only.
+    http_stream.transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)
+    for stream in (http_stream, non_http_stream):
+        stream.get_json_schema.return_value = SCHEMA
+        stream.read_records.return_value = [{"value": 23}]
+    records = list(abstract_source.read(logger=MagicMock(), config={}, catalog=catalog, state={}))
+    assert len(records) == 2
+    assert [r.record.data for r in records] == [{"value": "23"}, {"value": 23}]