From eaba48398265fc31b77cb7a8c86f8182ce3a7200 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Fri, 4 Aug 2023 13:11:21 -0400 Subject: [PATCH 01/27] [ISSUE #28893] infer csv schema --- .../file_based/file_types/csv_parser.py | 387 +++++++++++------- .../file_based/file_types/test_csv_parser.py | 249 ++++++++++- 2 files changed, 476 insertions(+), 160 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 479402877272..3b6e6f7dc915 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -5,9 +5,10 @@ import csv import json import logging +from collections import defaultdict from functools import partial from io import IOBase -from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Set +from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Set from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, QuotingBehavior from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig @@ -27,17 +28,20 @@ } -class CsvParser(FileTypeParser): - async def infer_schema( +class _CsvReader: + def read_data( self, config: FileBasedStreamConfig, file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Dict[str, Any]: - config_format = config.format.get(config.file_type) if config.format else CsvFormat() - if not isinstance(config_format, CsvFormat): - raise ValueError(f"Invalid format config: {config_format}") + file_read_mode: FileReadMode, + ) -> Generator[Dict[str, Any], None, None]: + config_format = _extract_config_format(config) + + # Formats are configured individually per-stream so a unique dialect should be registered for each stream. + # We don't unregister the dialect because we are lazily parsing each csv file to generate records + # This will potentially be a problem if we ever process multiple streams concurrently dialect_name = config.name + DIALECT_NAME csv.register_dialect( dialect_name, @@ -47,13 +51,91 @@ async def infer_schema( doublequote=config_format.double_quote, quoting=config_to_quoting.get(config_format.quoting_behavior, csv.QUOTE_MINIMAL), ) - with stream_reader.open_file(file, self.file_read_mode, logger) as fp: - # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual - # sources will likely require one. Rather than modify the interface now we can wait until the real use case + with stream_reader.open_file(file, file_read_mode, logger) as fp: headers = self._get_headers(fp, config_format, dialect_name) - schema = {field.strip(): {"type": "string"} for field in headers} - csv.unregister_dialect(dialect_name) - return schema + + fp.seek(0) + # we assume that if we autogenerate columns, it is because we don't have headers + # if a user wants to autogenerate_column_names with a CSV having headers, he can skip rows + rows_to_skip = config_format.skip_rows_before_header + (0 if config_format.autogenerate_column_names else 1) + config_format.skip_rows_after_header + self._skip_rows(fp, rows_to_skip) + + reader = csv.DictReader(fp, dialect=dialect_name, fieldnames=headers) # type: ignore + try: + for row in reader: + # The row was not properly parsed if any of the values are None. 
This will most likely occur if there are more columns than + # headers + if None in row: + raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD) + yield row + finally: + # due to RecordParseError or GeneratorExit + csv.unregister_dialect(dialect_name) + + def _get_headers(self, fp: IOBase, config_format: CsvFormat, dialect_name: str) -> List[str]: + # Note that this method assumes the dialect has already been registered if we're parsing the headers + self._skip_rows(fp, config_format.skip_rows_before_header) + if config_format.autogenerate_column_names: + return self._auto_generate_headers(fp, config_format, dialect_name) + else: + # Then read the header + reader = csv.reader(fp, dialect=dialect_name) # type: ignore + return list(next(reader)) + + def _auto_generate_headers(self, fp: IOBase, config_format: CsvFormat, dialect_name: str) -> List[str]: + """ + Generates field names as [f0, f1, ...] in the same way as pyarrow's csv reader with autogenerate_column_names=True. + See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html + """ + reader = csv.reader(fp, dialect=dialect_name) # type: ignore + number_of_columns = len(next(reader)) # type: ignore + # Reset the file pointer to the beginning of the file so that the first row is not skipped + fp.seek(0) + return [f"f{i}" for i in range(number_of_columns)] + + @staticmethod + def _skip_rows(fp: IOBase, rows_to_skip: int) -> None: + """ + Skip rows before the header. This has to be done on the file object itself, not the reader + """ + for _ in range(rows_to_skip): + fp.readline() + + +class CsvParser(FileTypeParser): + _MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE = 1_000_000 + + def __init__(self, csv_reader: Optional[_CsvReader] = None): + self._csv_reader = csv_reader if csv_reader else _CsvReader() + + async def infer_schema( + self, + config: FileBasedStreamConfig, + file: RemoteFile, + stream_reader: AbstractFileBasedStreamReader, + logger: logging.Logger, + ) -> Dict[str, Any]: + # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual + # sources will likely require one. Rather than modify the interface now we can wait until the real use case + config_format = _extract_config_format(config) + type_inferrer_by_field: Dict[str, _TypeInferrer] = defaultdict( + lambda: _TypeInferrer(config_format.true_values, config_format.false_values) + ) + data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode) + read_bytes = 0 + for row in data_generator: + for header, value in row.items(): + type_inferrer_by_field[header].add_value(value) + # This is not accurate as a representation of how many bytes were read because csv does some processing on the actual value + # before returning. 
Given we would like to be more accurate, we could wrap the IO file using a decorator + read_bytes += len(value) + read_bytes += len(row) - 1 # for separators + if read_bytes >= self._MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE: + break + + schema = {header.strip(): {"type": type_inferred.infer()} for header, type_inferred in type_inferrer_by_field.items()} + data_generator.close() + return schema def parse_records( self, @@ -63,54 +145,17 @@ def parse_records( logger: logging.Logger, ) -> Iterable[Dict[str, Any]]: schema: Mapping[str, Any] = config.input_schema # type: ignore - config_format = config.format.get(config.file_type) if config.format else CsvFormat() - if not isinstance(config_format, CsvFormat): - raise ValueError(f"Invalid format config: {config_format}") - # Formats are configured individually per-stream so a unique dialect should be registered for each stream. - # We don't unregister the dialect because we are lazily parsing each csv file to generate records - # This will potentially be a problem if we ever process multiple streams concurrently - dialect_name = config.name + DIALECT_NAME - csv.register_dialect( - dialect_name, - delimiter=config_format.delimiter, - quotechar=config_format.quote_char, - escapechar=config_format.escape_char, - doublequote=config_format.double_quote, - quoting=config_to_quoting.get(config_format.quoting_behavior, csv.QUOTE_MINIMAL), - ) - with stream_reader.open_file(file, self.file_read_mode, logger) as fp: - # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual - # sources will likely require one. Rather than modify the interface now we can wait until the real use case - self._skip_rows_before_header(fp, config_format.skip_rows_before_header) - field_names = self._auto_generate_headers(fp, config_format) if config_format.autogenerate_column_names else None - reader = csv.DictReader(fp, dialect=dialect_name, fieldnames=field_names) # type: ignore - yield from self._read_and_cast_types(reader, schema, config_format, logger) + config_format = _extract_config_format(config) + cast_fn = CsvParser._get_cast_function(schema, config_format, logger) + data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode) + for row in data_generator: + yield CsvParser._to_nullable(cast_fn(row), config_format.null_values) + data_generator.close() @property def file_read_mode(self) -> FileReadMode: return FileReadMode.READ - @staticmethod - def _read_and_cast_types( - reader: csv.DictReader, schema: Optional[Mapping[str, Any]], config_format: CsvFormat, logger: logging.Logger # type: ignore - ) -> Iterable[Dict[str, Any]]: - """ - If the user provided a schema, attempt to cast the record values to the associated type. - - If a column is not in the schema or cannot be cast to an appropriate python type, - cast it to a string. Downstream, the user's validation policy will determine whether the - record should be emitted. 
- """ - cast_fn = CsvParser._get_cast_function(schema, config_format, logger) - for i, row in enumerate(reader): - if i < config_format.skip_rows_after_header: - continue - # The row was not properly parsed if any of the values are None - if any(val is None for val in row.values()): - raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD) - else: - yield CsvParser._to_nullable(cast_fn(row), config_format.null_values) - @staticmethod def _get_cast_function( schema: Optional[Mapping[str, Any]], config_format: CsvFormat, logger: logging.Logger @@ -118,7 +163,7 @@ def _get_cast_function( # Only cast values if the schema is provided if schema: property_types = {col: prop["type"] for col, prop in schema["properties"].items()} - return partial(_cast_types, property_types=property_types, config_format=config_format, logger=logger) + return partial(CsvParser._cast_types, property_types=property_types, config_format=config_format, logger=logger) else: # If no schema is provided, yield the rows as they are return _no_cast @@ -129,96 +174,137 @@ def _to_nullable(row: Mapping[str, str], null_values: Set[str]) -> Dict[str, Opt return nullable @staticmethod - def _skip_rows_before_header(fp: IOBase, rows_to_skip: int) -> None: - """ - Skip rows before the header. This has to be done on the file object itself, not the reader + def _cast_types(row: Dict[str, str], property_types: Dict[str, Any], config_format: CsvFormat, logger: logging.Logger) -> Dict[str, Any]: """ - for _ in range(rows_to_skip): - fp.readline() + Casts the values in the input 'row' dictionary according to the types defined in the JSON schema. - def _get_headers(self, fp: IOBase, config_format: CsvFormat, dialect_name: str) -> List[str]: - # Note that this method assumes the dialect has already been registered if we're parsing the headers - if config_format.autogenerate_column_names: - return self._auto_generate_headers(fp, config_format) - else: - # If we're not autogenerating column names, we need to skip the rows before the header - self._skip_rows_before_header(fp, config_format.skip_rows_before_header) - # Then read the header - reader = csv.DictReader(fp, dialect=dialect_name) # type: ignore - return next(reader) # type: ignore + Array and object types are only handled if they can be deserialized as JSON. - def _auto_generate_headers(self, fp: IOBase, config_format: CsvFormat) -> List[str]: - """ - Generates field names as [f0, f1, ...] in the same way as pyarrow's csv reader with autogenerate_column_names=True. 
- See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html + If any errors are encountered, the value will be emitted as a string._to_nullable """ - next_line = next(fp).strip() - number_of_columns = len(next_line.split(config_format.delimiter)) # type: ignore - # Reset the file pointer to the beginning of the file so that the first row is not skipped - fp.seek(0) - return [f"f{i}" for i in range(number_of_columns)] + warnings = [] + result = {} + + for key, value in row.items(): + prop_type = property_types.get(key) + cast_value: Any = value + + if prop_type in TYPE_PYTHON_MAPPING and prop_type is not None: + _, python_type = TYPE_PYTHON_MAPPING[prop_type] + + if python_type is None: + if value == "": + cast_value = None + else: + warnings.append(_format_warning(key, value, prop_type)) + + elif python_type == bool: + try: + cast_value = _value_to_bool(value, config_format.true_values, config_format.false_values) + except ValueError: + warnings.append(_format_warning(key, value, prop_type)) + + elif python_type == dict: + try: + # we don't re-use _value_to_object here because we type the column as object as long as there is only one object + cast_value = json.loads(value) + except json.JSONDecodeError: + warnings.append(_format_warning(key, value, prop_type)) + + elif python_type == list: + try: + cast_value = _value_to_list(value) + except (ValueError, json.JSONDecodeError): + warnings.append(_format_warning(key, value, prop_type)) + + elif python_type: + try: + cast_value = _value_to_python_type(value, python_type) + except ValueError: + warnings.append(_format_warning(key, value, prop_type)) + else: + warnings.append(_format_warning(key, value, prop_type)) + + result[key] = cast_value + + if warnings: + logger.warning( + f"{FileBasedSourceError.ERROR_CASTING_VALUE.value}: {','.join([w for w in warnings])}", + ) + return result + + +class _TypeInferrer: + _BOOLEAN_TYPE = "boolean" + _NUMBER_TYPE = "number" + _ARRAY_TYPE = "array" + _OBJECT_TYPE = "object" + _STRING_TYPE = "string" + + def __init__(self, boolean_trues: Set[str], boolean_falses: Set[str]) -> None: + self._boolean_trues = boolean_trues + self._boolean_falses = boolean_falses + self._values: Set[str] = set() + + def add_value(self, value: Any) -> None: + self._values.add(value) + + def infer(self) -> str: + types = {self._infer_type(value) for value in self._values} + + if types == {self._BOOLEAN_TYPE}: + return self._BOOLEAN_TYPE + elif types == {self._NUMBER_TYPE}: + return self._NUMBER_TYPE + elif types == {self._ARRAY_TYPE}: + return self._ARRAY_TYPE + elif self._ARRAY_TYPE in types or self._OBJECT_TYPE in types: + return self._OBJECT_TYPE + return self._STRING_TYPE + + def _infer_type(self, value: str) -> str: + if self._is_boolean(value): + return self._BOOLEAN_TYPE + elif self._is_number(value): + return self._NUMBER_TYPE + elif self._is_array(value): + return self._ARRAY_TYPE + elif self._is_object(value): + return self._OBJECT_TYPE + else: + return self._STRING_TYPE -def _cast_types(row: Dict[str, str], property_types: Dict[str, Any], config_format: CsvFormat, logger: logging.Logger) -> Dict[str, Any]: - """ - Casts the values in the input 'row' dictionary according to the types defined in the JSON schema. - - Array and object types are only handled if they can be deserialized as JSON. - - If any errors are encountered, the value will be emitted as a string. 
- """ - warnings = [] - result = {} - - for key, value in row.items(): - prop_type = property_types.get(key) - cast_value: Any = value - - if prop_type in TYPE_PYTHON_MAPPING and prop_type is not None: - _, python_type = TYPE_PYTHON_MAPPING[prop_type] - - if python_type is None: - if value == "": - cast_value = None - else: - warnings.append(_format_warning(key, value, prop_type)) - - elif python_type == bool: - try: - cast_value = _value_to_bool(value, config_format.true_values, config_format.false_values) - except ValueError: - warnings.append(_format_warning(key, value, prop_type)) - - elif python_type == dict: - try: - cast_value = json.loads(value) - except json.JSONDecodeError: - warnings.append(_format_warning(key, value, prop_type)) - - elif python_type == list: - try: - parsed_value = json.loads(value) - if isinstance(parsed_value, list): - cast_value = parsed_value - except json.JSONDecodeError: - warnings.append(_format_warning(key, value, prop_type)) - - elif python_type: - try: - cast_value = python_type(value) - except ValueError: - warnings.append(_format_warning(key, value, prop_type)) + def _is_boolean(self, value: str) -> bool: + try: + _value_to_bool(value, self._boolean_trues, self._boolean_falses) + return True + except ValueError: + return False - else: - warnings.append(_format_warning(key, value, prop_type)) + @staticmethod + def _is_number(value: str) -> bool: + try: + _value_to_python_type(value, float) + return True + except ValueError: + return False - result[key] = cast_value + @staticmethod + def _is_array(value: str) -> bool: + try: + _value_to_list(value) + return True + except (ValueError, json.JSONDecodeError): + return False - if warnings: - logger.warning( - f"{FileBasedSourceError.ERROR_CASTING_VALUE.value}: {','.join([w for w in warnings])}", - ) - return result + @staticmethod + def _is_object(value: str) -> bool: + try: + _value_to_object(value) + return True + except (ValueError, json.JSONDecodeError): + return False def _value_to_bool(value: str, true_values: Set[str], false_values: Set[str]) -> bool: @@ -229,9 +315,34 @@ def _value_to_bool(value: str, true_values: Set[str], false_values: Set[str]) -> raise ValueError(f"Value {value} is not a valid boolean value") +def _value_to_object(value: str) -> Dict[Any, Any]: + parsed_value = json.loads(value) + if isinstance(parsed_value, dict): + return parsed_value + raise ValueError(f"Value {parsed_value} is not a valid dict value") + + +def _value_to_list(value: str) -> List[Any]: + parsed_value = json.loads(value) + if isinstance(parsed_value, list): + return parsed_value + raise ValueError(f"Value {parsed_value} is not a valid list value") + + +def _value_to_python_type(value: str, python_type: type) -> Any: + return python_type(value) + + def _format_warning(key: str, value: str, expected_type: Optional[Any]) -> str: return f"{key}: value={value},expected_type={expected_type}" def _no_cast(row: Mapping[str, str]) -> Mapping[str, str]: return row + + +def _extract_config_format(config: FileBasedStreamConfig) -> CsvFormat: + config_format = config.format.get(config.file_type) if config.format else CsvFormat() + if not isinstance(config_format, CsvFormat): + raise ValueError(f"Invalid format config: {config_format}") + return config_format diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index 1d2079396383..48fb991f2195 100644 --- 
a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -2,13 +2,21 @@ # Copyright (c) 2023 Airbyte, Inc., all rights reserved. # +import asyncio +import csv +import io import logging -from unittest.mock import MagicMock, Mock +import unittest +from typing import Any, Dict, Generator, List, Set +from unittest.mock import Mock +from unittest import TestCase import pytest from airbyte_cdk.sources.file_based.config.csv_format import DEFAULT_FALSE_VALUES, DEFAULT_TRUE_VALUES, CsvFormat from airbyte_cdk.sources.file_based.exceptions import RecordParseError -from airbyte_cdk.sources.file_based.file_types.csv_parser import CsvParser, _cast_types +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode +from airbyte_cdk.sources.file_based.file_types.csv_parser import _CsvReader, CsvParser +from airbyte_cdk.sources.file_based.remote_file import RemoteFile PROPERTY_TYPES = { "col1": "null", @@ -70,29 +78,226 @@ pytest.param({"col10": "x"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col10": "x"}, id="item-not-in-props-doesn't-error"), ] ) -def test_cast_to_python_type(row, true_values, false_values, expected_output): +def test_cast_to_python_type(row: Dict[str, str], true_values: Set[str], false_values: Set[str], expected_output: Dict[str, Any]) -> None: csv_format = CsvFormat(true_values=true_values, false_values=false_values) - assert _cast_types(row, PROPERTY_TYPES, csv_format, logger) == expected_output + assert CsvParser._cast_types(row, PROPERTY_TYPES, csv_format, logger) == expected_output -@pytest.mark.parametrize( - "reader_values, expected_rows", [ - pytest.param([{"col1": "1", "col2": None}], None, id="raise_exception_if_any_value_is_none"), - pytest.param([{"col1": "1", "col2": "2"}], [{"col1": "1", "col2": "2"}], id="read_no_cast"), - ] -) -def test_read_and_cast_types(reader_values, expected_rows): - reader = MagicMock() - reader.__iter__.return_value = reader_values - schema = {} - config_format = CsvFormat() - logger = Mock() +_DEFAULT_TRUE_VALUES = {"yes", "yeah", "right"} +_DEFAULT_FALSE_VALUES = {"no", "nop", "wrong"} + + +class SchemaInferrenceTestCase(TestCase): + def setUp(self) -> None: + self._config_format = CsvFormat() + self._config_format.true_values = _DEFAULT_TRUE_VALUES + self._config_format.false_values = _DEFAULT_FALSE_VALUES + self._config = Mock() + self._config.format.get.return_value = self._config_format + + self._file = Mock(spec=RemoteFile) + self._stream_reader = Mock(spec=AbstractFileBasedStreamReader) + self._logger = Mock(spec=logging.Logger) + self._csv_reader = Mock(spec=_CsvReader) + self._parser = CsvParser(self._csv_reader) + + def test_given_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None: + self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "boolean") + + def test_given_numbers_only_when_infer_schema_then_type_is_number(self) -> None: + self._test_infer_schema(["2", "90329", "2.312"], "number") + + def test_given_arrays_only_when_infer_schema_then_type_is_array(self) -> None: + self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], "array") + + def test_given_objects_only_when_infer_schema_then_type_is_object(self) -> None: + self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "object") + + def 
test_given_arrays_and_objects_only_when_infer_schema_then_type_is_object(self) -> None: + self._test_infer_schema(['["first_item", "second_item"]', '{"an_object_key": "an_object_value"}'], "object") + + def test_given_strings_and_objects_only_when_infer_schema_then_type_is_object(self) -> None: + self._test_infer_schema(['["first_item", "second_item"]', "this is a string"], "object") + + def test_given_strings_only_when_infer_schema_then_type_is_string(self) -> None: + self._test_infer_schema(["a string", "another string"], "string") + + def _test_infer_schema(self, rows: List[str], expected_type: str) -> None: + self._csv_reader.read_data.return_value = ({"header": row} for row in rows) + inferred_schema = self._infer_schema() + assert inferred_schema == {"header": {"type": expected_type}} + + def test_given_big_file_when_infer_schema_then_stop_early(self) -> None: + self._csv_reader.read_data.return_value = ({"header": row} for row in ["2" * 1_000_000] + ["this is a string"]) + inferred_schema = self._infer_schema() + # since the type is number, we know the string at the end was not considered + assert inferred_schema == {"header": {"type": "number"}} + + def _infer_schema(self): + loop = asyncio.new_event_loop() + task = loop.create_task(self._parser.infer_schema(self._config, self._file, self._stream_reader, self._logger)) + loop.run_until_complete(task) + return task.result() + + +class CsvFileBuilder: + def __init__(self) -> None: + self._prefixed_rows: List[str] = [] + self._data: List[str] = [] + + def with_prefixed_rows(self, rows: List[str]) -> 'CsvFileBuilder': + self._prefixed_rows = rows + return self + + def with_data(self, data: List[str]) -> 'CsvFileBuilder': + self._data = data + return self + + def build(self) -> io.StringIO: + return io.StringIO("\n".join(self._prefixed_rows + self._data)) + + +class CsvReaderTest(unittest.TestCase): + _CONFIG_NAME = "config_name" + + def setUp(self) -> None: + self._config_format = CsvFormat() + self._config = Mock() + self._config.name = self._CONFIG_NAME + self._config.format.get.return_value = self._config_format + + self._file = Mock(spec=RemoteFile) + self._stream_reader = Mock(spec=AbstractFileBasedStreamReader) + self._logger = Mock(spec=logging.Logger) + self._csv_reader = _CsvReader() + + def test_given_skip_rows_when_read_data_then_do_not_considered_prefixed_rows(self) -> None: + self._config_format.skip_rows_before_header = 2 + self._stream_reader.open_file.return_value = CsvFileBuilder().with_prefixed_rows(["first line", "second line"]).with_data([ + "header", + "a value", + "another value", + ]).build() + + data_generator = self._read_data() - parser = CsvParser() + assert list(data_generator) == [{"header": "a value"}, {"header": "another value"}] + + def test_given_autogenerated_headers_when_read_data_then_generate_headers_with_format_fX(self) -> None: + self._config_format.autogenerate_column_names = True + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + '0,1,2,3,4,5,6' + ]).build() + + data_generator = self._read_data() + + assert list(data_generator) == [{"f0": "0", "f1": "1", "f2": "2", "f3": "3", "f4": "4", "f5": "5", "f6": "6"}] + + def test_given_skip_rows_after_header_when_read_data_then_do_not_parse_skipped_rows(self) -> None: + self._config_format.skip_rows_after_header = 1 + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + "header1,header2", + "skipped row: important that the is no comma in this string to test if columns do not match in skipped rows", + "a 
value 1,a value 2", + "another value 1,another value 2" + ]).build() + + data_generator = self._read_data() + + assert list(data_generator) == [ + {"header1": "a value 1", "header2": "a value 2"}, + {"header1": "another value 1", "header2": "another value 2"} + ] + + def test_given_quote_delimiter_when_read_data_then_parse_properly(self) -> None: + self._config_format.delimiter = "|" + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + "header1|header2", + "a value 1|a value 2", + ]).build() + + data_generator = self._read_data() + + assert list(data_generator) == [{"header1": "a value 1", "header2": "a value 2"}] + + def test_given_quote_char_when_read_data_then_parse_properly(self) -> None: + self._config_format.quote_char = "|" + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + "header1,header2", + "|a,value,1|,|a,value,2|", + ]).build() + + data_generator = self._read_data() + + assert list(data_generator) == [{"header1": "a,value,1", "header2": "a,value,2"}] + + def test_given_escape_char_when_read_data_then_parse_properly(self) -> None: + self._config_format.escape_char = "|" + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + "header1,header2", + '"a |"value|", 1",a value 2', + ]).build() + + data_generator = self._read_data() + + assert list(data_generator) == [{"header1": 'a "value", 1', "header2": "a value 2"}] + + def test_given_double_quote_on_when_read_data_then_parse_properly(self) -> None: + self._config_format.double_quote = True + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + "header1,header2", + '1,"Text with doublequote: ""This is a text."""', + ]).build() + + data_generator = self._read_data() + + assert list(data_generator) == [{"header1": "1", "header2": 'Text with doublequote: "This is a text."'}] + + def test_given_double_quote_off_when_read_data_then_parse_properly(self) -> None: + self._config_format.double_quote = False + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + "header1,header2", + '1,"Text with doublequote: ""This is a text."""', + ]).build() + + data_generator = self._read_data() + + assert list(data_generator) == [{"header1": "1", "header2": 'Text with doublequote: "This is a text."""'}] + + def test_given_generator_closed_when_read_data_then_unregister_dialect(self) -> None: + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + "header", + "a value", + "another value", + ]).build() + + data_generator = self._read_data() + next(data_generator) + assert f"{self._CONFIG_NAME}_config_dialect" in csv.list_dialects() + data_generator.close() + assert f"{self._CONFIG_NAME}_config_dialect" not in csv.list_dialects() + + def test_given_exception_when_read_data_then_unregister_dialect(self) -> None: + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + "header", + "a value", + "too many values,value,value,value", + ]).build() + + data_generator = self._read_data() + next(data_generator) + assert f"{self._CONFIG_NAME}_config_dialect" in csv.list_dialects() - expected_rows = expected_rows - if expected_rows is None: with pytest.raises(RecordParseError): - list(parser._read_and_cast_types(reader, schema, config_format, logger)) - else: - assert expected_rows == list(parser._read_and_cast_types(reader, schema, config_format, logger)) + next(data_generator) + assert f"{self._CONFIG_NAME}_config_dialect" not in csv.list_dialects() + + def _read_data(self) -> Generator[Dict[str, str], 
None, None]: + data_generator = self._csv_reader.read_data( + self._config, + self._file, + self._stream_reader, + self._logger, + FileReadMode.READ, + ) + return data_generator From 60757c1e21b6752b53ac4b93a1d5c493daea8621 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Fri, 4 Aug 2023 15:03:25 -0400 Subject: [PATCH 02/27] [ISSUE #28893] align with pyarrow --- .../file_based/file_types/csv_parser.py | 20 ++++++++++++++----- .../file_based/file_types/test_csv_parser.py | 15 ++++++++------ 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 3b6e6f7dc915..9c7b58aadca4 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -237,6 +237,7 @@ def _cast_types(row: Dict[str, str], property_types: Dict[str, Any], config_form class _TypeInferrer: _BOOLEAN_TYPE = "boolean" + _INTEGER_TYPE = "integer" _NUMBER_TYPE = "number" _ARRAY_TYPE = "array" _OBJECT_TYPE = "object" @@ -255,17 +256,18 @@ def infer(self) -> str: if types == {self._BOOLEAN_TYPE}: return self._BOOLEAN_TYPE - elif types == {self._NUMBER_TYPE}: + elif types == {self._INTEGER_TYPE}: + return self._INTEGER_TYPE + elif types == {self._NUMBER_TYPE} or types == {self._INTEGER_TYPE, self._NUMBER_TYPE}: return self._NUMBER_TYPE - elif types == {self._ARRAY_TYPE}: - return self._ARRAY_TYPE - elif self._ARRAY_TYPE in types or self._OBJECT_TYPE in types: - return self._OBJECT_TYPE + # to keep backward compatibility with PyArrow, we will not parse types return self._STRING_TYPE def _infer_type(self, value: str) -> str: if self._is_boolean(value): return self._BOOLEAN_TYPE + elif self._is_integer(value): + return self._INTEGER_TYPE elif self._is_number(value): return self._NUMBER_TYPE elif self._is_array(value): @@ -282,6 +284,14 @@ def _is_boolean(self, value: str) -> bool: except ValueError: return False + @staticmethod + def _is_integer(value: str) -> bool: + try: + _value_to_python_type(value, int) + return True + except ValueError: + return False + @staticmethod def _is_number(value: str) -> bool: try: diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index 48fb991f2195..4d3658091c84 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -104,20 +104,23 @@ def setUp(self) -> None: def test_given_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None: self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "boolean") - def test_given_numbers_only_when_infer_schema_then_type_is_number(self) -> None: + def test_given_integers_only_when_infer_schema_then_type_is_integer(self) -> None: + self._test_infer_schema(["2", "90329", "5645"], "integer") + + def test_given_numbers_and_integers_when_infer_schema_then_type_is_number(self) -> None: self._test_infer_schema(["2", "90329", "2.312"], "number") def test_given_arrays_only_when_infer_schema_then_type_is_array(self) -> None: - self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], "array") + self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], 
"string") def test_given_objects_only_when_infer_schema_then_type_is_object(self) -> None: - self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "object") + self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "string") def test_given_arrays_and_objects_only_when_infer_schema_then_type_is_object(self) -> None: - self._test_infer_schema(['["first_item", "second_item"]', '{"an_object_key": "an_object_value"}'], "object") + self._test_infer_schema(['["first_item", "second_item"]', '{"an_object_key": "an_object_value"}'], "string") def test_given_strings_and_objects_only_when_infer_schema_then_type_is_object(self) -> None: - self._test_infer_schema(['["first_item", "second_item"]', "this is a string"], "object") + self._test_infer_schema(['["first_item", "second_item"]', "this is a string"], "string") def test_given_strings_only_when_infer_schema_then_type_is_string(self) -> None: self._test_infer_schema(["a string", "another string"], "string") @@ -128,7 +131,7 @@ def _test_infer_schema(self, rows: List[str], expected_type: str) -> None: assert inferred_schema == {"header": {"type": expected_type}} def test_given_big_file_when_infer_schema_then_stop_early(self) -> None: - self._csv_reader.read_data.return_value = ({"header": row} for row in ["2" * 1_000_000] + ["this is a string"]) + self._csv_reader.read_data.return_value = ({"header": row} for row in ["2." + "2" * 1_000_000] + ["this is a string"]) inferred_schema = self._infer_schema() # since the type is number, we know the string at the end was not considered assert inferred_schema == {"header": {"type": "number"}} From f49235d22b038c925fd54ee5f572bb6f5fd8fd51 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Fri, 4 Aug 2023 19:08:14 +0000 Subject: [PATCH 03/27] Automated Commit - Formatting Changes --- .../sources/file_based/file_types/csv_parser.py | 10 ++++++++-- .../sources/file_based/file_types/test_csv_parser.py | 4 ++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 9c7b58aadca4..412e460846e9 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -57,7 +57,11 @@ def read_data( fp.seek(0) # we assume that if we autogenerate columns, it is because we don't have headers # if a user wants to autogenerate_column_names with a CSV having headers, he can skip rows - rows_to_skip = config_format.skip_rows_before_header + (0 if config_format.autogenerate_column_names else 1) + config_format.skip_rows_after_header + rows_to_skip = ( + config_format.skip_rows_before_header + + (0 if config_format.autogenerate_column_names else 1) + + config_format.skip_rows_after_header + ) self._skip_rows(fp, rows_to_skip) reader = csv.DictReader(fp, dialect=dialect_name, fieldnames=headers) # type: ignore @@ -174,7 +178,9 @@ def _to_nullable(row: Mapping[str, str], null_values: Set[str]) -> Dict[str, Opt return nullable @staticmethod - def _cast_types(row: Dict[str, str], property_types: Dict[str, Any], config_format: CsvFormat, logger: logging.Logger) -> Dict[str, Any]: + def _cast_types( + row: Dict[str, str], property_types: Dict[str, Any], config_format: CsvFormat, logger: logging.Logger + ) -> Dict[str, Any]: """ Casts the values in the input 'row' dictionary according to the types defined in the JSON schema. 
diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index 4d3658091c84..811cb8f9fc7c 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -8,14 +8,14 @@ import logging import unittest from typing import Any, Dict, Generator, List, Set -from unittest.mock import Mock from unittest import TestCase +from unittest.mock import Mock import pytest from airbyte_cdk.sources.file_based.config.csv_format import DEFAULT_FALSE_VALUES, DEFAULT_TRUE_VALUES, CsvFormat from airbyte_cdk.sources.file_based.exceptions import RecordParseError from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode -from airbyte_cdk.sources.file_based.file_types.csv_parser import _CsvReader, CsvParser +from airbyte_cdk.sources.file_based.file_types.csv_parser import CsvParser, _CsvReader from airbyte_cdk.sources.file_based.remote_file import RemoteFile PROPERTY_TYPES = { From a3946660772937d63fe9da58c930d50d533cea9a Mon Sep 17 00:00:00 2001 From: maxi297 Date: Mon, 7 Aug 2023 10:14:50 -0400 Subject: [PATCH 04/27] [ISSUE #28893] legacy inference and infer only when needed --- .../sources/file_based/config/csv_format.py | 18 ++++ .../file_based/file_types/csv_parser.py | 59 +++++++++++-- .../file_based/config/test_csv_format.py | 20 ++++- .../file_based/file_types/test_csv_parser.py | 84 ++++++++++++++++--- 4 files changed, 162 insertions(+), 19 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py index 1fda1016a00c..c3a75e21611e 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py @@ -78,6 +78,16 @@ class CsvFormat(BaseModel): default=DEFAULT_FALSE_VALUES, description="A set of case-sensitive strings that should be interpreted as false values.", ) + infer_datatypes: bool = Field( + title="Infer Datatypes", + default=False, + description="Whether to autogenerate the schema based the file content.", + ) + infer_datatypes_legacy: bool = Field( + title="Infer Datatypes (legacy)", + default=False, + description="Whether to autogenerate the schema based the file content. 
This inferrence does not support list and objects",
+    )
 
     @validator("delimiter")
     def validate_delimiter(cls, v: str) -> str:
@@ -114,3 +124,11 @@ def validate_option_combinations(cls, values: Mapping[str, Any]) -> Mapping[str,
         if skip_rows_before_header > 0 and auto_generate_column_names:
             raise ValueError("Cannot skip rows before header and autogenerate column names at the same time.")
         return values
+
+    @root_validator
+    def validate_option_inference(cls, values: Mapping[str, Any]) -> Mapping[str, Any]:
+        infer_datatypes = values.get("infer_datatypes", False)
+        infer_datatypes_legacy = values.get("infer_datatypes_legacy", False)
+        if infer_datatypes and infer_datatypes_legacy:
+            raise ValueError("Only one inference mode can be configured at a time")
+        return values
diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py
index 412e460846e9..7c470f6d4d0c 100644
--- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py
+++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py
@@ -5,6 +5,7 @@
 import csv
 import json
 import logging
+from abc import ABC, abstractmethod
 from collections import defaultdict
 from functools import partial
 from io import IOBase
@@ -67,9 +68,9 @@ def read_data(
         reader = csv.DictReader(fp, dialect=dialect_name, fieldnames=headers)  # type: ignore
         try:
             for row in reader:
-                # The row was not properly parsed if any of the values are None. This will most likely occur if there are more columns than
-                # headers
+                # The row was not properly parsed if any of the values are None. This will most likely occur if there are more columns
+                # than headers or more headers than columns
                 if None in row or None in row.values():
                     raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD)
                 yield row
@@ -119,11 +120,21 @@ async def infer_schema(
         stream_reader: AbstractFileBasedStreamReader,
         logger: logging.Logger,
     ) -> Dict[str, Any]:
+        if config.input_schema:
+            # FIXME: what happens if it's a string
+            return config.input_schema
+
         # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual
         # sources will likely require one. 
Rather than modify the interface now we can wait until the real use case config_format = _extract_config_format(config) type_inferrer_by_field: Dict[str, _TypeInferrer] = defaultdict( - lambda: _TypeInferrer(config_format.true_values, config_format.false_values) + lambda: _JsonTypeInferrer( + config_format.true_values, + config_format.false_values, + config_format.null_values, + config_format.infer_datatypes and not config_format.infer_datatypes_legacy + ) if config_format.infer_datatypes or config_format.infer_datatypes_legacy + else _DisabledTypeInferrer() ) data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode) read_bytes = 0 @@ -241,7 +252,26 @@ def _cast_types( return result -class _TypeInferrer: +class _TypeInferrer(ABC): + @abstractmethod + def add_value(self, value: Any) -> None: + pass + + @abstractmethod + def infer(self) -> str: + pass + + +class _DisabledTypeInferrer(_TypeInferrer): + def add_value(self, value: Any) -> None: + pass + + def infer(self) -> str: + return "string" + + +class _JsonTypeInferrer(_TypeInferrer): + _NULL_TYPE = "null" _BOOLEAN_TYPE = "boolean" _INTEGER_TYPE = "integer" _NUMBER_TYPE = "number" @@ -249,9 +279,11 @@ class _TypeInferrer: _OBJECT_TYPE = "object" _STRING_TYPE = "string" - def __init__(self, boolean_trues: Set[str], boolean_falses: Set[str]) -> None: + def __init__(self, boolean_trues: Set[str], boolean_falses: Set[str], null_values: Set[str], allow_for_objects_and_arrays: bool) -> None: self._boolean_trues = boolean_trues self._boolean_falses = boolean_falses + self._null_values = null_values + self._allow_for_objects_and_arrays = allow_for_objects_and_arrays self._values: Set[str] = set() def add_value(self, value: Any) -> None: @@ -259,18 +291,29 @@ def add_value(self, value: Any) -> None: def infer(self) -> str: types = {self._infer_type(value) for value in self._values} + if types == {self._NULL_TYPE}: + # this is highly unusual but we will consider the column as a string + return self._STRING_TYPE + types.discard(self._NULL_TYPE) if types == {self._BOOLEAN_TYPE}: return self._BOOLEAN_TYPE elif types == {self._INTEGER_TYPE}: return self._INTEGER_TYPE elif types == {self._NUMBER_TYPE} or types == {self._INTEGER_TYPE, self._NUMBER_TYPE}: return self._NUMBER_TYPE - # to keep backward compatibility with PyArrow, we will not parse types + elif not self._allow_for_objects_and_arrays: + return self._STRING_TYPE + elif types == {self._ARRAY_TYPE}: + return self._ARRAY_TYPE + elif self._ARRAY_TYPE in types or self._OBJECT_TYPE in types: + return self._OBJECT_TYPE return self._STRING_TYPE def _infer_type(self, value: str) -> str: - if self._is_boolean(value): + if value in self._null_values: + return self._NULL_TYPE + elif self._is_boolean(value): return self._BOOLEAN_TYPE elif self._is_integer(value): return self._INTEGER_TYPE diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py index 6903f126af30..edd39964124c 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py @@ -3,6 +3,7 @@ # import pytest as pytest +from typing import Type from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat @@ -15,9 +16,26 @@ pytest.param(0, False, None, id="test_no_skip_rows_before_header_and_no_autogenerate_column_names"), ] ) -def test_csv_format(skip_rows_before_header, 
autogenerate_column_names, expected_error): +def test_csv_format_skip_rows_and_autogenerate_column_names(skip_rows_before_header, autogenerate_column_names, expected_error) -> None: if expected_error: with pytest.raises(expected_error): CsvFormat(skip_rows_before_header=skip_rows_before_header, autogenerate_column_names=autogenerate_column_names) else: CsvFormat(skip_rows_before_header=skip_rows_before_header, autogenerate_column_names=autogenerate_column_names) + + +@pytest.mark.parametrize( + "infer_datatypes, infer_datatypes_legacy, expected_error", + [ + pytest.param(True, True, ValueError, id="test_many_inferences_configured"), + pytest.param(True, False, None, id="test_infer_datatypes"), + pytest.param(False, True, None, id="test_infer_datatypes_legacy"), + pytest.param(False, False, None, id="test_no_inference"), + ] +) +def test_csv_format_inference(infer_datatypes: bool, infer_datatypes_legacy: bool, expected_error: Type[BaseException]) -> None: + if expected_error: + with pytest.raises(expected_error): + CsvFormat(infer_datatypes=infer_datatypes, infer_datatypes_legacy=infer_datatypes_legacy) + else: + CsvFormat(infer_datatypes=infer_datatypes, infer_datatypes_legacy=infer_datatypes_legacy) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index 811cb8f9fc7c..5547f1254038 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -87,12 +87,19 @@ def test_cast_to_python_type(row: Dict[str, str], true_values: Set[str], false_v _DEFAULT_FALSE_VALUES = {"no", "nop", "wrong"} -class SchemaInferrenceTestCase(TestCase): +class SchemaInferenceTestCase(TestCase): + _A_NULL_VALUE = "null" + _HEADER_NAME = "header" + def setUp(self) -> None: self._config_format = CsvFormat() self._config_format.true_values = _DEFAULT_TRUE_VALUES self._config_format.false_values = _DEFAULT_FALSE_VALUES + self._config_format.null_values = {self._A_NULL_VALUE} + self._config_format.infer_datatypes = False + self._config_format.infer_datatypes_legacy = False self._config = Mock() + self._config.input_schema = None self._config.format.get.return_value = self._config_format self._file = Mock(spec=RemoteFile) @@ -101,40 +108,82 @@ def setUp(self) -> None: self._csv_reader = Mock(spec=_CsvReader) self._parser = CsvParser(self._csv_reader) + def test_given_user_schema_defined_when_infer_schema_then_return_user_schema(self) -> None: + self._config.input_schema = {self._HEADER_NAME: {"type": "potato"}} + self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "potato") + + def test_given_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None: + self._config_format.infer_datatypes = True + self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "boolean") + + def test_given_legacy_and_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None: + self._config_format.infer_datatypes_legacy = True self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "boolean") def test_given_integers_only_when_infer_schema_then_type_is_integer(self) -> None: + self._config_format.infer_datatypes = True + self._test_infer_schema(["2", "90329", "5645"], "integer") + + def test_given_legacy_and_integers_only_when_infer_schema_then_type_is_integer(self) -> None: + self._config_format.infer_datatypes_legacy 
= True self._test_infer_schema(["2", "90329", "5645"], "integer") def test_given_numbers_and_integers_when_infer_schema_then_type_is_number(self) -> None: + self._config_format.infer_datatypes = True + self._test_infer_schema(["2", "90329", "2.312"], "number") + + def test_given_legacy_and_numbers_and_integers_when_infer_schema_then_type_is_number(self) -> None: + self._config_format.infer_datatypes_legacy = True self._test_infer_schema(["2", "90329", "2.312"], "number") def test_given_arrays_only_when_infer_schema_then_type_is_array(self) -> None: + self._config_format.infer_datatypes = True + self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], "array") + + def test_given_arrays_only_when_infer_schema_then_type_is_string(self) -> None: + self._config_format.infer_datatypes_legacy = True self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], "string") def test_given_objects_only_when_infer_schema_then_type_is_object(self) -> None: + self._config_format.infer_datatypes = True + self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "object") + + def test_given_legacy_and_objects_only_when_infer_schema_then_type_is_string(self) -> None: + self._config_format.infer_datatypes_legacy = True self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "string") def test_given_arrays_and_objects_only_when_infer_schema_then_type_is_object(self) -> None: - self._test_infer_schema(['["first_item", "second_item"]', '{"an_object_key": "an_object_value"}'], "string") + self._config_format.infer_datatypes = True + self._test_infer_schema(['["first_item", "second_item"]', '{"an_object_key": "an_object_value"}'], "object") def test_given_strings_and_objects_only_when_infer_schema_then_type_is_object(self) -> None: - self._test_infer_schema(['["first_item", "second_item"]', "this is a string"], "string") + self._config_format.infer_datatypes = True + self._test_infer_schema(['["first_item", "second_item"]', "this is a string"], "object") def test_given_strings_only_when_infer_schema_then_type_is_string(self) -> None: + self._config_format.infer_datatypes = True self._test_infer_schema(["a string", "another string"], "string") - def _test_infer_schema(self, rows: List[str], expected_type: str) -> None: - self._csv_reader.read_data.return_value = ({"header": row} for row in rows) - inferred_schema = self._infer_schema() - assert inferred_schema == {"header": {"type": expected_type}} + def test_given_a_null_value_when_infer_then_ignore_null(self) -> None: + self._config_format.infer_datatypes = True + self._test_infer_schema(["2", "90329", "5645", self._A_NULL_VALUE], "integer") + + def test_given_only_null_values_when_infer_then_type_is_string(self) -> None: + self._config_format.infer_datatypes = True + self._test_infer_schema([self._A_NULL_VALUE, self._A_NULL_VALUE, self._A_NULL_VALUE], "string") def test_given_big_file_when_infer_schema_then_stop_early(self) -> None: - self._csv_reader.read_data.return_value = ({"header": row} for row in ["2." + "2" * 1_000_000] + ["this is a string"]) + self._config_format.infer_datatypes = True + self._csv_reader.read_data.return_value = ({self._HEADER_NAME: row} for row in ["2." 
+ "2" * 1_000_000] + ["this is a string"]) inferred_schema = self._infer_schema() # since the type is number, we know the string at the end was not considered - assert inferred_schema == {"header": {"type": "number"}} + assert inferred_schema == {self._HEADER_NAME: {"type": "number"}} + + def _test_infer_schema(self, rows: List[str], expected_type: str) -> None: + self._csv_reader.read_data.return_value = ({self._HEADER_NAME: row} for row in rows) + inferred_schema = self._infer_schema() + assert inferred_schema == {self._HEADER_NAME: {"type": expected_type}} def _infer_schema(self): loop = asyncio.new_event_loop() @@ -280,7 +329,7 @@ def test_given_generator_closed_when_read_data_then_unregister_dialect(self) -> data_generator.close() assert f"{self._CONFIG_NAME}_config_dialect" not in csv.list_dialects() - def test_given_exception_when_read_data_then_unregister_dialect(self) -> None: + def test_given_too_many_values_for_columns_when_read_data_then_raise_exception_and_unregister_dialect(self) -> None: self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ "header", "a value", @@ -295,6 +344,21 @@ def test_given_exception_when_read_data_then_unregister_dialect(self) -> None: next(data_generator) assert f"{self._CONFIG_NAME}_config_dialect" not in csv.list_dialects() + def test_given_too_few_values_for_columns_when_read_data_then_raise_exception_and_unregister_dialect(self) -> None: + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + "header1,header2,header3", + "value1,value2,value3", + "a value", + ]).build() + + data_generator = self._read_data() + next(data_generator) + assert f"{self._CONFIG_NAME}_config_dialect" in csv.list_dialects() + + with pytest.raises(RecordParseError): + next(data_generator) + assert f"{self._CONFIG_NAME}_config_dialect" not in csv.list_dialects() + def _read_data(self) -> Generator[Dict[str, str], None, None]: data_generator = self._csv_reader.read_data( self._config, From 4f9d162b86e1d16aca1f153e321f882dbc639443 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Mon, 7 Aug 2023 11:47:05 -0400 Subject: [PATCH 05/27] [ISSUE #28893] fix scenario tests --- .../file_based/file_types/csv_parser.py | 2 +- .../file_based/scenarios/csv_scenarios.py | 17 ++++++++-- .../scenarios/validation_policy_scenarios.py | 34 +++++++++---------- 3 files changed, 33 insertions(+), 20 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 7c470f6d4d0c..953c87fc70a1 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -121,7 +121,7 @@ async def infer_schema( logger: logging.Logger, ) -> Dict[str, Any]: if config.input_schema: - # FIXME: what happens if it's a string + # FIXME change type of method to Mapping return config.input_schema # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py index c5714cc570b3..b928d7445c37 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -8,7 +8,7 @@ single_csv_scenario = ( TestScenarioBuilder() - 
.set_name("single_csv_stream") + .set_name("single_csv_scenario") .set_config( { "streams": [ @@ -249,7 +249,19 @@ "type": "string" }, "uniqueItems": True - } + }, + "infer_datatypes": { + "default": False, + "description": "Whether to autogenerate the schema based the file content.", + "title": "Infer Datatypes", + "type": "boolean" + }, + "infer_datatypes_legacy": { + "default": False, + "description": "Whether to autogenerate the schema based the file content. This inferrence does not support list and objects", + "title": "Infer Datatypes (legacy)", + "type": "boolean" + }, } }, { @@ -1999,6 +2011,7 @@ "message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. stream=stream1 file=a.csv line_no=2 n_skipped=0", } ]}) + .set_expected_discover_error(SchemaInferenceError, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value) ).build() csv_escape_char_is_set_scenario = ( diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py index 55d0d40911f9..22bff3f66f13 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py @@ -12,8 +12,8 @@ "a.csv": { # The records in this file do not conform to the schema "contents": [ ("col1", "col2"), - ("val_a_11", "val_a_21"), - ("val_a_12", "val_a_22"), + ("val_a_11", "1"), + ("val_a_12", "2"), ], "last_modified": "2023-06-05T03:54:07.000Z", }, @@ -29,7 +29,7 @@ "contents": [ ("col1",), ("val_c_11",), - ("val_c_12","val_c_22"), # This record doesn't conform to the schema + ("val_c_12","val_c_22"), # This record is not parsable ("val_c_13",), ], "last_modified": "2023-06-05T03:54:07.000Z", @@ -55,9 +55,9 @@ "col1": { "type": "string", }, - # "col2": { # remove this so the record does not conform to the schema - # "type": "string", - # }, + "col2": { + "type": "number", + }, "_ab_source_file_last_modified": { "type": "string" }, @@ -100,7 +100,7 @@ "contents": [ ("col1",), ("val_aa3_11",), - ("val_aa3_12", "val_aa3_22"), # This record does not conform to the schema + ("val_aa3_12", "val_aa3_22"), # This record is not parsable ("val_aa3_13",), ], "last_modified": "2023-06-05T03:54:07.000Z", @@ -219,7 +219,7 @@ {"data": {"col1": "val_b_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, {"data": {"col1": "val_c_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_c_12", None: "val_c_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted - {"data": {"col1": "val_c_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, + # {"data": {"col1": "val_c_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # Skipped since previous record is malformed {"data": {"col1": "val_d_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "d.csv"}, "stream": "stream1"}, ] ) @@ -230,8 +230,8 @@ "message": "Records in file did not pass validation 
policy. stream=stream1 file=a.csv n_skipped=2 validation_policy=skip_record", }, { - "level": "WARN", - "message": "Records in file did not pass validation policy. stream=stream1 file=c.csv n_skipped=1 validation_policy=skip_record", + "level": "ERROR", + "message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. stream=stream1 file=c.csv line_no=2 n_skipped=0", }, ] }) @@ -268,7 +268,7 @@ {"data": {"col1": "val_aa2_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, {"data": {"col1": "val_aa3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted - {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, + # {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # Skipped since previous record is malformed {"data": {"col1": "val_aa4_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a4.csv"}, "stream": "stream1"}, {"data": {"col1": "val_bb1_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, # This record is skipped because it does not conform {"data": {"col1": "val_bb1_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, # This record is skipped because it does not conform @@ -285,8 +285,8 @@ "message": "Records in file did not pass validation policy. stream=stream1 file=a/a1.csv n_skipped=2 validation_policy=skip_record", }, { - "level": "WARN", - "message": "Records in file did not pass validation policy. stream=stream1 file=a/a3.csv n_skipped=1 validation_policy=skip_record", + "level": "ERROR", + "message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. 
stream=stream1 file=a/a3.csv line_no=2 n_skipped=0", }, { "level": "WARN", @@ -314,14 +314,14 @@ ) .set_expected_records( [ - {"data": {"col1": "val_a_11", "col2": "val_a_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_a_12", "col2": "val_a_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_a_11", "col2": "1", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_a_12", "col2": "2", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, {"data": {"col1": "val_b_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, {"data": {"col1": "val_b_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, {"data": {"col1": "val_c_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_c_12", None: "val_c_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted # {"data": {"col1": "val_c_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # No more records from this stream are emitted after we hit a parse error - # {"data": {"col1": "val_d_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "d.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_d_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "d.csv"}, "stream": "stream1"}, ] ) .set_expected_logs({ @@ -366,7 +366,7 @@ {"data": {"col1": "val_aa3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted # {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # No more records from this stream are emitted after we hit a parse error - # {"data": {"col1": "val_aa4_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a4.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa4_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a4.csv"}, "stream": "stream1"}, {"data": {"col1": "val_bb1_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, {"data": {"col1": "val_bb1_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, {"data": {"col1": "val_bb2_11", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, From 0617c823565f6e5924301aca049c5c766b0c8a51 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Mon, 7 Aug 2023 
16:40:38 -0400 Subject: [PATCH 06/27] [ISSUE #28893] using discovered schema as part of read --- ...efault_file_based_availability_strategy.py | 4 +- .../file_based/file_types/avro_parser.py | 9 +- .../file_based/file_types/csv_parser.py | 17 +- .../file_based/file_types/file_type_parser.py | 6 +- .../file_based/file_types/jsonl_parser.py | 7 +- .../file_based/file_types/parquet_parser.py | 7 +- .../stream/default_file_based_stream.py | 2 +- .../file_based/file_types/test_csv_parser.py | 5 +- .../file_based/scenarios/csv_scenarios.py | 1 - .../scenarios/validation_policy_scenarios.py | 173 +++++++++++------- 10 files changed, 147 insertions(+), 84 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py index 52563ca1e46e..f982e13c158a 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py @@ -16,6 +16,8 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy): + _WITHOUT_SCHEMA = None + def __init__(self, stream_reader: AbstractFileBasedStreamReader): self.stream_reader = stream_reader @@ -82,7 +84,7 @@ def _check_parse_record(self, stream: AbstractFileBasedStream, file: RemoteFile, parser = stream.get_parser(stream.config.file_type) try: - record = next(iter(parser.parse_records(stream.config, file, self.stream_reader, logger))) + record = next(iter(parser.parse_records(stream.config, file, self.stream_reader, logger, self._WITHOUT_SCHEMA))) except StopIteration: # The file is empty. 
We've verified that we can open it, so will # consider the connection check successful even though it means diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py index 91e1cf4eb08b..bca22b191040 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py @@ -4,13 +4,13 @@ import logging import uuid -from typing import Any, Dict, Iterable, Mapping +from typing import Any, Dict, Iterable, Mapping, Optional import fastavro from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode -from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser, Schema from airbyte_cdk.sources.file_based.remote_file import RemoteFile AVRO_TYPE_TO_JSON_TYPE = { @@ -45,7 +45,7 @@ async def infer_schema( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Dict[str, Any]: + ) -> Schema: avro_format = config.format[config.file_type] if config.format else AvroFormat() if not isinstance(avro_format, AvroFormat): raise ValueError(f"Expected ParquetFormat, got {avro_format}") @@ -60,7 +60,7 @@ async def infer_schema( field["name"]: AvroParser._convert_avro_type_to_json(avro_format, field["name"], field["type"]) for field in avro_schema["fields"] } - return json_schema + return json_schema # type: ignore @classmethod def _convert_avro_type_to_json(cls, avro_format: AvroFormat, field_name: str, avro_field: str) -> Mapping[str, Any]: @@ -130,6 +130,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, + discovered_schema: Optional[Schema], ) -> Iterable[Dict[str, Any]]: avro_format = config.format[config.file_type] if config.format else AvroFormat() if not isinstance(avro_format, AvroFormat): diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 953c87fc70a1..d12f5e0b761e 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -15,7 +15,7 @@ from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode -from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser, Schema from airbyte_cdk.sources.file_based.remote_file import RemoteFile from airbyte_cdk.sources.file_based.schema_helpers import TYPE_PYTHON_MAPPING @@ -119,10 +119,10 @@ async def infer_schema( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Dict[str, Any]: + ) -> Schema: if config.input_schema: # FIXME change type of method to Mapping - return config.input_schema + return config.input_schema # 
type: ignore # conversion to mapping is handled by pydantic and we shouldn't have a str here # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual # sources will likely require one. Rather than modify the interface now we can wait until the real use case @@ -158,10 +158,10 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, + discovered_schema: Optional[Schema], ) -> Iterable[Dict[str, Any]]: - schema: Mapping[str, Any] = config.input_schema # type: ignore config_format = _extract_config_format(config) - cast_fn = CsvParser._get_cast_function(schema, config_format, logger) + cast_fn = CsvParser._get_cast_function(discovered_schema, config_format, logger) data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode) for row in data_generator: yield CsvParser._to_nullable(cast_fn(row), config_format.null_values) @@ -206,6 +206,13 @@ def _cast_types( prop_type = property_types.get(key) cast_value: Any = value + if isinstance(prop_type, list): + prop_type_distinct = set(prop_type) + prop_type_distinct.remove("null") + if len(prop_type_distinct) != 1: + raise ValueError(f"Could not get non nullable type from {prop_type}") + (prop_type,) = prop_type_distinct + if prop_type in TYPE_PYTHON_MAPPING and prop_type is not None: _, python_type = TYPE_PYTHON_MAPPING[prop_type] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py index 41abc8de37a8..26547cf3d63b 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py @@ -4,13 +4,14 @@ import logging from abc import ABC, abstractmethod -from typing import Any, Dict, Iterable +from typing import Any, Dict, Iterable, Mapping, Optional from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode from airbyte_cdk.sources.file_based.remote_file import RemoteFile -Schema = Dict[str, str] +# Schema example: {"column1": {"type": "string"}} +Schema = Mapping[str, Dict[str, Any]] Record = Dict[str, Any] @@ -40,6 +41,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, + discovered_schema: Optional[Schema], ) -> Iterable[Record]: """ Parse and emit each record. 
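The hunks above thread a discovered_schema through parse_records and teach CsvParser._cast_types to handle nullable union types such as ["null", "integer"]. A minimal standalone sketch of that reduction (illustrative only, not part of the patch; the helper name is hypothetical):

from typing import List, Union

def _non_null_prop_type(prop_type: Union[str, List[str]]) -> str:
    # Mirrors the list-handling branch added to CsvParser._cast_types: a nullable
    # union type is reduced to its single non-null member before the value is cast.
    if isinstance(prop_type, list):
        distinct = set(prop_type) - {"null"}
        if len(distinct) != 1:
            raise ValueError(f"Could not get non nullable type from {prop_type}")
        (prop_type,) = distinct
    return prop_type

assert _non_null_prop_type(["null", "integer"]) == "integer"
assert _non_null_prop_type("string") == "string"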
diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py index d256b72ee4e3..be595f4acc6c 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py @@ -4,11 +4,11 @@ import json import logging -from typing import Any, Dict, Iterable +from typing import Any, Dict, Iterable, Optional from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode -from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser, Schema from airbyte_cdk.sources.file_based.remote_file import RemoteFile from airbyte_cdk.sources.file_based.schema_helpers import PYTHON_TYPE_MAPPING, merge_schemas @@ -23,7 +23,7 @@ async def infer_schema( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Dict[str, Any]: + ) -> Schema: """ Infers the schema for the file by inferring the schema for each line, and merging it with the previously-inferred schema. @@ -52,6 +52,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, + discovered_schema: Optional[Schema], ) -> Iterable[Dict[str, Any]]: with stream_reader.open_file(file, self.file_read_mode, logger) as fp: for line in fp: diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py index 09d66a1f25b6..16cf63641744 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py @@ -5,14 +5,14 @@ import json import logging import os -from typing import Any, Dict, Iterable, List, Mapping +from typing import Any, Dict, Iterable, List, Mapping, Optional from urllib.parse import unquote import pyarrow as pa import pyarrow.parquet as pq from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ParquetFormat from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode -from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser, Schema from airbyte_cdk.sources.file_based.remote_file import RemoteFile from pyarrow import Scalar @@ -24,7 +24,7 @@ async def infer_schema( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Dict[str, Any]: + ) -> Schema: parquet_format = config.format[config.file_type] if config.format else ParquetFormat() if not isinstance(parquet_format, ParquetFormat): raise ValueError(f"Expected ParquetFormat, got {parquet_format}") @@ -47,6 +47,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, + discovered_schema: Optional[Schema], ) -> Iterable[Dict[str, Any]]: parquet_format = config.format[config.file_type] if config.format else ParquetFormat() if not isinstance(parquet_format, ParquetFormat): diff --git 
a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py index 76093016e2d5..e08ec7fdf45a 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py @@ -84,7 +84,7 @@ def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[Mapping n_skipped = line_no = 0 try: - for record in parser.parse_records(self.config, file, self._stream_reader, self.logger): + for record in parser.parse_records(self.config, file, self._stream_reader, self.logger, schema): line_no += 1 if self.config.schemaless: record = {"data": record} diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index 5547f1254038..24f287054c64 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -28,6 +28,7 @@ "col7": "array", "col8": "array", "col9": "array", + "col10": ["null", "string"], } logger = logging.getLogger() @@ -47,6 +48,7 @@ "col7": '[1, 2]', "col8": '["1", "2"]', "col9": '[{"a": "b"}, {"a": "c"}]', + "col10": 'asdf', }, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, @@ -60,6 +62,7 @@ "col7": [1, 2], "col8": ["1", "2"], "col9": [{"a": "b"}, {"a": "c"}], + "col10": 'asdf', }, id="cast-all-cols"), pytest.param({"col1": "1"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col1": "1"}, id="cannot-cast-to-null"), pytest.param({"col2": "1"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col2": True}, id="cast-1-to-bool"), @@ -75,7 +78,7 @@ pytest.param({"col7": "['a', 'b']"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col7": "['a', 'b']"}, id="cannot-cast-to-list-of-ints"), pytest.param({"col8": "['a', 'b']"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col8": "['a', 'b']"}, id="cannot-cast-to-list-of-strings"), pytest.param({"col9": "['a', 'b']"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col9": "['a', 'b']"}, id="cannot-cast-to-list-of-objects"), - pytest.param({"col10": "x"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col10": "x"}, id="item-not-in-props-doesn't-error"), + pytest.param({"col11": "x"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col11": "x"}, id="item-not-in-props-doesn't-error"), ] ) def test_cast_to_python_type(row: Dict[str, str], true_values: Set[str], false_values: Set[str], expected_output: Dict[str, Any]) -> None: diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py index b928d7445c37..1517abdf6885 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -594,7 +594,6 @@ }, ] ) - .set_expected_logs({"discover": [{"level": "WARN", "message": "Refusing to infer schema for all 2 files; using 1 files."}]}) .set_discovery_policy(LowInferenceLimitDiscoveryPolicy()) ).build() diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py index 22bff3f66f13..c036140d9715 100644 --- 
a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py @@ -19,9 +19,9 @@ }, "b.csv": { "contents": [ - ("col1",), - ("val_b_11",), - ("val_b_12",), + ("col1", "col2"), + ("val_b_11", "this is text that will trigger validation policy"), + ("val_b_12", "2"), ], "last_modified": "2023-06-05T03:54:07.000Z", }, @@ -29,7 +29,7 @@ "contents": [ ("col1",), ("val_c_11",), - ("val_c_12","val_c_22"), # This record is not parsable + ("val_c_12", "val_c_22"), # This record is not parsable ("val_c_13",), ], "last_modified": "2023-06-05T03:54:07.000Z", @@ -56,7 +56,7 @@ "type": "string", }, "col2": { - "type": "number", + "type": "integer", }, "_ab_source_file_last_modified": { "type": "string" @@ -80,19 +80,19 @@ TestScenarioBuilder() .set_files( { - "a/a1.csv": { # The records in this file do not conform to the schema + "a/a1.csv": { "contents": [ ("col1", "col2"), - ("val_aa1_11", "val_aa1_21"), - ("val_aa1_12", "val_aa1_22"), + ("val_aa1_11", "1"), + ("val_aa1_12", "2"), ], "last_modified": "2023-06-05T03:54:07.000Z", }, "a/a2.csv": { "contents": [ - ("col1",), - ("val_aa2_11",), - ("val_aa2_12",), + ("col1", "col2"), + ("val_aa2_11", "this is text that will trigger validation policy"), + ("val_aa2_12", "2"), ], "last_modified": "2023-06-05T03:54:07.000Z", }, @@ -115,17 +115,17 @@ "b/b1.csv": { # The records in this file do not conform to the schema "contents": [ - ("col1",), - ("val_bb1_11",), - ("val_bb1_12",), + ("col1", "col2"), + ("val_bb1_11", "1"), + ("val_bb1_12", "2"), ], "last_modified": "2023-06-05T03:54:07.000Z", }, "b/b2.csv": { "contents": [ ("col1", "col2"), - ("val_bb2_11", "val_bb2_21"), - ("val_bb2_12", "val_bb2_22"), + ("val_bb2_11", "this is text that will trigger validation policy"), + ("val_bb2_12", "2"), ], "last_modified": "2023-06-05T03:54:07.000Z", }, @@ -152,9 +152,9 @@ "col1": { "type": "string", }, - # "col2": { # remove this so the record does not conform to the schema - # "type": "string", - # }, + "col2": { + "type": "integer", + }, "_ab_source_file_last_modified": { "type": "string" }, @@ -175,9 +175,9 @@ "col1": { "type": "string", }, - # "col2": { # remove this so the record does not conform to the schema - # "type": "string", - # }, + "col2": { + "type": "integer", + }, "_ab_source_file_last_modified": { "type": "string" }, @@ -213,10 +213,10 @@ ) .set_expected_records( [ - # {"data": {"col1": "val_a_11", "col2": "val_a_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform - # {"data": {"col1": "val_a_12", "col2": "val_a_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform - {"data": {"col1": "val_b_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_b_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_a_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_a_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + # 
{"data": {"col1": "val_b_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform + {"data": {"col1": "val_b_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, {"data": {"col1": "val_c_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_c_12", None: "val_c_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted # {"data": {"col1": "val_c_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # Skipped since previous record is malformed @@ -227,12 +227,17 @@ "read": [ { "level": "WARN", - "message": "Records in file did not pass validation policy. stream=stream1 file=a.csv n_skipped=2 validation_policy=skip_record", + "message": "Records in file did not pass validation policy. stream=stream1 file=b.csv n_skipped=1 validation_policy=skip_record", }, { "level": "ERROR", "message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. stream=stream1 file=c.csv line_no=2 n_skipped=0", }, + { + # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol + "level": "WARNING", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, ] }) ).build() @@ -262,18 +267,18 @@ ) .set_expected_records( [ - # {"data": {"col1": "val_aa1_11", "col2": "val_aa1_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform - # {"data": {"col1": "val_aa1_12", "col2": "val_aa1_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform - {"data": {"col1": "val_aa2_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_aa2_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa1_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa1_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, + #{"data": {"col1": "val_aa2_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform + {"data": {"col1": "val_aa2_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, {"data": {"col1": "val_aa3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", 
"_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted # {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # Skipped since previous record is malformed {"data": {"col1": "val_aa4_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a4.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_bb1_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, # This record is skipped because it does not conform - {"data": {"col1": "val_bb1_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, # This record is skipped because it does not conform - # {"data": {"col1": "val_bb2_11", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, - # {"data": {"col1": "val_bb2_12", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb1_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb1_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, + # {"data": {"col1": "val_bb2_11", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, # This record is skipped because it does not conform + {"data": {"col1": "val_bb2_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, {"data": {"col1": "val_bb3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, {"data": {"col1": "val_bb3_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, ] @@ -282,7 +287,7 @@ "read": [ { "level": "WARN", - "message": "Records in file did not pass validation policy. stream=stream1 file=a/a1.csv n_skipped=2 validation_policy=skip_record", + "message": "Records in file did not pass validation policy. stream=stream1 file=a/a2.csv n_skipped=1 validation_policy=skip_record", }, { "level": "ERROR", @@ -290,7 +295,17 @@ }, { "level": "WARN", - "message": "Records in file did not pass validation policy. stream=stream2 file=b/b2.csv n_skipped=2 validation_policy=skip_record", + "message": "Records in file did not pass validation policy. 
stream=stream2 file=b/b2.csv n_skipped=1 validation_policy=skip_record", + }, + { + # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol + "level": "WARNING", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, + { + # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol + "level": "WARNING", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, ] }) @@ -314,10 +329,10 @@ ) .set_expected_records( [ - {"data": {"col1": "val_a_11", "col2": "1", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_a_12", "col2": "2", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_b_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_b_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_a_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_a_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_b_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform + {"data": {"col1": "val_b_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, {"data": {"col1": "val_c_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_c_12", None: "val_c_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted # {"data": {"col1": "val_c_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # No more records from this stream are emitted after we hit a parse error @@ -330,6 +345,11 @@ "level": "ERROR", "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=c.csv line_no=2 n_skipped=0", }, + { + # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol + "level": "WARNING", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, ] }) ).build() @@ -359,18 +379,18 @@ ) .set_expected_records( [ - {"data": {"col1": "val_aa1_11", "col2": "val_aa1_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_aa1_12", "col2": "val_aa1_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, - 
{"data": {"col1": "val_aa2_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_aa2_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa1_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa1_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa2_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa2_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, {"data": {"col1": "val_aa3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted - # {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # No more records from this stream are emitted after we hit a parse error + # {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # Skipped since previous record is malformed {"data": {"col1": "val_aa4_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a4.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_bb1_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, - {"data": {"col1": "val_bb1_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, - {"data": {"col1": "val_bb2_11", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, - {"data": {"col1": "val_bb2_12", "col2": "val_bb2_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb1_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb1_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb2_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb2_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, {"data": {"col1": "val_bb3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, {"data": {"col1": 
"val_bb3_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, ] @@ -381,6 +401,16 @@ "level": "ERROR", "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a/a3.csv line_no=2 n_skipped=0", }, + { + # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol + "level": "WARNING", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, + { + # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol + "level": "WARNING", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, ] }) ).build() @@ -401,14 +431,21 @@ ] } ) - .set_expected_records( - [] # No records are expected because the very first file did not conform to the schema - ) + .set_expected_records([ + {"data": {"col1": "val_a_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_a_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + # No records past that because the first record for the second file did not conform to the schema + ]) .set_expected_logs({ "read": [ { "level": "WARN", - "message": "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. stream=stream1 file=a.csv validation_policy=wait_for_discover n_skipped=0", + "message": "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. 
stream=stream1 file=b.csv validation_policy=wait_for_discover n_skipped=0", + }, + { + # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol + "level": "WARNING", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, ] }) @@ -439,18 +476,18 @@ ) .set_expected_records( [ - # {"data": {"col1": "val_aa1_11", "col2": "val_aa1_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, # The first record does not conform so we don't sync anything from this stream - # {"data": {"col1": "val_aa1_12", "col2": "val_aa1_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, - # {"data": {"col1": "val_aa2_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, - # {"data": {"col1": "val_aa2_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa1_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa1_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, + # {"data": {"col1": "val_aa2_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, + # {"data": {"col1": "val_aa2_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_aa3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_aa4_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a4.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_bb1_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, - {"data": {"col1": "val_bb1_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, - # {"data": {"col1": "val_bb2_11", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, # No more records from this stream are emitted after a nonconforming record is encountered - # {"data": {"col1": "val_bb2_12", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb1_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb1_12", "col2": 2, "_ab_source_file_last_modified": 
"2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, + # {"data": {"col1": "val_bb2_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, + # {"data": {"col1": "val_bb2_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, # {"data": {"col1": "val_bb3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, # {"data": {"col1": "val_bb3_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, ] @@ -459,12 +496,22 @@ "read": [ { "level": "WARN", - "message": "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. stream=stream1 file=a/a1.csv validation_policy=wait_for_discover n_skipped=0", + "message": "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. stream=stream1 file=a/a2.csv validation_policy=wait_for_discover n_skipped=0", }, { "level": "WARN", "message": "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. stream=stream2 file=b/b2.csv validation_policy=wait_for_discover n_skipped=0", }, + { + # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol + "level": "WARNING", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, + { + # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol + "level": "WARNING", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, ] }) ).build() From d157aa35a478a1788ef3bb434124c5cb0323957b Mon Sep 17 00:00:00 2001 From: maxi297 Date: Tue, 8 Aug 2023 09:38:28 -0400 Subject: [PATCH 07/27] [ISSUE #28893] self-review + cleanup --- .../sources/file_based/config/csv_format.py | 4 ++-- .../file_based/file_types/avro_parser.py | 9 +++++---- .../sources/file_based/file_types/csv_parser.py | 17 ++++++++++------- .../file_based/file_types/file_type_parser.py | 9 ++++----- .../file_based/file_types/jsonl_parser.py | 12 ++++++------ .../file_based/file_types/parquet_parser.py | 7 ++++--- .../sources/file_based/schema_helpers.py | 6 +++--- .../stream/default_file_based_stream.py | 10 +++++----- .../file_based/config/test_csv_format.py | 3 ++- .../file_based/file_types/test_csv_parser.py | 1 - .../scenarios/validation_policy_scenarios.py | 4 ++-- 11 files changed, 43 insertions(+), 39 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py index c3a75e21611e..0312f1f55a0a 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py @@ -86,7 +86,7 @@ class CsvFormat(BaseModel): infer_datatypes_legacy: bool = Field( title="Infer Datatypes (legacy)", default=False, - description="Whether to autogenerate the schema based the file content. 
This inferrence does not support list and objects", + description="Whether to autogenerate the schema based the file content. This inference does not support list and objects.", ) @validator("delimiter") @@ -130,5 +130,5 @@ def validate_option_inference(cls, values: Mapping[str, Any]) -> Mapping[str, An infer_datatypes = values.get("infer_datatypes", False) infer_datatypes_legacy = values.get("infer_datatypes_legacy", False) if infer_datatypes and infer_datatypes_legacy: - raise ValueError("Only one way to inference can be configured at the same time") + raise ValueError("Only one way to infer can be configured but both infer_datatypes and infer_datatypes_legacy are enabled.") return values diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py index bca22b191040..333ab5c04169 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py @@ -10,8 +10,9 @@ from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode -from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser, Schema +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_helpers import SchemaType AVRO_TYPE_TO_JSON_TYPE = { "null": "null", @@ -45,7 +46,7 @@ async def infer_schema( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Schema: + ) -> SchemaType: avro_format = config.format[config.file_type] if config.format else AvroFormat() if not isinstance(avro_format, AvroFormat): raise ValueError(f"Expected ParquetFormat, got {avro_format}") @@ -60,7 +61,7 @@ async def infer_schema( field["name"]: AvroParser._convert_avro_type_to_json(avro_format, field["name"], field["type"]) for field in avro_schema["fields"] } - return json_schema # type: ignore + return json_schema @classmethod def _convert_avro_type_to_json(cls, avro_format: AvroFormat, field_name: str, avro_field: str) -> Mapping[str, Any]: @@ -130,7 +131,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[Schema], + discovered_schema: Optional[SchemaType], ) -> Iterable[Dict[str, Any]]: avro_format = config.format[config.file_type] if config.format else AvroFormat() if not isinstance(avro_format, AvroFormat): diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index d12f5e0b761e..20b2aed0347c 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -15,9 +15,9 @@ from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode -from 
airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser, Schema +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser from airbyte_cdk.sources.file_based.remote_file import RemoteFile -from airbyte_cdk.sources.file_based.schema_helpers import TYPE_PYTHON_MAPPING +from airbyte_cdk.sources.file_based.schema_helpers import TYPE_PYTHON_MAPPING, SchemaType DIALECT_NAME = "_config_dialect" @@ -119,7 +119,7 @@ async def infer_schema( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Schema: + ) -> SchemaType: if config.input_schema: # FIXME change type of method to Mapping return config.input_schema # type: ignore # conversion to mapping is handled by pydantic and we shouldn't have a str here @@ -132,8 +132,9 @@ async def infer_schema( config_format.true_values, config_format.false_values, config_format.null_values, - config_format.infer_datatypes and not config_format.infer_datatypes_legacy - ) if config_format.infer_datatypes or config_format.infer_datatypes_legacy + config_format.infer_datatypes and not config_format.infer_datatypes_legacy, + ) + if config_format.infer_datatypes or config_format.infer_datatypes_legacy else _DisabledTypeInferrer() ) data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode) @@ -158,7 +159,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[Schema], + discovered_schema: Optional[SchemaType], ) -> Iterable[Dict[str, Any]]: config_format = _extract_config_format(config) cast_fn = CsvParser._get_cast_function(discovered_schema, config_format, logger) @@ -286,7 +287,9 @@ class _JsonTypeInferrer(_TypeInferrer): _OBJECT_TYPE = "object" _STRING_TYPE = "string" - def __init__(self, boolean_trues: Set[str], boolean_falses: Set[str], null_values: Set[str], allow_for_objects_and_arrays: bool) -> None: + def __init__( + self, boolean_trues: Set[str], boolean_falses: Set[str], null_values: Set[str], allow_for_objects_and_arrays: bool + ) -> None: self._boolean_trues = boolean_trues self._boolean_falses = boolean_falses self._null_values = null_values diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py index 26547cf3d63b..9591612bb912 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py @@ -4,14 +4,13 @@ import logging from abc import ABC, abstractmethod -from typing import Any, Dict, Iterable, Mapping, Optional +from typing import Any, Dict, Iterable, Optional from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_helpers import SchemaType -# Schema example: {"column1": {"type": "string"}} -Schema = Mapping[str, Dict[str, Any]] Record = Dict[str, Any] @@ -28,7 +27,7 @@ async def infer_schema( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Schema: + ) -> SchemaType: """ Infer the JSON Schema for this file. 
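As a usage note on the new CsvFormat inference options, the validate_option_inference validator shown earlier rejects configurations that enable both modes. A hedged sketch (assuming pydantic v1 wraps the validator's ValueError in a ValidationError, as it does for root validators):

import pydantic
from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat

CsvFormat(infer_datatypes=True)            # JSON-type inference
CsvFormat(infer_datatypes_legacy=True)     # legacy inference, no lists/objects
try:
    CsvFormat(infer_datatypes=True, infer_datatypes_legacy=True)
except pydantic.ValidationError as error:
    print(error)  # only one inference mode can be enabled at a time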
""" @@ -41,7 +40,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[Schema], + discovered_schema: Optional[SchemaType], ) -> Iterable[Record]: """ Parse and emit each record. diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py index be595f4acc6c..19067064cc28 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py @@ -4,13 +4,13 @@ import json import logging -from typing import Any, Dict, Iterable, Optional +from typing import Any, Dict, Iterable, Mapping, Optional from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode -from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser, Schema +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser from airbyte_cdk.sources.file_based.remote_file import RemoteFile -from airbyte_cdk.sources.file_based.schema_helpers import PYTHON_TYPE_MAPPING, merge_schemas +from airbyte_cdk.sources.file_based.schema_helpers import PYTHON_TYPE_MAPPING, SchemaType, merge_schemas class JsonlParser(FileTypeParser): @@ -23,12 +23,12 @@ async def infer_schema( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Schema: + ) -> SchemaType: """ Infers the schema for the file by inferring the schema for each line, and merging it with the previously-inferred schema. 
""" - inferred_schema: Dict[str, Any] = {} + inferred_schema: Mapping[str, Any] = {} read_bytes = 0 with stream_reader.open_file(file, self.file_read_mode, logger) as fp: @@ -52,7 +52,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[Schema], + discovered_schema: Optional[SchemaType], ) -> Iterable[Dict[str, Any]]: with stream_reader.open_file(file, self.file_read_mode, logger) as fp: for line in fp: diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py index 16cf63641744..2f19069a696e 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py @@ -12,8 +12,9 @@ import pyarrow.parquet as pq from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ParquetFormat from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode -from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser, Schema +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_helpers import SchemaType from pyarrow import Scalar @@ -24,7 +25,7 @@ async def infer_schema( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Schema: + ) -> SchemaType: parquet_format = config.format[config.file_type] if config.format else ParquetFormat() if not isinstance(parquet_format, ParquetFormat): raise ValueError(f"Expected ParquetFormat, got {parquet_format}") @@ -47,7 +48,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[Schema], + discovered_schema: Optional[SchemaType], ) -> Iterable[Dict[str, Any]]: parquet_format = config.format[config.file_type] if config.format else ParquetFormat() if not isinstance(parquet_format, ParquetFormat): diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_helpers.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_helpers.py index 597c919e39e6..3f7b2151653f 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_helpers.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_helpers.py @@ -11,7 +11,7 @@ from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, SchemaInferenceError JsonSchemaSupportedType = Union[List[str], Literal["string"], str] -SchemaType = Dict[str, Dict[str, JsonSchemaSupportedType]] +SchemaType = Mapping[str, Mapping[str, JsonSchemaSupportedType]] schemaless_schema = {"type": "object", "properties": {"data": {"type": "object"}}} @@ -99,7 +99,7 @@ def merge_schemas(schema1: SchemaType, schema2: SchemaType) -> SchemaType: if not isinstance(t, dict) or "type" not in t or not _is_valid_type(t["type"]): raise SchemaInferenceError(FileBasedSourceError.UNRECOGNIZED_TYPE, key=k, type=t) - merged_schema: Dict[str, Any] = deepcopy(schema1) + merged_schema: Dict[str, Any] = deepcopy(schema1) # type: ignore # as of 2023-08-08, deepcopy can copy Mapping for k2, t2 in schema2.items(): t1 = merged_schema.get(k2) if t1 is None: @@ -116,7 +116,7 @@ def _is_valid_type(t: JsonSchemaSupportedType) -> 
bool: return t == "array" or get_comparable_type(t) is not None -def _choose_wider_type(key: str, t1: Dict[str, Any], t2: Dict[str, Any]) -> Dict[str, Any]: +def _choose_wider_type(key: str, t1: Mapping[str, Any], t2: Mapping[str, Any]) -> Mapping[str, Any]: if (t1["type"] == "array" or t2["type"] == "array") and t1 != t2: raise SchemaInferenceError( FileBasedSourceError.SCHEMA_INFERENCE_ERROR, diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py index e08ec7fdf45a..8ac3fd0ee44a 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py @@ -6,7 +6,7 @@ import itertools import traceback from functools import cache -from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Union +from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Set, Union from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level from airbyte_cdk.models import Type as MessageType @@ -20,7 +20,7 @@ StopSyncPerValidationPolicy, ) from airbyte_cdk.sources.file_based.remote_file import RemoteFile -from airbyte_cdk.sources.file_based.schema_helpers import merge_schemas, schemaless_schema +from airbyte_cdk.sources.file_based.schema_helpers import SchemaType, merge_schemas, schemaless_schema from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream from airbyte_cdk.sources.file_based.stream.cursor import FileBasedCursor from airbyte_cdk.sources.file_based.types import StreamSlice @@ -227,8 +227,8 @@ async def _infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]: Each file type has a corresponding `infer_schema` handler. Dispatch on file type. """ - base_schema: Dict[str, Any] = {} - pending_tasks: Set[asyncio.tasks.Task[Dict[str, Any]]] = set() + base_schema: SchemaType = {} + pending_tasks: Set[asyncio.tasks.Task[SchemaType]] = set() n_started, n_files = 0, len(files) files_iterator = iter(files) @@ -247,7 +247,7 @@ async def _infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]: return base_schema - async def _infer_file_schema(self, file: RemoteFile) -> Dict[str, Any]: + async def _infer_file_schema(self, file: RemoteFile) -> SchemaType: try: return await self.get_parser(self.config.file_type).infer_schema(self.config, file, self._stream_reader, self.logger) except Exception as exc: diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py index edd39964124c..19133f4c014c 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py @@ -2,8 +2,9 @@ # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
# -import pytest as pytest from typing import Type + +import pytest as pytest from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index 24f287054c64..0cfb837a63a1 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -115,7 +115,6 @@ def test_given_user_schema_defined_when_infer_schema_then_return_user_schema(sel self._config.input_schema = {self._HEADER_NAME: {"type": "potato"}} self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "potato") - def test_given_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None: self._config_format.infer_datatypes = True self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "boolean") diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py index c036140d9715..db17894d4273 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py @@ -269,7 +269,7 @@ [ {"data": {"col1": "val_aa1_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, {"data": {"col1": "val_aa1_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, - #{"data": {"col1": "val_aa2_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform + # {"data": {"col1": "val_aa2_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform {"data": {"col1": "val_aa2_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, {"data": {"col1": "val_aa3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted @@ -434,7 +434,7 @@ .set_expected_records([ {"data": {"col1": "val_a_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, {"data": {"col1": "val_a_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - # No records past that because the first record for the second file did not conform to the schema + # No records past that because the first record for the second file did not conform to the schema ]) .set_expected_logs({ "read": [ From 57b011f4869be34091e29b9f55187da8a50e3c88 Mon Sep 17 00:00:00 2001 From: 
maxi297 Date: Tue, 8 Aug 2023 09:40:19 -0400 Subject: [PATCH 08/27] [ISSUE #28893] fix test --- .../unit_tests/sources/file_based/scenarios/csv_scenarios.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py index 1517abdf6885..6bc3b9bb6e97 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -258,7 +258,7 @@ }, "infer_datatypes_legacy": { "default": False, - "description": "Whether to autogenerate the schema based the file content. This inferrence does not support list and objects", + "description": "Whether to autogenerate the schema based the file content. This inference does not support list and objects.", "title": "Infer Datatypes (legacy)", "type": "boolean" }, From 71cdca9a17cb9268d8ffe1f4e332b65264471ba1 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Wed, 9 Aug 2023 09:19:32 -0400 Subject: [PATCH 09/27] [ISSUE #28893] code review part #1 --- .../sources/file_based/file_types/csv_parser.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 20b2aed0347c..233d63440881 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -55,7 +55,6 @@ def read_data( with stream_reader.open_file(file, file_read_mode, logger) as fp: headers = self._get_headers(fp, config_format, dialect_name) - fp.seek(0) # we assume that if we autogenerate columns, it is because we don't have headers # if a user wants to autogenerate_column_names with a CSV having headers, he can skip rows rows_to_skip = ( @@ -78,24 +77,28 @@ def read_data( csv.unregister_dialect(dialect_name) def _get_headers(self, fp: IOBase, config_format: CsvFormat, dialect_name: str) -> List[str]: + """ + Assumes the fp is pointing to the beginning of the file and will reset it as such + """ # Note that this method assumes the dialect has already been registered if we're parsing the headers self._skip_rows(fp, config_format.skip_rows_before_header) if config_format.autogenerate_column_names: - return self._auto_generate_headers(fp, config_format, dialect_name) + headers = self._auto_generate_headers(fp, dialect_name) else: # Then read the header reader = csv.reader(fp, dialect=dialect_name) # type: ignore - return list(next(reader)) + headers = list(next(reader)) + fp.seek(0) + return headers + def _auto_generate_headers(self, fp: IOBase, dialect_name: str) -> List[str]: """ Generates field names as [f0, f1, ...] in the same way as pyarrow's csv reader with autogenerate_column_names=True.
See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html """ reader = csv.reader(fp, dialect=dialect_name) # type: ignore number_of_columns = len(next(reader)) # type: ignore - # Reset the file pointer to the beginning of the file so that the first row is not skipped - fp.seek(0) return [f"f{i}" for i in range(number_of_columns)] @staticmethod @@ -198,7 +201,7 @@ def _cast_types( Array and object types are only handled if they can be deserialized as JSON. - If any errors are encountered, the value will be emitted as a string._to_nullable + If any errors are encountered, the value will be emitted as a string. """ warnings = [] result = {} From f651d03879fe9d711143f16f9dee2457a0c1f539 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Wed, 9 Aug 2023 10:44:13 -0400 Subject: [PATCH 10/27] [ISSUE #28893] code review part #2 --- .../sources/file_based/config/csv_format.py | 23 +++------ .../file_based/file_types/csv_parser.py | 25 ++++++---- .../file_based/config/test_csv_format.py | 17 ------- .../file_based/file_types/test_csv_parser.py | 49 +++++++++---------- 4 files changed, 48 insertions(+), 66 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py index 77d3fc2d7674..0a5261f54383 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py @@ -17,6 +17,12 @@ class QuotingBehavior(Enum): QUOTE_NONE = "Quote None" +class InferenceType(Enum): + NONE = "None" + PRIMITIVE_TYPES_ONLY = "Primitive Types Only" + PRIMITIVE_AND_COMPLEX_TYPES = "Primitive and Complex Types" + + DEFAULT_TRUE_VALUES = ["y", "yes", "t", "true", "on", "1"] DEFAULT_FALSE_VALUES = ["n", "no", "f", "false", "off", "0"] @@ -81,16 +87,11 @@ class Config: default=DEFAULT_FALSE_VALUES, description="A set of case-sensitive strings that should be interpreted as false values.", ) - infer_datatypes: bool = Field( + inference_type: InferenceType = Field( title="Infer Datatypes", - default=False, + default=InferenceType.NONE, description="Whether to autogenerate the schema based the file content.", ) - infer_datatypes_legacy: bool = Field( - title="Infer Datatypes (legacy)", - default=False, - description="Whether to autogenerate the schema based the file content. 
This inference does not support list and objects.", - ) @validator("delimiter") def validate_delimiter(cls, v: str) -> str: @@ -127,11 +128,3 @@ def validate_option_combinations(cls, values: Mapping[str, Any]) -> Mapping[str, if skip_rows_before_header > 0 and auto_generate_column_names: raise ValueError("Cannot skip rows before header and autogenerate column names at the same time.") return values - - @root_validator - def validate_option_inference(cls, values: Mapping[str, Any]) -> Mapping[str, Any]: - infer_datatypes = values.get("infer_datatypes", False) - infer_datatypes_legacy = values.get("infer_datatypes_legacy", False) - if infer_datatypes and infer_datatypes_legacy: - raise ValueError("Only one way to infer can be configured but both infer_datatypes and infer_datatypes_legacy are enabled.") - return values diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index d0d5adef7bef..fedf207c84a4 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -11,7 +11,7 @@ from io import IOBase from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Set -from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, QuotingBehavior +from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, InferenceType, QuotingBehavior from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode @@ -38,7 +38,7 @@ def read_data( logger: logging.Logger, file_read_mode: FileReadMode, ) -> Generator[Dict[str, Any], None, None]: - config_format = config.format or CsvFormat() + config_format = _extract_format(config) # Formats are configured individually per-stream so a unique dialect should be registered for each stream. # We don't unregister the dialect because we are lazily parsing each csv file to generate records @@ -123,21 +123,21 @@ async def infer_schema( stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, ) -> SchemaType: - if config.input_schema: - # FIXME change type of method to Mapping - return config.input_schema # type: ignore # conversion to mapping is handled by pydantic and we shouldn't have a str here + input_schema = config.get_input_schema() + if input_schema: + return input_schema # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual # sources will likely require one. 
Rather than modify the interface now we can wait until the real use case - config_format = config.format or CsvFormat() + config_format = _extract_format(config) type_inferrer_by_field: Dict[str, _TypeInferrer] = defaultdict( lambda: _JsonTypeInferrer( config_format.true_values, config_format.false_values, config_format.null_values, - config_format.infer_datatypes and not config_format.infer_datatypes_legacy, + config_format.inference_type == InferenceType.PRIMITIVE_AND_COMPLEX_TYPES ) - if config_format.infer_datatypes or config_format.infer_datatypes_legacy + if config_format.inference_type != InferenceType.NONE else _DisabledTypeInferrer() ) data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode) @@ -164,7 +164,7 @@ def parse_records( logger: logging.Logger, discovered_schema: Optional[SchemaType], ) -> Iterable[Dict[str, Any]]: - config_format = config.format or CsvFormat() + config_format = _extract_format(config) cast_fn = CsvParser._get_cast_function(discovered_schema, config_format, logger) data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode) for row in data_generator: @@ -411,3 +411,10 @@ def _format_warning(key: str, value: str, expected_type: Optional[Any]) -> str: def _no_cast(row: Mapping[str, str]) -> Mapping[str, str]: return row + + +def _extract_format(config: FileBasedStreamConfig) -> CsvFormat: + config_format = config.format or CsvFormat() + if not isinstance(config_format, CsvFormat): + raise ValueError(f"Invalid format config: {config_format}") + return config_format diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py index 19133f4c014c..de85b9e3ebae 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py @@ -23,20 +23,3 @@ def test_csv_format_skip_rows_and_autogenerate_column_names(skip_rows_before_hea CsvFormat(skip_rows_before_header=skip_rows_before_header, autogenerate_column_names=autogenerate_column_names) else: CsvFormat(skip_rows_before_header=skip_rows_before_header, autogenerate_column_names=autogenerate_column_names) - - -@pytest.mark.parametrize( - "infer_datatypes, infer_datatypes_legacy, expected_error", - [ - pytest.param(True, True, ValueError, id="test_many_inferences_configured"), - pytest.param(True, False, None, id="test_infer_datatypes"), - pytest.param(False, True, None, id="test_infer_datatypes_legacy"), - pytest.param(False, False, None, id="test_no_inference"), - ] -) -def test_csv_format_inference(infer_datatypes: bool, infer_datatypes_legacy: bool, expected_error: Type[BaseException]) -> None: - if expected_error: - with pytest.raises(expected_error): - CsvFormat(infer_datatypes=infer_datatypes, infer_datatypes_legacy=infer_datatypes_legacy) - else: - CsvFormat(infer_datatypes=infer_datatypes, infer_datatypes_legacy=infer_datatypes_legacy) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index 5421c31bd086..c500bb3b264b 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -12,7 +12,7 @@ from unittest.mock import Mock import pytest -from 
airbyte_cdk.sources.file_based.config.csv_format import DEFAULT_FALSE_VALUES, DEFAULT_TRUE_VALUES, CsvFormat +from airbyte_cdk.sources.file_based.config.csv_format import DEFAULT_FALSE_VALUES, DEFAULT_TRUE_VALUES, CsvFormat, InferenceType from airbyte_cdk.sources.file_based.exceptions import RecordParseError from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode from airbyte_cdk.sources.file_based.file_types.csv_parser import CsvParser, _CsvReader @@ -99,10 +99,9 @@ def setUp(self) -> None: self._config_format.true_values = _DEFAULT_TRUE_VALUES self._config_format.false_values = _DEFAULT_FALSE_VALUES self._config_format.null_values = {self._A_NULL_VALUE} - self._config_format.infer_datatypes = False - self._config_format.infer_datatypes_legacy = False + self._config_format.inference_type = InferenceType.NONE self._config = Mock() - self._config.input_schema = None + self._config.get_input_schema.return_value = None self._config.format = self._config_format self._file = Mock(spec=RemoteFile) @@ -112,71 +111,71 @@ def setUp(self) -> None: self._parser = CsvParser(self._csv_reader) def test_given_user_schema_defined_when_infer_schema_then_return_user_schema(self) -> None: - self._config.input_schema = {self._HEADER_NAME: {"type": "potato"}} + self._config.get_input_schema.return_value = {self._HEADER_NAME: {"type": "potato"}} self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "potato") def test_given_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "boolean") - def test_given_legacy_and_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None: - self._config_format.infer_datatypes_legacy = True + def test_given_primitive_only_and_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "boolean") def test_given_integers_only_when_infer_schema_then_type_is_integer(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(["2", "90329", "5645"], "integer") - def test_given_legacy_and_integers_only_when_infer_schema_then_type_is_integer(self) -> None: - self._config_format.infer_datatypes_legacy = True + def test_given_primitive_only_and_integers_only_when_infer_schema_then_type_is_integer(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema(["2", "90329", "5645"], "integer") def test_given_numbers_and_integers_when_infer_schema_then_type_is_number(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(["2", "90329", "2.312"], "number") - def test_given_legacy_and_numbers_and_integers_when_infer_schema_then_type_is_number(self) -> None: - self._config_format.infer_datatypes_legacy = True + def test_given_primitive_only_and_numbers_and_integers_when_infer_schema_then_type_is_number(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema(["2", "90329", "2.312"], "number") def 
test_given_arrays_only_when_infer_schema_then_type_is_array(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], "array") def test_given_arrays_only_when_infer_schema_then_type_is_string(self) -> None: - self._config_format.infer_datatypes_legacy = True + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], "string") def test_given_objects_only_when_infer_schema_then_type_is_object(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "object") - def test_given_legacy_and_objects_only_when_infer_schema_then_type_is_string(self) -> None: - self._config_format.infer_datatypes_legacy = True + def test_given_primitive_only_and_objects_only_when_infer_schema_then_type_is_string(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "string") def test_given_arrays_and_objects_only_when_infer_schema_then_type_is_object(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(['["first_item", "second_item"]', '{"an_object_key": "an_object_value"}'], "object") def test_given_strings_and_objects_only_when_infer_schema_then_type_is_object(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(['["first_item", "second_item"]', "this is a string"], "object") def test_given_strings_only_when_infer_schema_then_type_is_string(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(["a string", "another string"], "string") def test_given_a_null_value_when_infer_then_ignore_null(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(["2", "90329", "5645", self._A_NULL_VALUE], "integer") def test_given_only_null_values_when_infer_then_type_is_string(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema([self._A_NULL_VALUE, self._A_NULL_VALUE, self._A_NULL_VALUE], "string") def test_given_big_file_when_infer_schema_then_stop_early(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._csv_reader.read_data.return_value = ({self._HEADER_NAME: row} for row in ["2." 
+ "2" * 1_000_000] + ["this is a string"]) inferred_schema = self._infer_schema() # since the type is number, we know the string at the end was not considered From a573a898095a2f123a3c39e715b9b3936ea1ba69 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Wed, 9 Aug 2023 10:47:27 -0400 Subject: [PATCH 11/27] Fix test --- .../sources/file_based/config/csv_format.py | 4 ++-- .../file_based/scenarios/csv_scenarios.py | 20 +++++++++---------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py index 0a5261f54383..bc9afb4bc721 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py @@ -88,9 +88,9 @@ class Config: description="A set of case-sensitive strings that should be interpreted as false values.", ) inference_type: InferenceType = Field( - title="Infer Datatypes", + title="Inference Type", default=InferenceType.NONE, - description="Whether to autogenerate the schema based the file content.", + description="How to infer the types of the columns.", ) @validator("delimiter") diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py index 0429a38e10fd..45543775c627 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -133,17 +133,15 @@ ], "type": "string" }, - "infer_datatypes": { - "default": False, - "description": "Whether to autogenerate the schema based the file content.", - "title": "Infer Datatypes", - "type": "boolean" - }, - "infer_datatypes_legacy": { - "default": False, - "description": "Whether to autogenerate the schema based the file content. 
This inference does not support list and objects.", - "title": "Infer Datatypes (legacy)", - "type": "boolean" + "inference_type": { + "default": "None", + "description": "How to infer the types of the columns.", + "title": "Inference Type", + "enum": [ + "None", + "Primitive Types Only", + "Primitive and Complex Types", + ] }, "delimiter": { "title": "Delimiter", From 0ce37e58ab7de607ff3d4de0ddb6d19e0a7703f8 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Wed, 9 Aug 2023 10:48:20 -0400 Subject: [PATCH 12/27] formatcdk --- .../airbyte_cdk/sources/file_based/file_types/csv_parser.py | 2 +- .../unit_tests/sources/file_based/config/test_csv_format.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index fedf207c84a4..9d6ce17420d3 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -135,7 +135,7 @@ async def infer_schema( config_format.true_values, config_format.false_values, config_format.null_values, - config_format.inference_type == InferenceType.PRIMITIVE_AND_COMPLEX_TYPES + config_format.inference_type == InferenceType.PRIMITIVE_AND_COMPLEX_TYPES, ) if config_format.inference_type != InferenceType.NONE else _DisabledTypeInferrer() diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py index de85b9e3ebae..e04c42b6858c 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py @@ -2,8 +2,6 @@ # Copyright (c) 2023 Airbyte, Inc., all rights reserved. # -from typing import Type - import pytest as pytest from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat From ddd7f49107d68c720affa002d297809c88046441 Mon Sep 17 00:00:00 2001 From: Alexandre Girard Date: Wed, 9 Aug 2023 10:46:24 -0700 Subject: [PATCH 13/27] first pass --- .../sources/file_based/config/csv_format.py | 5 ++ .../file_based/file_types/csv_parser.py | 45 ++++++++++------ .../file_based/file_types/test_csv_parser.py | 20 ++++++- .../file_based/scenarios/csv_scenarios.py | 54 ++++++++++--------- 4 files changed, 83 insertions(+), 41 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py index bc9afb4bc721..3c9f255c0b7f 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py @@ -64,6 +64,11 @@ class Config: default=[], description="A set of case-sensitive strings that should be interpreted as null values. For example, if the value 'NA' should be interpreted as null, enter 'NA' in this field.", ) + strings_can_be_null: bool = Field( + title="Strings Can Be Null", + default=True, + description="Whether strings can be interpreted as null values. If true, strings that match the null_values set will be interpreted as null. 
If false, strings that match the null_values set will be interpreted as the string itself.", + ) skip_rows_before_header: int = Field( title="Skip Rows Before Header", default=0, diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 9d6ce17420d3..8d635e036e3e 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -165,10 +165,15 @@ def parse_records( discovered_schema: Optional[SchemaType], ) -> Iterable[Dict[str, Any]]: config_format = _extract_format(config) - cast_fn = CsvParser._get_cast_function(discovered_schema, config_format, logger) + if discovered_schema: + property_types = {col: prop["type"] for col, prop in discovered_schema["properties"].items()} + CsvParser._pre_propcess_property_types(property_types) + else: + property_types = None + cast_fn = CsvParser._get_cast_function(property_types, config_format, logger) data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode) for row in data_generator: - yield CsvParser._to_nullable(cast_fn(row), config_format.null_values) + yield CsvParser._to_nullable(cast_fn(row), property_types, config_format.null_values, config_format.strings_can_be_null) data_generator.close() @property @@ -177,24 +182,39 @@ def file_read_mode(self) -> FileReadMode: @staticmethod def _get_cast_function( - schema: Optional[Mapping[str, Any]], config_format: CsvFormat, logger: logging.Logger + deduped_property_types: Optional[Mapping[str, Any]], config_format: CsvFormat, logger: logging.Logger ) -> Callable[[Mapping[str, str]], Mapping[str, str]]: # Only cast values if the schema is provided - if schema: - property_types = {col: prop["type"] for col, prop in schema["properties"].items()} - return partial(CsvParser._cast_types, property_types=property_types, config_format=config_format, logger=logger) + if deduped_property_types: + return partial(CsvParser._cast_types, property_types=deduped_property_types, config_format=config_format, logger=logger) else: # If no schema is provided, yield the rows as they are return _no_cast @staticmethod - def _to_nullable(row: Mapping[str, str], null_values: Set[str]) -> Dict[str, Optional[str]]: - nullable = row | {k: None if v in null_values else v for k, v in row.items()} + def _to_nullable(row: Mapping[str, str], property_types: Mapping[str, str], null_values: Set[str], strings_can_be_null: bool) -> Dict[str, Optional[str]]: + if not property_types: + property_types = {} + nullable = row | {k: None if CsvParser._value_is_none(v, property_types.get(k), null_values, strings_can_be_null) else v for k, v in row.items()} return nullable + @staticmethod + def _value_is_none(value: Any, property_type: Optional[str], null_values: Set[str], strings_can_be_null: bool) -> bool: + return value in null_values and (strings_can_be_null or property_type != "string") + + @staticmethod + def _pre_propcess_property_types(property_types: Dict[str, Any]): + for prop, prop_type in property_types.items(): + if isinstance(prop_type, list): + prop_type_distinct = set(prop_type) + prop_type_distinct.remove("null") + if len(prop_type_distinct) != 1: + raise ValueError(f"Could not get non nullable type from {prop_type}") + property_types[prop] = next(iter(prop_type_distinct)) + @staticmethod def _cast_types( - row: Dict[str, str], property_types: Dict[str, Any], config_format: 
CsvFormat, logger: logging.Logger + row: Dict[str, str], property_types: Dict[str, str], config_format: CsvFormat, logger: logging.Logger ) -> Dict[str, Any]: """ Casts the values in the input 'row' dictionary according to the types defined in the JSON schema. @@ -210,13 +230,6 @@ def _cast_types( prop_type = property_types.get(key) cast_value: Any = value - if isinstance(prop_type, list): - prop_type_distinct = set(prop_type) - prop_type_distinct.remove("null") - if len(prop_type_distinct) != 1: - raise ValueError(f"Could not get non nullable type from {prop_type}") - (prop_type,) = prop_type_distinct - if prop_type in TYPE_PYTHON_MAPPING and prop_type is not None: _, python_type = TYPE_PYTHON_MAPPING[prop_type] diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index c500bb3b264b..a2d57d44bc1b 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -28,7 +28,7 @@ "col7": "array", "col8": "array", "col9": "array", - "col10": ["null", "string"], + "col10": "string", } logger = logging.getLogger() @@ -86,6 +86,24 @@ def test_cast_to_python_type(row: Dict[str, str], true_values: Set[str], false_v assert CsvParser._cast_types(row, PROPERTY_TYPES, csv_format, logger) == expected_output +@pytest.mark.parametrize( + "row, strings_can_be_null, expected_output", [ + pytest.param({"id": "1", "name": "bob", "age": 10, "is_cool": False}, False, {"id": "1", "name": "bob", "age": 10, "is_cool": False}, id="test-no-values-are-null"), + pytest.param({"id": "1", "name": "bob", "age": "null", "is_cool": "null"}, False, {"id": "1", "name": "bob", "age": None, "is_cool": None}, id="test-non-string-values-are-none-if-in-null-values"), + pytest.param({"id": "1", "name": "null", "age": 10, "is_cool": False}, False, {"id": "1", "name": "null", "age": 10, "is_cool": False}, id="test-string-values-are-not-none-if-strings-cannot-be-null"), + pytest.param({"id": "1", "name": "null", "age": 10, "is_cool": False}, True, {"id": "1", "name": None, "age": 10, "is_cool": False}, id="test-string-values-none-if-strings-can-be-null"), + ] +) +def test_to_nullable(row, strings_can_be_null, expected_output): + property_types = {"id": "string", + "name": "string", + "age": "integer", + "is_cool": "boolean"} + null_values = {"null"} + nulled_row = CsvParser._to_nullable(row, property_types, null_values, strings_can_be_null) + assert nulled_row == expected_output + + _DEFAULT_TRUE_VALUES = {"yes", "yeah", "right"} _DEFAULT_FALSE_VALUES = {"no", "nop", "wrong"} diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py index 45543775c627..fb3ee8310aa6 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -43,6 +43,18 @@ "description": "Used during spec; allows the developer to configure the cloud provider specific options\nthat are needed when users configure a file-based source.", "type": "object", "properties": { + "start_date": { + "title": "Start Date", + "description": "UTC date and time in the format 2017-01-25T00:00:00.000000Z. 
Any file modified before this date will not be replicated.", + "examples": [ + "2021-01-01T00:00:00.000000Z" + ], + "format": "date-time", + "pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{6}Z$", + "pattern_descriptor": "YYYY-MM-DDTHH:mm:ss.SSSSSSZ", + "order": 1, + "type": "string" + }, "streams": { "title": "The list of streams to sync", "description": "Each instance of this configuration defines a stream. Use this to define which files belong in the stream, their format, and how they should be parsed and validated. When sending data to warehouse destination such as Snowflake or BigQuery, each stream is a separate table.", @@ -77,8 +89,8 @@ "enum": [ "Emit Record", "Skip Record", - "Wait for Discover", - ], + "Wait for Discover" + ] }, "input_schema": { "title": "Input Schema", @@ -133,16 +145,6 @@ ], "type": "string" }, - "inference_type": { - "default": "None", - "description": "How to infer the types of the columns.", - "title": "Inference Type", - "enum": [ - "None", - "Primitive Types Only", - "Primitive and Complex Types", - ] - }, "delimiter": { "title": "Delimiter", "description": "The character delimiting individual cells in the CSV data. This may only be a 1-character string. For tab-delimited data enter '\\t'.", @@ -193,6 +195,12 @@ }, "uniqueItems": True }, + "strings_can_be_null": { + "title": "Strings Can Be Null", + "description": "Whether strings can be interpreted as null values. If true, strings that match the null_values set will be interpreted as null. If false, strings that match the null_values set will be interpreted as the string itself.", + "default": True, + "type": "boolean" + }, "skip_rows_before_header": { "title": "Skip Rows Before Header", "description": "The number of rows to skip before the header row. For example, if the header row is on the 3rd row, enter 2 in this field.", @@ -245,6 +253,16 @@ }, "uniqueItems": True }, + "inference_type": { + "title": "Inference Type", + "description": "How to infer the types of the columns.", + "default": "None", + "enum": [ + "None", + "Primitive Types Only", + "Primitive and Complex Types" + ] + } } }, { @@ -295,18 +313,6 @@ "file_type" ] } - }, - "start_date": { - "title": "Start Date", - "description": "UTC date and time in the format 2017-01-25T00:00:00.000000Z. 
Any file modified before this date will not be replicated.", - "examples": [ - "2021-01-01T00:00:00.000000Z" - ], - "format": "date-time", - "pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{6}Z$", - "pattern_descriptor": "YYYY-MM-DDTHH:mm:ss.SSSSSSZ", - "order": 1, - "type": "string" } }, "required": [ From 82db6c3f435e67c04d294de69177078df4a69cf2 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Wed, 9 Aug 2023 15:04:52 -0400 Subject: [PATCH 14/27] [ISSUE #28893] code review --- .../airbyte_cdk/sources/file_based/config/csv_format.py | 2 +- .../airbyte_cdk/sources/file_based/file_types/avro_parser.py | 2 +- .../airbyte_cdk/sources/file_based/file_types/csv_parser.py | 4 ++-- .../sources/file_based/file_types/file_type_parser.py | 4 ++-- .../airbyte_cdk/sources/file_based/file_types/jsonl_parser.py | 2 +- .../sources/file_based/file_types/parquet_parser.py | 2 +- .../unit_tests/sources/file_based/scenarios/csv_scenarios.py | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py index bc9afb4bc721..c7d016288da5 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py @@ -90,7 +90,7 @@ class Config: inference_type: InferenceType = Field( title="Inference Type", default=InferenceType.NONE, - description="How to infer the types of the columns.", + description="How to infer the types of the columns. If none, inference default to strings.", ) @validator("delimiter") diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py index d9a351ce8fdf..ff168355c54d 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py @@ -131,7 +131,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[SchemaType], + discovered_schema: Optional[Mapping[str, SchemaType]], ) -> Iterable[Dict[str, Any]]: avro_format = config.format or AvroFormat() if not isinstance(avro_format, AvroFormat): diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 9d6ce17420d3..42f2a911767c 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -162,7 +162,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[SchemaType], + discovered_schema: Optional[Mapping[str, SchemaType]], ) -> Iterable[Dict[str, Any]]: config_format = _extract_format(config) cast_fn = CsvParser._get_cast_function(discovered_schema, config_format, logger) @@ -177,7 +177,7 @@ def file_read_mode(self) -> FileReadMode: @staticmethod def _get_cast_function( - schema: Optional[Mapping[str, Any]], config_format: CsvFormat, logger: logging.Logger + schema: Optional[Mapping[str, SchemaType]], config_format: CsvFormat, logger: logging.Logger ) -> Callable[[Mapping[str, str]], Mapping[str, str]]: # Only cast values if the schema is provided if schema: diff --git 
a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py index 9591612bb912..0e52fb5e04df 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py @@ -4,7 +4,7 @@ import logging from abc import ABC, abstractmethod -from typing import Any, Dict, Iterable, Optional +from typing import Any, Dict, Iterable, Mapping, Optional from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode @@ -40,7 +40,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[SchemaType], + discovered_schema: Optional[Mapping[str, SchemaType]], ) -> Iterable[Record]: """ Parse and emit each record. diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py index 19067064cc28..609e071a3868 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py @@ -52,7 +52,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[SchemaType], + discovered_schema: Optional[Mapping[str, SchemaType]], ) -> Iterable[Dict[str, Any]]: with stream_reader.open_file(file, self.file_read_mode, logger) as fp: for line in fp: diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py index 465758551786..fa771e446d9b 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py @@ -48,7 +48,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[SchemaType], + discovered_schema: Optional[Mapping[str, SchemaType]], ) -> Iterable[Dict[str, Any]]: parquet_format = config.format or ParquetFormat() if not isinstance(parquet_format, ParquetFormat): diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py index 45543775c627..0991e083b596 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -135,7 +135,7 @@ }, "inference_type": { "default": "None", - "description": "How to infer the types of the columns.", + "description": "How to infer the types of the columns. 
If none, inference default to strings.", "title": "Inference Type", "enum": [ "None", From fc6028e65b8382a547d53f1ac106308be8e95edc Mon Sep 17 00:00:00 2001 From: Alexandre Girard Date: Wed, 9 Aug 2023 12:13:50 -0700 Subject: [PATCH 15/27] fix mypy issues --- .../sources/file_based/file_types/csv_parser.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 8d635e036e3e..4c19c0b8fa26 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -166,10 +166,10 @@ def parse_records( ) -> Iterable[Dict[str, Any]]: config_format = _extract_format(config) if discovered_schema: - property_types = {col: prop["type"] for col, prop in discovered_schema["properties"].items()} + property_types = {col: prop["type"] for col, prop in discovered_schema["properties"].items()} # type: ignore # discovered_schema["properties"] is known to be a mapping CsvParser._pre_propcess_property_types(property_types) else: - property_types = None + property_types = {} cast_fn = CsvParser._get_cast_function(property_types, config_format, logger) data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode) for row in data_generator: @@ -192,10 +192,14 @@ def _get_cast_function( return _no_cast @staticmethod - def _to_nullable(row: Mapping[str, str], property_types: Mapping[str, str], null_values: Set[str], strings_can_be_null: bool) -> Dict[str, Optional[str]]: + def _to_nullable( + row: Mapping[str, str], property_types: Mapping[str, str], null_values: Set[str], strings_can_be_null: bool + ) -> Dict[str, Optional[str]]: if not property_types: property_types = {} - nullable = row | {k: None if CsvParser._value_is_none(v, property_types.get(k), null_values, strings_can_be_null) else v for k, v in row.items()} + nullable = row | { + k: None if CsvParser._value_is_none(v, property_types.get(k), null_values, strings_can_be_null) else v for k, v in row.items() + } return nullable @staticmethod @@ -203,7 +207,7 @@ def _value_is_none(value: Any, property_type: Optional[str], null_values: Set[st return value in null_values and (strings_can_be_null or property_type != "string") @staticmethod - def _pre_propcess_property_types(property_types: Dict[str, Any]): + def _pre_propcess_property_types(property_types: Dict[str, Any]) -> None: for prop, prop_type in property_types.items(): if isinstance(prop_type, list): prop_type_distinct = set(prop_type) From c063d9a52b1e837c0696348eca8675e1ffaa81e3 Mon Sep 17 00:00:00 2001 From: Alexandre Girard Date: Wed, 9 Aug 2023 12:19:19 -0700 Subject: [PATCH 16/27] comment --- .../sources/file_based/file_types/csv_parser.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 4c19c0b8fa26..81789fb753a6 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -208,6 +208,22 @@ def _value_is_none(value: Any, property_type: Optional[str], null_values: Set[st @staticmethod def _pre_propcess_property_types(property_types: Dict[str, Any]) -> None: + """ + Transform the property types to be 
non-nullable and remove duplicate types if any. + Sample input: + { + "col1": ["string", "null"], + "col2": ["string", "string", "null"], + "col3": "integer" + } + + Sample output: + { + "col1": "string", + "col2": "string", + "col3": "integer", + } + """ for prop, prop_type in property_types.items(): if isinstance(prop_type, list): prop_type_distinct = set(prop_type) From 37992a2eb4e1fab606d941f8b541664eaa47813d Mon Sep 17 00:00:00 2001 From: Alexandre Girard Date: Wed, 9 Aug 2023 12:22:38 -0700 Subject: [PATCH 17/27] rename for clarity --- .../sources/file_based/file_types/csv_parser.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 81789fb753a6..d4c001985698 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -186,25 +186,24 @@ def _get_cast_function( ) -> Callable[[Mapping[str, str]], Mapping[str, str]]: # Only cast values if the schema is provided if deduped_property_types: - return partial(CsvParser._cast_types, property_types=deduped_property_types, config_format=config_format, logger=logger) + return partial(CsvParser._cast_types, deduped_property_types=deduped_property_types, config_format=config_format, logger=logger) else: # If no schema is provided, yield the rows as they are return _no_cast @staticmethod def _to_nullable( - row: Mapping[str, str], property_types: Mapping[str, str], null_values: Set[str], strings_can_be_null: bool + row: Mapping[str, str], deduped_property_types: Mapping[str, str], null_values: Set[str], strings_can_be_null: bool ) -> Dict[str, Optional[str]]: - if not property_types: - property_types = {} nullable = row | { - k: None if CsvParser._value_is_none(v, property_types.get(k), null_values, strings_can_be_null) else v for k, v in row.items() + k: None if CsvParser._value_is_none(v, deduped_property_types.get(k), null_values, strings_can_be_null) else v + for k, v in row.items() } return nullable @staticmethod - def _value_is_none(value: Any, property_type: Optional[str], null_values: Set[str], strings_can_be_null: bool) -> bool: - return value in null_values and (strings_can_be_null or property_type != "string") + def _value_is_none(value: Any, deduped_property_type: Optional[str], null_values: Set[str], strings_can_be_null: bool) -> bool: + return value in null_values and (strings_can_be_null or deduped_property_type != "string") @staticmethod def _pre_propcess_property_types(property_types: Dict[str, Any]) -> None: @@ -234,7 +233,7 @@ def _pre_propcess_property_types(property_types: Dict[str, Any]) -> None: @staticmethod def _cast_types( - row: Dict[str, str], property_types: Dict[str, str], config_format: CsvFormat, logger: logging.Logger + row: Dict[str, str], deduped_property_types: Dict[str, str], config_format: CsvFormat, logger: logging.Logger ) -> Dict[str, Any]: """ Casts the values in the input 'row' dictionary according to the types defined in the JSON schema. 
@@ -247,7 +246,7 @@ def _cast_types( result = {} for key, value in row.items(): - prop_type = property_types.get(key) + prop_type = deduped_property_types.get(key) cast_value: Any = value if prop_type in TYPE_PYTHON_MAPPING and prop_type is not None: From 46d57e91ebce3fc3961f2b91a90f62a72f423581 Mon Sep 17 00:00:00 2001 From: Alexandre Girard Date: Wed, 9 Aug 2023 12:30:10 -0700 Subject: [PATCH 18/27] Add a scenario test case --- .../file_based/scenarios/csv_scenarios.py | 71 +++++++++++++++++++ .../sources/file_based/test_scenarios.py | 2 + 2 files changed, 73 insertions(+) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py index fb3ee8310aa6..1f16b275bf05 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -1587,6 +1587,77 @@ ) ).build() +csv_string_are_not_null_if_strings_can_be_null_is_false_scenario = ( + TestScenarioBuilder() + .set_name("csv_string_are_not_null_if_strings_can_be_null_is_false") + .set_config( + { + "streams": [ + { + "name": "stream1", + "file_type": "csv", + "globs": ["*"], + "validation_policy": "Emit Record", + "input_schema": '{"col1": "string", "col2": "string"}', + "format": { + "filetype": "csv", + "null_values": ["null"], + "strings_can_be_null": False, + } + } + ], + "start_date": "2023-06-04T03:54:07.000000Z" + } + ) + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("2", "null"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + } + } + ) + .set_file_type("csv") + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": { + "type": "string" + }, + "col2": { + "type": "string" + }, + "_ab_source_file_last_modified": { + "type": "string" + }, + "_ab_source_file_url": { + "type": "string" + }, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + } + ] + } + ) + .set_expected_records( + [ + {"data": {"col1": "2", "col2": "null", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + ] + ) +).build() + csv_string_not_null_if_no_null_values_scenario = ( TestScenarioBuilder() .set_name("csv_string_not_null_if_no_null_values") diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py index 24fdfaff5cad..a3a6cdfab26a 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py @@ -63,6 +63,7 @@ schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario, schemaless_with_user_input_schema_fails_connection_check_scenario, single_csv_scenario, + csv_string_are_not_null_if_strings_can_be_null_is_false_scenario, ) from unit_tests.sources.file_based.scenarios.incremental_scenarios import ( multi_csv_different_timestamps_scenario, @@ -175,6 +176,7 @@ schemaless_jsonl_scenario, schemaless_jsonl_multi_stream_scenario, csv_string_can_be_null_with_input_schemas_scenario, + csv_string_are_not_null_if_strings_can_be_null_is_false_scenario, csv_string_not_null_if_no_null_values_scenario, csv_strings_can_be_null_not_quoted_scenario, 
csv_newline_in_values_quoted_value_scenario, From af1616098492d2b3aea85723eb0201cf10923a4f Mon Sep 17 00:00:00 2001 From: Alexandre Girard Date: Wed, 9 Aug 2023 12:32:23 -0700 Subject: [PATCH 19/27] this isn't optional anymore --- .../airbyte_cdk/sources/file_based/file_types/csv_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index d4c001985698..d741a30a3b37 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -182,7 +182,7 @@ def file_read_mode(self) -> FileReadMode: @staticmethod def _get_cast_function( - deduped_property_types: Optional[Mapping[str, Any]], config_format: CsvFormat, logger: logging.Logger + deduped_property_types: Mapping[str, Any], config_format: CsvFormat, logger: logging.Logger ) -> Callable[[Mapping[str, str]], Mapping[str, str]]: # Only cast values if the schema is provided if deduped_property_types: From 3027a4fcfe3adc3be40eb649f77e03c1ab528704 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Wed, 9 Aug 2023 16:19:22 -0400 Subject: [PATCH 20/27] FIX test log level --- .../scenarios/validation_policy_scenarios.py | 27 +++++-------- .../sources/file_based/test_scenarios.py | 38 ++++++++++--------- 2 files changed, 29 insertions(+), 36 deletions(-) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py index 2db77996ed79..3efb1f81d8b2 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py @@ -234,8 +234,7 @@ "message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. stream=stream1 file=c.csv line_no=2 n_skipped=0", }, { - # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol - "level": "WARNING", + "level": "WARN", "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, ] @@ -298,13 +297,11 @@ "message": "Records in file did not pass validation policy. 
stream=stream2 file=b/b2.csv n_skipped=1 validation_policy=skip_record", }, { - # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol - "level": "WARNING", + "level": "WARN", "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, { - # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol - "level": "WARNING", + "level": "WARN", "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, ] @@ -346,8 +343,7 @@ "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=c.csv line_no=2 n_skipped=0", }, { - # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol - "level": "WARNING", + "level": "WARN", "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, ] @@ -402,13 +398,11 @@ "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a/a3.csv line_no=2 n_skipped=0", }, { - # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol - "level": "WARNING", + "level": "WARN", "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, { - # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol - "level": "WARNING", + "level": "WARN", "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, ] @@ -443,8 +437,7 @@ "message": "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. stream=stream1 file=b.csv validation_policy=Wait for Discover n_skipped=0", }, { - # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol - "level": "WARNING", + "level": "WARN", "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, ] @@ -503,13 +496,11 @@ "message": "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. 
stream=stream2 file=b/b2.csv validation_policy=Wait for Discover n_skipped=0", }, { - # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol - "level": "WARNING", + "level": "WARN", "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, { - # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol - "level": "WARNING", + "level": "WARN", "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, ] diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py index 24fdfaff5cad..9a82efed038a 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py @@ -11,6 +11,7 @@ from _pytest.capture import CaptureFixture from _pytest.reports import ExceptionInfo from airbyte_cdk.entrypoint import launch +from airbyte_cdk.logger import AirbyteLogFormatter from airbyte_cdk.models import SyncMode from freezegun import freeze_time from pytest import LogCaptureFixture @@ -217,7 +218,7 @@ def test_discover(capsys: CaptureFixture[str], tmp_path: PosixPath, scenario: Te _verify_expected_logs(logs, discover_logs) -read_scenarios = discover_scenarios + [ +read_scenarios = [ emit_record_scenario_multi_stream, emit_record_scenario_single_stream, skip_record_scenario_multi_stream, @@ -230,6 +231,7 @@ def test_discover(capsys: CaptureFixture[str], tmp_path: PosixPath, scenario: Te @pytest.mark.parametrize("scenario", read_scenarios, ids=[s.name for s in read_scenarios]) @freeze_time("2023-06-09T00:00:00Z") def test_read(capsys: CaptureFixture[str], caplog: LogCaptureFixture, tmp_path: PosixPath, scenario: TestScenario) -> None: + caplog.handler.setFormatter(AirbyteLogFormatter()) if scenario.incremental_scenario_config: run_test_read_incremental(capsys, caplog, tmp_path, scenario) else: @@ -369,23 +371,23 @@ def discover(capsys: CaptureFixture[str], tmp_path: PosixPath, scenario: TestSce def read(capsys: CaptureFixture[str], caplog: LogCaptureFixture, tmp_path: PosixPath, scenario: TestScenario) -> Dict[str, Any]: - launch( - scenario.source, - [ - "read", - "--config", - make_file(tmp_path / "config.json", scenario.config), - "--catalog", - make_file(tmp_path / "catalog.json", scenario.configured_catalog(SyncMode.full_refresh)), - ], - ) - captured = capsys.readouterr().out.splitlines() - logs = caplog.records - return { - "records": [msg for msg in (json.loads(line) for line in captured) if msg["type"] == "RECORD"], - "logs": [msg["log"] for msg in (json.loads(line) for line in captured) if msg["type"] == "LOG"] - + [{"level": log.levelname, "message": log.message} for log in logs], - } + with caplog.handler.stream as logger_stream: + launch( + scenario.source, + [ + "read", + "--config", + make_file(tmp_path / "config.json", scenario.config), + "--catalog", + make_file(tmp_path / "catalog.json", scenario.configured_catalog(SyncMode.full_refresh)), + ], + ) + captured = capsys.readouterr().out.splitlines() + logger_stream.getvalue().split("\n")[:-1] + + return { + "records": [msg for msg in (json.loads(line) for line in captured) if msg["type"] == "RECORD"], + "logs": [msg["log"] for msg in (json.loads(line) for line in captured) if msg["type"] == 
"LOG"], + } def read_with_state( From ac91730a1a48747c97343f8cc24e80cf67b05dec Mon Sep 17 00:00:00 2001 From: maxi297 Date: Wed, 9 Aug 2023 16:29:23 -0400 Subject: [PATCH 21/27] Re-adding failing tests --- .../scenarios/user_input_schema_scenarios.py | 12 ++++++------ .../unit_tests/sources/file_based/test_scenarios.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py index a84f5f9acef1..33c8587a04d2 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py @@ -218,11 +218,11 @@ 'message': 'Records in file did not pass validation policy. stream=stream1 file=a.csv n_skipped=2 validation_policy=skip_record', }, { - 'level': 'WARNING', + 'level': "WARN", 'message': 'Could not cast the value to the expected type.: col1: value=val11,expected_type=integer', }, { - 'level': 'WARNING', + 'level': "WARN", 'message': 'Could not cast the value to the expected type.: col1: value=val21,expected_type=integer', }, ] @@ -553,11 +553,11 @@ .set_expected_logs({ "read": [ { - 'level': 'WARNING', + 'level': "WARN", 'message': 'Could not cast the value to the expected type.: col2: value=val12b,expected_type=integer', }, { - 'level': 'WARNING', + 'level': "WARN", 'message': 'Could not cast the value to the expected type.: col2: value=val22b,expected_type=integer', }, ] @@ -696,11 +696,11 @@ 'message': 'Records in file did not pass validation policy. stream=stream2 file=b.csv n_skipped=2 validation_policy=skip_record', }, { - 'level': 'WARNING', + 'level': "WARN", 'message': 'Could not cast the value to the expected type.: col2: value=val12b,expected_type=integer', }, { - 'level': 'WARNING', + 'level': "WARN", 'message': 'Could not cast the value to the expected type.: col2: value=val22b,expected_type=integer', }, ] diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py index 9a82efed038a..d4947b2cc020 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py @@ -218,7 +218,7 @@ def test_discover(capsys: CaptureFixture[str], tmp_path: PosixPath, scenario: Te _verify_expected_logs(logs, discover_logs) -read_scenarios = [ +read_scenarios = discover_scenarios + [ emit_record_scenario_multi_stream, emit_record_scenario_single_stream, skip_record_scenario_multi_stream, From f1a60bab04e24f54a4d5724655d048d379f60e3c Mon Sep 17 00:00:00 2001 From: maxi297 Date: Thu, 10 Aug 2023 14:16:44 -0400 Subject: [PATCH 22/27] [ISSUE #28893] improve inferrence to consider multiple types per value --- .../file_based/file_types/csv_parser.py | 51 +++++++++++-------- .../file_based/file_types/test_csv_parser.py | 8 ++- .../scenarios/validation_policy_scenarios.py | 4 +- 3 files changed, 38 insertions(+), 25 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 42f2a911767c..7129cd77c743 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -303,41 +303,50 @@ def 
add_value(self, value: Any) -> None: self._values.add(value) def infer(self) -> str: - types = {self._infer_type(value) for value in self._values} - if types == {self._NULL_TYPE}: + types_by_value = {value: self._infer_type(value) for value in self._values} + types_excluding_null_values = [types for types in types_by_value.values() if self._NULL_TYPE not in types] + if not types_excluding_null_values: # this is highly unusual but we will consider the column as a string return self._STRING_TYPE - types.discard(self._NULL_TYPE) - if types == {self._BOOLEAN_TYPE}: + types = set.intersection(*types_excluding_null_values) + if self._BOOLEAN_TYPE in types: return self._BOOLEAN_TYPE - elif types == {self._INTEGER_TYPE}: + elif self._INTEGER_TYPE in types: return self._INTEGER_TYPE - elif types == {self._NUMBER_TYPE} or types == {self._INTEGER_TYPE, self._NUMBER_TYPE}: + elif self._NUMBER_TYPE in types: return self._NUMBER_TYPE elif not self._allow_for_objects_and_arrays: return self._STRING_TYPE - elif types == {self._ARRAY_TYPE}: + elif self._ARRAY_TYPE in types: return self._ARRAY_TYPE - elif self._ARRAY_TYPE in types or self._OBJECT_TYPE in types: + elif any(self._OBJECT_TYPE in _type for _type in types_by_value.values()): return self._OBJECT_TYPE return self._STRING_TYPE - def _infer_type(self, value: str) -> str: + def _infer_type(self, value: str) -> Set[str]: + inferred_types = set() + if value in self._null_values: - return self._NULL_TYPE - elif self._is_boolean(value): - return self._BOOLEAN_TYPE - elif self._is_integer(value): - return self._INTEGER_TYPE + inferred_types.add(self._NULL_TYPE) + if self._is_boolean(value): + inferred_types.add(self._BOOLEAN_TYPE) + if self._is_integer(value): + inferred_types.add(self._INTEGER_TYPE) + inferred_types.add(self._NUMBER_TYPE) elif self._is_number(value): - return self._NUMBER_TYPE - elif self._is_array(value): - return self._ARRAY_TYPE - elif self._is_object(value): - return self._OBJECT_TYPE - else: - return self._STRING_TYPE + inferred_types.add(self._NUMBER_TYPE) + + # if it can be infered as a primitive, it can't be a complex type so we'll avoid json parsing + if not inferred_types: + if self._is_array(value): + inferred_types.add(self._ARRAY_TYPE) + inferred_types.add(self._OBJECT_TYPE) + elif self._is_object(value): + inferred_types.add(self._OBJECT_TYPE) + + inferred_types.add(self._STRING_TYPE) + return inferred_types def _is_boolean(self, value: str) -> bool: try: diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index c500bb3b264b..819e993d36c7 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -86,8 +86,8 @@ def test_cast_to_python_type(row: Dict[str, str], true_values: Set[str], false_v assert CsvParser._cast_types(row, PROPERTY_TYPES, csv_format, logger) == expected_output -_DEFAULT_TRUE_VALUES = {"yes", "yeah", "right"} -_DEFAULT_FALSE_VALUES = {"no", "nop", "wrong"} +_DEFAULT_TRUE_VALUES = {"1", "yes", "yeah", "right"} +_DEFAULT_FALSE_VALUES = {"0", "no", "nop", "wrong"} class SchemaInferenceTestCase(TestCase): @@ -126,6 +126,10 @@ def test_given_integers_only_when_infer_schema_then_type_is_integer(self) -> Non self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(["2", "90329", "5645"], "integer") + def 
test_given_integer_overlap_with_bool_value_only_when_infer_schema_then_type_is_integer(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES + self._test_infer_schema(["1", "90329", "5645"], "integer") # here, "1" is also considered a boolean + def test_given_primitive_only_and_integers_only_when_infer_schema_then_type_is_integer(self) -> None: self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema(["2", "90329", "5645"], "integer") diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py index 3efb1f81d8b2..fb2abf648e1e 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py @@ -9,7 +9,7 @@ TestScenarioBuilder() .set_files( { - "a.csv": { # The records in this file do not conform to the schema + "a.csv": { "contents": [ ("col1", "col2"), ("val_a_11", "1"), @@ -17,7 +17,7 @@ ], "last_modified": "2023-06-05T03:54:07.000Z", }, - "b.csv": { + "b.csv": { # The records in this file do not conform to the schema "contents": [ ("col1", "col2"), ("val_b_11", "this is text that will trigger validation policy"), From c9e20049ab911d2a73c17b592fa337803b48376c Mon Sep 17 00:00:00 2001 From: maxi297 Date: Thu, 10 Aug 2023 19:54:41 +0000 Subject: [PATCH 23/27] Automated Commit - Formatting Changes --- .../sources/file_based/file_types/test_csv_parser.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index 85718cf8648e..aa080d06c228 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -9,8 +9,7 @@ import unittest from datetime import datetime from typing import Any, Dict, Generator, List, Set -from unittest import TestCase -from unittest import mock +from unittest import TestCase, mock from unittest.mock import Mock import pytest From 9f3479f4fe5cb99b6408d48dbff0ea513a14b615 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Fri, 11 Aug 2023 08:52:03 -0400 Subject: [PATCH 24/27] [ISSUE #28893] remove InferenceType.PRIMITIVE_AND_COMPLEX_TYPES --- .../sources/file_based/config/csv_format.py | 1 - .../file_based/file_types/csv_parser.py | 51 +------------------ .../file_based/file_types/test_csv_parser.py | 44 +++------------- .../file_based/scenarios/csv_scenarios.py | 1 - 4 files changed, 10 insertions(+), 87 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py index fc02d18dc6fe..0d4d437f45a6 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py @@ -20,7 +20,6 @@ class QuotingBehavior(Enum): class InferenceType(Enum): NONE = "None" PRIMITIVE_TYPES_ONLY = "Primitive Types Only" - PRIMITIVE_AND_COMPLEX_TYPES = "Primitive and Complex Types" DEFAULT_TRUE_VALUES = ["y", "yes", "t", "true", "on", "1"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py 
b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 3a7562dcd323..60ceb825d9f5 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -131,12 +131,7 @@ async def infer_schema( # sources will likely require one. Rather than modify the interface now we can wait until the real use case config_format = _extract_format(config) type_inferrer_by_field: Dict[str, _TypeInferrer] = defaultdict( - lambda: _JsonTypeInferrer( - config_format.true_values, - config_format.false_values, - config_format.null_values, - config_format.inference_type == InferenceType.PRIMITIVE_AND_COMPLEX_TYPES, - ) + lambda: _JsonTypeInferrer(config_format.true_values, config_format.false_values, config_format.null_values) if config_format.inference_type != InferenceType.NONE else _DisabledTypeInferrer() ) @@ -286,17 +281,12 @@ class _JsonTypeInferrer(_TypeInferrer): _BOOLEAN_TYPE = "boolean" _INTEGER_TYPE = "integer" _NUMBER_TYPE = "number" - _ARRAY_TYPE = "array" - _OBJECT_TYPE = "object" _STRING_TYPE = "string" - def __init__( - self, boolean_trues: Set[str], boolean_falses: Set[str], null_values: Set[str], allow_for_objects_and_arrays: bool - ) -> None: + def __init__(self, boolean_trues: Set[str], boolean_falses: Set[str], null_values: Set[str]) -> None: self._boolean_trues = boolean_trues self._boolean_falses = boolean_falses self._null_values = null_values - self._allow_for_objects_and_arrays = allow_for_objects_and_arrays self._values: Set[str] = set() def add_value(self, value: Any) -> None: @@ -316,12 +306,6 @@ def infer(self) -> str: return self._INTEGER_TYPE elif self._NUMBER_TYPE in types: return self._NUMBER_TYPE - elif not self._allow_for_objects_and_arrays: - return self._STRING_TYPE - elif self._ARRAY_TYPE in types: - return self._ARRAY_TYPE - elif any(self._OBJECT_TYPE in _type for _type in types_by_value.values()): - return self._OBJECT_TYPE return self._STRING_TYPE def _infer_type(self, value: str) -> Set[str]: @@ -337,14 +321,6 @@ def _infer_type(self, value: str) -> Set[str]: elif self._is_number(value): inferred_types.add(self._NUMBER_TYPE) - # if it can be infered as a primitive, it can't be a complex type so we'll avoid json parsing - if not inferred_types: - if self._is_array(value): - inferred_types.add(self._ARRAY_TYPE) - inferred_types.add(self._OBJECT_TYPE) - elif self._is_object(value): - inferred_types.add(self._OBJECT_TYPE) - inferred_types.add(self._STRING_TYPE) return inferred_types @@ -371,22 +347,6 @@ def _is_number(value: str) -> bool: except ValueError: return False - @staticmethod - def _is_array(value: str) -> bool: - try: - _value_to_list(value) - return True - except (ValueError, json.JSONDecodeError): - return False - - @staticmethod - def _is_object(value: str) -> bool: - try: - _value_to_object(value) - return True - except (ValueError, json.JSONDecodeError): - return False - def _value_to_bool(value: str, true_values: Set[str], false_values: Set[str]) -> bool: if value in true_values: @@ -396,13 +356,6 @@ def _value_to_bool(value: str, true_values: Set[str], false_values: Set[str]) -> raise ValueError(f"Value {value} is not a valid boolean value") -def _value_to_object(value: str) -> Dict[Any, Any]: - parsed_value = json.loads(value) - if isinstance(parsed_value, dict): - return parsed_value - raise ValueError(f"Value {parsed_value} is not a valid dict value") - - def _value_to_list(value: str) -> List[Any]: parsed_value = 
json.loads(value) if isinstance(parsed_value, list): diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index aa080d06c228..64832bc8d39b 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -117,71 +117,43 @@ def test_given_user_schema_defined_when_infer_schema_then_return_user_schema(sel self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "potato") def test_given_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None: - self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES - self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "boolean") - - def test_given_primitive_only_and_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None: self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "boolean") def test_given_integers_only_when_infer_schema_then_type_is_integer(self) -> None: - self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema(["2", "90329", "5645"], "integer") def test_given_integer_overlap_with_bool_value_only_when_infer_schema_then_type_is_integer(self) -> None: - self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES - self._test_infer_schema(["1", "90329", "5645"], "integer") # here, "1" is also considered a boolean - - def test_given_primitive_only_and_integers_only_when_infer_schema_then_type_is_integer(self) -> None: self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY - self._test_infer_schema(["2", "90329", "5645"], "integer") + self._test_infer_schema(["1", "90329", "5645"], "integer") # here, "1" is also considered a boolean def test_given_numbers_and_integers_when_infer_schema_then_type_is_number(self) -> None: - self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES - self._test_infer_schema(["2", "90329", "2.312"], "number") - - def test_given_primitive_only_and_numbers_and_integers_when_infer_schema_then_type_is_number(self) -> None: self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema(["2", "90329", "2.312"], "number") - def test_given_arrays_only_when_infer_schema_then_type_is_array(self) -> None: - self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES - self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], "array") - - def test_given_arrays_only_when_infer_schema_then_type_is_string(self) -> None: + def test_given_arrays_when_infer_schema_then_type_is_string(self) -> None: self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], "string") - def test_given_objects_only_when_infer_schema_then_type_is_object(self) -> None: - self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES - self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "object") - - def test_given_primitive_only_and_objects_only_when_infer_schema_then_type_is_string(self) -> None: + def 
test_given_objects_when_infer_schema_then_type_is_object(self) -> None: self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "string") - def test_given_arrays_and_objects_only_when_infer_schema_then_type_is_object(self) -> None: - self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES - self._test_infer_schema(['["first_item", "second_item"]', '{"an_object_key": "an_object_value"}'], "object") - - def test_given_strings_and_objects_only_when_infer_schema_then_type_is_object(self) -> None: - self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES - self._test_infer_schema(['["first_item", "second_item"]', "this is a string"], "object") - def test_given_strings_only_when_infer_schema_then_type_is_string(self) -> None: - self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema(["a string", "another string"], "string") def test_given_a_null_value_when_infer_then_ignore_null(self) -> None: - self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema(["2", "90329", "5645", self._A_NULL_VALUE], "integer") def test_given_only_null_values_when_infer_then_type_is_string(self) -> None: - self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema([self._A_NULL_VALUE, self._A_NULL_VALUE, self._A_NULL_VALUE], "string") def test_given_big_file_when_infer_schema_then_stop_early(self) -> None: - self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._csv_reader.read_data.return_value = ({self._HEADER_NAME: row} for row in ["2." 
+ "2" * 1_000_000] + ["this is a string"]) inferred_schema = self._infer_schema() # since the type is number, we know the string at the end was not considered diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py index 0991e083b596..1acf40fd64ef 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -140,7 +140,6 @@ "enum": [ "None", "Primitive Types Only", - "Primitive and Complex Types", ] }, "delimiter": { From 29c68589fc99209738481b48eedf77a754a80fab Mon Sep 17 00:00:00 2001 From: maxi297 Date: Fri, 11 Aug 2023 09:05:19 -0400 Subject: [PATCH 25/27] Code review --- .../default_file_based_availability_strategy.py | 3 +-- .../airbyte_cdk/sources/file_based/file_types/csv_parser.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py index a72293e7f9d4..84186c064280 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py @@ -18,7 +18,6 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy): - _WITHOUT_SCHEMA = None def __init__(self, stream_reader: AbstractFileBasedStreamReader): self.stream_reader = stream_reader @@ -84,7 +83,7 @@ def _check_parse_record(self, stream: "AbstractFileBasedStream", file: RemoteFil parser = stream.get_parser(stream.config.file_type) try: - record = next(iter(parser.parse_records(stream.config, file, self.stream_reader, logger, self._WITHOUT_SCHEMA))) + record = next(iter(parser.parse_records(stream.config, file, self.stream_reader, logger, discovered_schema=None))) except StopIteration: # The file is empty. 
We've verified that we can open it, so will # consider the connection check successful even though it means diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 60ceb825d9f5..b6240d079d25 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -212,7 +212,7 @@ def _cast_types( raise ValueError(f"Could not get non nullable type from {prop_type}") (prop_type,) = prop_type_distinct - if prop_type in TYPE_PYTHON_MAPPING and prop_type is not None: + if prop_type in TYPE_PYTHON_MAPPING: _, python_type = TYPE_PYTHON_MAPPING[prop_type] if python_type is None: From dc2913f567460a9384e0508113f0e8dfc8d7caf5 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Fri, 11 Aug 2023 13:12:33 +0000 Subject: [PATCH 26/27] Automated Commit - Formatting Changes --- .../default_file_based_availability_strategy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py index 84186c064280..2b81d44be66e 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py @@ -18,7 +18,6 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy): - def __init__(self, stream_reader: AbstractFileBasedStreamReader): self.stream_reader = stream_reader From 8ace2f13b2af2f46d8502f4801fb7aaedb382351 Mon Sep 17 00:00:00 2001 From: Alexandre Girard Date: Mon, 14 Aug 2023 11:59:06 -0700 Subject: [PATCH 27/27] fix unit tests --- .../file_types/test_jsonl_parser.py | 38 +++++++++---------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_jsonl_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_jsonl_parser.py index be6bc50ace98..2214ed1c506d 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_jsonl_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_jsonl_parser.py @@ -18,23 +18,23 @@ b'{"a": 2, "b": "2"}', ] JSONL_CONTENT_WITH_MULTILINE_JSON_OBJECTS = [ - b'{', + b"{", b' "a": 1,', b' "b": "1"', - b'}', - b'{', + b"}", + b"{", b' "a": 2,', b' "b": "2"', - b'}', + b"}", ] INVALID_JSON_CONTENT = [ - b'{', + b"{", b' "a": 1,', b' "b": "1"', - b'{', + b"{", b' "a": 2,', b' "b": "2"', - b'}', + b"}", ] @@ -52,9 +52,7 @@ def _infer_schema(stream_reader: MagicMock) -> Dict[str, Any]: def test_when_infer_then_return_proper_types(stream_reader: MagicMock) -> None: record = {"col1": 1, "col2": 2.2, "col3": "3", "col4": ["a", "list"], "col5": {"inner": "obj"}, "col6": None, "col7": True} - stream_reader.open_file.return_value.__enter__.return_value = io.BytesIO( - json.dumps(record).encode("utf-8") - ) + stream_reader.open_file.return_value.__enter__.return_value = io.BytesIO(json.dumps(record).encode("utf-8")) schema = _infer_schema(stream_reader) @@ -82,7 +80,7 @@ def test_given_no_records_when_infer_then_return_empty_schema(stream_reader: Mag def test_given_limit_hit_when_infer_then_stop_considering_records(stream_reader: 
MagicMock) -> None: - jsonl_file_content = ('{"key": 2.' + "2" * JsonlParser.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE + '}\n{"key": "a string"}') + jsonl_file_content = '{"key": 2.' + "2" * JsonlParser.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE + '}\n{"key": "a string"}' stream_reader.open_file.return_value.__enter__.return_value = io.BytesIO(jsonl_file_content.encode("utf-8")) schema = _infer_schema(stream_reader) @@ -91,9 +89,9 @@ def test_given_limit_hit_when_infer_then_stop_considering_records(stream_reader: def test_given_multiline_json_objects_and_read_limit_hit_when_infer_then_return_parse_until_at_least_one_record( - stream_reader: MagicMock + stream_reader: MagicMock, ) -> None: - jsonl_file_content = ('{\n"key": 2.' + "2" * JsonlParser.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE + "\n}") + jsonl_file_content = '{\n"key": 2.' + "2" * JsonlParser.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE + "\n}" stream_reader.open_file.return_value.__enter__.return_value = io.BytesIO(jsonl_file_content.encode("utf-8")) schema = _infer_schema(stream_reader) @@ -101,9 +99,7 @@ def test_given_multiline_json_objects_and_read_limit_hit_when_infer_then_return_ assert schema == {"key": {"type": "number"}} -def test_given_multiline_json_objects_and_hits_read_limit_when_infer_then_return_proper_types( - stream_reader: MagicMock -) -> None: +def test_given_multiline_json_objects_and_hits_read_limit_when_infer_then_return_proper_types(stream_reader: MagicMock) -> None: stream_reader.open_file.return_value.__enter__.return_value = JSONL_CONTENT_WITH_MULTILINE_JSON_OBJECTS schema = _infer_schema(stream_reader) assert schema == {"a": {"type": "integer"}, "b": {"type": "string"}} @@ -117,7 +113,7 @@ def test_given_multiple_records_then_merge_types(stream_reader: MagicMock) -> No def test_given_one_json_per_line_when_parse_records_then_return_records(stream_reader: MagicMock) -> None: stream_reader.open_file.return_value.__enter__.return_value = JSONL_CONTENT_WITHOUT_MULTILINE_JSON_OBJECTS - records = list(JsonlParser().parse_records(Mock(), Mock(), stream_reader, Mock())) + records = list(JsonlParser().parse_records(Mock(), Mock(), stream_reader, Mock(), None)) assert records == [{"a": 1, "b": "1"}, {"a": 2, "b": "2"}] @@ -125,14 +121,14 @@ def test_given_one_json_per_line_when_parse_records_then_do_not_send_warning(str stream_reader.open_file.return_value.__enter__.return_value = JSONL_CONTENT_WITHOUT_MULTILINE_JSON_OBJECTS logger = Mock() - list(JsonlParser().parse_records(Mock(), Mock(), stream_reader, logger)) + list(JsonlParser().parse_records(Mock(), Mock(), stream_reader, logger, None)) assert logger.warning.call_count == 0 def test_given_multiline_json_object_when_parse_records_then_return_records(stream_reader: MagicMock) -> None: stream_reader.open_file.return_value.__enter__.return_value = JSONL_CONTENT_WITH_MULTILINE_JSON_OBJECTS - records = list(JsonlParser().parse_records(Mock(), Mock(), stream_reader, Mock())) + records = list(JsonlParser().parse_records(Mock(), Mock(), stream_reader, Mock(), None)) assert records == [{"a": 1, "b": "1"}, {"a": 2, "b": "2"}] @@ -140,7 +136,7 @@ def test_given_multiline_json_object_when_parse_records_then_log_once_one_record stream_reader.open_file.return_value.__enter__.return_value = JSONL_CONTENT_WITH_MULTILINE_JSON_OBJECTS logger = Mock() - next(iter(JsonlParser().parse_records(Mock(), Mock(), stream_reader, logger))) + next(iter(JsonlParser().parse_records(Mock(), Mock(), stream_reader, logger, None))) assert logger.warning.call_count == 1 @@ -150,5 +146,5 @@ def 
test_given_unparsable_json_when_parse_records_then_raise_error(stream_reader logger = Mock() with pytest.raises(RecordParseError): - list(JsonlParser().parse_records(Mock(), Mock(), stream_reader, logger)) + list(JsonlParser().parse_records(Mock(), Mock(), stream_reader, logger, None)) assert logger.warning.call_count == 0
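
The validation-policy scenarios earlier in this series assert on the "Could not cast the value to the expected type." warning that is emitted while parsed rows are cast against the deduplicated property types. A minimal, self-contained sketch of that casting step follows; the type-to-callable mapping, the function name, and the omission of boolean handling (true_values/false_values) are simplifications for illustration, not the CDK's actual TYPE_PYTHON_MAPPING or _cast_types.

import logging
from typing import Any, Mapping

# Illustrative subset of JSON schema type -> Python cast; the CDK keeps its own mapping.
_PYTHON_CASTS = {"integer": int, "number": float, "string": str}

def cast_row(row: Mapping[str, str], deduped_property_types: Mapping[str, str], logger: logging.Logger) -> Mapping[str, Any]:
    result = {}
    for key, value in row.items():
        prop_type = deduped_property_types.get(key)
        cast_value: Any = value
        if prop_type in _PYTHON_CASTS:
            try:
                cast_value = _PYTHON_CASTS[prop_type](value)
            except ValueError:
                # Mirrors the warning message the scenarios above assert on; the record is kept as-is.
                logger.warning(f"Could not cast the value to the expected type.: {key}: value={value},expected_type={prop_type}")
        result[key] = cast_value
    return result

# Example: the second column fails the integer cast, logs the warning, and stays a string.
row = cast_row({"col1": "val_b_11", "col2": "this is text"}, {"col1": "string", "col2": "integer"}, logging.getLogger("airbyte"))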
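
The expected log levels in the scenarios above were switched from "WARNING" to "WARN" because the Airbyte protocol names that level WARN while Python's logging module reports WARNING; the test harness now formats captured records with AirbyteLogFormatter so the protocol name is asserted. A small sketch of the level-name translation, where only the WARNING-to-WARN entry is taken from these scenarios and the rest of the mapping is assumed from the protocol's level enum:

import logging

_PYTHON_TO_PROTOCOL_LEVEL = {
    "CRITICAL": "FATAL",
    "ERROR": "ERROR",
    "WARNING": "WARN",
    "INFO": "INFO",
    "DEBUG": "DEBUG",
}

def protocol_level(python_level_name: str) -> str:
    # Fall back to the stdlib name if the level is not in the assumed mapping.
    return _PYTHON_TO_PROTOCOL_LEVEL.get(python_level_name, python_level_name)

assert protocol_level(logging.getLevelName(logging.WARNING)) == "WARN"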
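
The csv_parser changes in this series replace single-type inference with per-value candidate-type sets that are intersected across a column's non-null values, picking the narrowest shared primitive (boolean, then integer, then number) and falling back to string. A minimal sketch under those semantics; the names and the default true/false/null value sets here are illustrative stand-ins, not the CDK's _JsonTypeInferrer or its configured defaults.

from typing import Set

_TRUE_VALUES = {"y", "yes", "t", "true", "on", "1"}
_FALSE_VALUES = {"n", "no", "f", "false", "off", "0"}
_NULL_VALUES = {"", "null"}

def _candidate_types(value: str) -> Set[str]:
    types: Set[str] = set()
    if value in _NULL_VALUES:
        types.add("null")
    if value in _TRUE_VALUES or value in _FALSE_VALUES:
        types.add("boolean")
    try:
        int(value)
        types.update({"integer", "number"})  # an integer literal is also a valid number
    except ValueError:
        try:
            float(value)
            types.add("number")
        except ValueError:
            pass
    types.add("string")  # any value can always be read back as a string
    return types

def infer_column_type(values: Set[str]) -> str:
    per_value = [_candidate_types(value) for value in values]
    non_null = [types for types in per_value if "null" not in types]
    if not non_null:
        return "string"  # only null values were seen
    shared = set.intersection(*non_null)
    for candidate in ("boolean", "integer", "number"):
        if candidate in shared:
            return candidate
    return "string"

assert infer_column_type({"1", "90329", "5645"}) == "integer"  # "1" alone could also be boolean
assert infer_column_type({"2", "90329", "2.312"}) == "number"
assert infer_column_type({"null", "a string"}) == "string"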