From eaba48398265fc31b77cb7a8c86f8182ce3a7200 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Fri, 4 Aug 2023 13:11:21 -0400 Subject: [PATCH 01/21] [ISSUE #28893] infer csv schema --- .../file_based/file_types/csv_parser.py | 387 +++++++++++------- .../file_based/file_types/test_csv_parser.py | 249 ++++++++++- 2 files changed, 476 insertions(+), 160 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 479402877272..3b6e6f7dc915 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -5,9 +5,10 @@ import csv import json import logging +from collections import defaultdict from functools import partial from io import IOBase -from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Set +from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Set from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, QuotingBehavior from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig @@ -27,17 +28,20 @@ } -class CsvParser(FileTypeParser): - async def infer_schema( +class _CsvReader: + def read_data( self, config: FileBasedStreamConfig, file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Dict[str, Any]: - config_format = config.format.get(config.file_type) if config.format else CsvFormat() - if not isinstance(config_format, CsvFormat): - raise ValueError(f"Invalid format config: {config_format}") + file_read_mode: FileReadMode, + ) -> Generator[Dict[str, Any], None, None]: + config_format = _extract_config_format(config) + + # Formats are configured individually per-stream so a unique dialect should be registered for each stream. + # We don't unregister the dialect because we are lazily parsing each csv file to generate records + # This will potentially be a problem if we ever process multiple streams concurrently dialect_name = config.name + DIALECT_NAME csv.register_dialect( dialect_name, @@ -47,13 +51,91 @@ async def infer_schema( doublequote=config_format.double_quote, quoting=config_to_quoting.get(config_format.quoting_behavior, csv.QUOTE_MINIMAL), ) - with stream_reader.open_file(file, self.file_read_mode, logger) as fp: - # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual - # sources will likely require one. Rather than modify the interface now we can wait until the real use case + with stream_reader.open_file(file, file_read_mode, logger) as fp: headers = self._get_headers(fp, config_format, dialect_name) - schema = {field.strip(): {"type": "string"} for field in headers} - csv.unregister_dialect(dialect_name) - return schema + + fp.seek(0) + # we assume that if we autogenerate columns, it is because we don't have headers + # if a user wants to autogenerate_column_names with a CSV having headers, he can skip rows + rows_to_skip = config_format.skip_rows_before_header + (0 if config_format.autogenerate_column_names else 1) + config_format.skip_rows_after_header + self._skip_rows(fp, rows_to_skip) + + reader = csv.DictReader(fp, dialect=dialect_name, fieldnames=headers) # type: ignore + try: + for row in reader: + # The row was not properly parsed if any of the values are None. 
This will most likely occur if there are more columns than + # headers + if None in row: + raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD) + yield row + finally: + # due to RecordParseError or GeneratorExit + csv.unregister_dialect(dialect_name) + + def _get_headers(self, fp: IOBase, config_format: CsvFormat, dialect_name: str) -> List[str]: + # Note that this method assumes the dialect has already been registered if we're parsing the headers + self._skip_rows(fp, config_format.skip_rows_before_header) + if config_format.autogenerate_column_names: + return self._auto_generate_headers(fp, config_format, dialect_name) + else: + # Then read the header + reader = csv.reader(fp, dialect=dialect_name) # type: ignore + return list(next(reader)) + + def _auto_generate_headers(self, fp: IOBase, config_format: CsvFormat, dialect_name: str) -> List[str]: + """ + Generates field names as [f0, f1, ...] in the same way as pyarrow's csv reader with autogenerate_column_names=True. + See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html + """ + reader = csv.reader(fp, dialect=dialect_name) # type: ignore + number_of_columns = len(next(reader)) # type: ignore + # Reset the file pointer to the beginning of the file so that the first row is not skipped + fp.seek(0) + return [f"f{i}" for i in range(number_of_columns)] + + @staticmethod + def _skip_rows(fp: IOBase, rows_to_skip: int) -> None: + """ + Skip rows before the header. This has to be done on the file object itself, not the reader + """ + for _ in range(rows_to_skip): + fp.readline() + + +class CsvParser(FileTypeParser): + _MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE = 1_000_000 + + def __init__(self, csv_reader: Optional[_CsvReader] = None): + self._csv_reader = csv_reader if csv_reader else _CsvReader() + + async def infer_schema( + self, + config: FileBasedStreamConfig, + file: RemoteFile, + stream_reader: AbstractFileBasedStreamReader, + logger: logging.Logger, + ) -> Dict[str, Any]: + # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual + # sources will likely require one. Rather than modify the interface now we can wait until the real use case + config_format = _extract_config_format(config) + type_inferrer_by_field: Dict[str, _TypeInferrer] = defaultdict( + lambda: _TypeInferrer(config_format.true_values, config_format.false_values) + ) + data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode) + read_bytes = 0 + for row in data_generator: + for header, value in row.items(): + type_inferrer_by_field[header].add_value(value) + # This is not accurate as a representation of how many bytes were read because csv does some processing on the actual value + # before returning. 
Given we would like to be more accurate, we could wrap the IO file using a decorator + read_bytes += len(value) + read_bytes += len(row) - 1 # for separators + if read_bytes >= self._MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE: + break + + schema = {header.strip(): {"type": type_inferred.infer()} for header, type_inferred in type_inferrer_by_field.items()} + data_generator.close() + return schema def parse_records( self, @@ -63,54 +145,17 @@ def parse_records( logger: logging.Logger, ) -> Iterable[Dict[str, Any]]: schema: Mapping[str, Any] = config.input_schema # type: ignore - config_format = config.format.get(config.file_type) if config.format else CsvFormat() - if not isinstance(config_format, CsvFormat): - raise ValueError(f"Invalid format config: {config_format}") - # Formats are configured individually per-stream so a unique dialect should be registered for each stream. - # We don't unregister the dialect because we are lazily parsing each csv file to generate records - # This will potentially be a problem if we ever process multiple streams concurrently - dialect_name = config.name + DIALECT_NAME - csv.register_dialect( - dialect_name, - delimiter=config_format.delimiter, - quotechar=config_format.quote_char, - escapechar=config_format.escape_char, - doublequote=config_format.double_quote, - quoting=config_to_quoting.get(config_format.quoting_behavior, csv.QUOTE_MINIMAL), - ) - with stream_reader.open_file(file, self.file_read_mode, logger) as fp: - # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual - # sources will likely require one. Rather than modify the interface now we can wait until the real use case - self._skip_rows_before_header(fp, config_format.skip_rows_before_header) - field_names = self._auto_generate_headers(fp, config_format) if config_format.autogenerate_column_names else None - reader = csv.DictReader(fp, dialect=dialect_name, fieldnames=field_names) # type: ignore - yield from self._read_and_cast_types(reader, schema, config_format, logger) + config_format = _extract_config_format(config) + cast_fn = CsvParser._get_cast_function(schema, config_format, logger) + data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode) + for row in data_generator: + yield CsvParser._to_nullable(cast_fn(row), config_format.null_values) + data_generator.close() @property def file_read_mode(self) -> FileReadMode: return FileReadMode.READ - @staticmethod - def _read_and_cast_types( - reader: csv.DictReader, schema: Optional[Mapping[str, Any]], config_format: CsvFormat, logger: logging.Logger # type: ignore - ) -> Iterable[Dict[str, Any]]: - """ - If the user provided a schema, attempt to cast the record values to the associated type. - - If a column is not in the schema or cannot be cast to an appropriate python type, - cast it to a string. Downstream, the user's validation policy will determine whether the - record should be emitted. 
- """ - cast_fn = CsvParser._get_cast_function(schema, config_format, logger) - for i, row in enumerate(reader): - if i < config_format.skip_rows_after_header: - continue - # The row was not properly parsed if any of the values are None - if any(val is None for val in row.values()): - raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD) - else: - yield CsvParser._to_nullable(cast_fn(row), config_format.null_values) - @staticmethod def _get_cast_function( schema: Optional[Mapping[str, Any]], config_format: CsvFormat, logger: logging.Logger @@ -118,7 +163,7 @@ def _get_cast_function( # Only cast values if the schema is provided if schema: property_types = {col: prop["type"] for col, prop in schema["properties"].items()} - return partial(_cast_types, property_types=property_types, config_format=config_format, logger=logger) + return partial(CsvParser._cast_types, property_types=property_types, config_format=config_format, logger=logger) else: # If no schema is provided, yield the rows as they are return _no_cast @@ -129,96 +174,137 @@ def _to_nullable(row: Mapping[str, str], null_values: Set[str]) -> Dict[str, Opt return nullable @staticmethod - def _skip_rows_before_header(fp: IOBase, rows_to_skip: int) -> None: - """ - Skip rows before the header. This has to be done on the file object itself, not the reader + def _cast_types(row: Dict[str, str], property_types: Dict[str, Any], config_format: CsvFormat, logger: logging.Logger) -> Dict[str, Any]: """ - for _ in range(rows_to_skip): - fp.readline() + Casts the values in the input 'row' dictionary according to the types defined in the JSON schema. - def _get_headers(self, fp: IOBase, config_format: CsvFormat, dialect_name: str) -> List[str]: - # Note that this method assumes the dialect has already been registered if we're parsing the headers - if config_format.autogenerate_column_names: - return self._auto_generate_headers(fp, config_format) - else: - # If we're not autogenerating column names, we need to skip the rows before the header - self._skip_rows_before_header(fp, config_format.skip_rows_before_header) - # Then read the header - reader = csv.DictReader(fp, dialect=dialect_name) # type: ignore - return next(reader) # type: ignore + Array and object types are only handled if they can be deserialized as JSON. - def _auto_generate_headers(self, fp: IOBase, config_format: CsvFormat) -> List[str]: - """ - Generates field names as [f0, f1, ...] in the same way as pyarrow's csv reader with autogenerate_column_names=True. 
- See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html + If any errors are encountered, the value will be emitted as a string._to_nullable """ - next_line = next(fp).strip() - number_of_columns = len(next_line.split(config_format.delimiter)) # type: ignore - # Reset the file pointer to the beginning of the file so that the first row is not skipped - fp.seek(0) - return [f"f{i}" for i in range(number_of_columns)] + warnings = [] + result = {} + + for key, value in row.items(): + prop_type = property_types.get(key) + cast_value: Any = value + + if prop_type in TYPE_PYTHON_MAPPING and prop_type is not None: + _, python_type = TYPE_PYTHON_MAPPING[prop_type] + + if python_type is None: + if value == "": + cast_value = None + else: + warnings.append(_format_warning(key, value, prop_type)) + + elif python_type == bool: + try: + cast_value = _value_to_bool(value, config_format.true_values, config_format.false_values) + except ValueError: + warnings.append(_format_warning(key, value, prop_type)) + + elif python_type == dict: + try: + # we don't re-use _value_to_object here because we type the column as object as long as there is only one object + cast_value = json.loads(value) + except json.JSONDecodeError: + warnings.append(_format_warning(key, value, prop_type)) + + elif python_type == list: + try: + cast_value = _value_to_list(value) + except (ValueError, json.JSONDecodeError): + warnings.append(_format_warning(key, value, prop_type)) + + elif python_type: + try: + cast_value = _value_to_python_type(value, python_type) + except ValueError: + warnings.append(_format_warning(key, value, prop_type)) + else: + warnings.append(_format_warning(key, value, prop_type)) + + result[key] = cast_value + + if warnings: + logger.warning( + f"{FileBasedSourceError.ERROR_CASTING_VALUE.value}: {','.join([w for w in warnings])}", + ) + return result + + +class _TypeInferrer: + _BOOLEAN_TYPE = "boolean" + _NUMBER_TYPE = "number" + _ARRAY_TYPE = "array" + _OBJECT_TYPE = "object" + _STRING_TYPE = "string" + + def __init__(self, boolean_trues: Set[str], boolean_falses: Set[str]) -> None: + self._boolean_trues = boolean_trues + self._boolean_falses = boolean_falses + self._values: Set[str] = set() + + def add_value(self, value: Any) -> None: + self._values.add(value) + + def infer(self) -> str: + types = {self._infer_type(value) for value in self._values} + + if types == {self._BOOLEAN_TYPE}: + return self._BOOLEAN_TYPE + elif types == {self._NUMBER_TYPE}: + return self._NUMBER_TYPE + elif types == {self._ARRAY_TYPE}: + return self._ARRAY_TYPE + elif self._ARRAY_TYPE in types or self._OBJECT_TYPE in types: + return self._OBJECT_TYPE + return self._STRING_TYPE + + def _infer_type(self, value: str) -> str: + if self._is_boolean(value): + return self._BOOLEAN_TYPE + elif self._is_number(value): + return self._NUMBER_TYPE + elif self._is_array(value): + return self._ARRAY_TYPE + elif self._is_object(value): + return self._OBJECT_TYPE + else: + return self._STRING_TYPE -def _cast_types(row: Dict[str, str], property_types: Dict[str, Any], config_format: CsvFormat, logger: logging.Logger) -> Dict[str, Any]: - """ - Casts the values in the input 'row' dictionary according to the types defined in the JSON schema. - - Array and object types are only handled if they can be deserialized as JSON. - - If any errors are encountered, the value will be emitted as a string. 
- """ - warnings = [] - result = {} - - for key, value in row.items(): - prop_type = property_types.get(key) - cast_value: Any = value - - if prop_type in TYPE_PYTHON_MAPPING and prop_type is not None: - _, python_type = TYPE_PYTHON_MAPPING[prop_type] - - if python_type is None: - if value == "": - cast_value = None - else: - warnings.append(_format_warning(key, value, prop_type)) - - elif python_type == bool: - try: - cast_value = _value_to_bool(value, config_format.true_values, config_format.false_values) - except ValueError: - warnings.append(_format_warning(key, value, prop_type)) - - elif python_type == dict: - try: - cast_value = json.loads(value) - except json.JSONDecodeError: - warnings.append(_format_warning(key, value, prop_type)) - - elif python_type == list: - try: - parsed_value = json.loads(value) - if isinstance(parsed_value, list): - cast_value = parsed_value - except json.JSONDecodeError: - warnings.append(_format_warning(key, value, prop_type)) - - elif python_type: - try: - cast_value = python_type(value) - except ValueError: - warnings.append(_format_warning(key, value, prop_type)) + def _is_boolean(self, value: str) -> bool: + try: + _value_to_bool(value, self._boolean_trues, self._boolean_falses) + return True + except ValueError: + return False - else: - warnings.append(_format_warning(key, value, prop_type)) + @staticmethod + def _is_number(value: str) -> bool: + try: + _value_to_python_type(value, float) + return True + except ValueError: + return False - result[key] = cast_value + @staticmethod + def _is_array(value: str) -> bool: + try: + _value_to_list(value) + return True + except (ValueError, json.JSONDecodeError): + return False - if warnings: - logger.warning( - f"{FileBasedSourceError.ERROR_CASTING_VALUE.value}: {','.join([w for w in warnings])}", - ) - return result + @staticmethod + def _is_object(value: str) -> bool: + try: + _value_to_object(value) + return True + except (ValueError, json.JSONDecodeError): + return False def _value_to_bool(value: str, true_values: Set[str], false_values: Set[str]) -> bool: @@ -229,9 +315,34 @@ def _value_to_bool(value: str, true_values: Set[str], false_values: Set[str]) -> raise ValueError(f"Value {value} is not a valid boolean value") +def _value_to_object(value: str) -> Dict[Any, Any]: + parsed_value = json.loads(value) + if isinstance(parsed_value, dict): + return parsed_value + raise ValueError(f"Value {parsed_value} is not a valid dict value") + + +def _value_to_list(value: str) -> List[Any]: + parsed_value = json.loads(value) + if isinstance(parsed_value, list): + return parsed_value + raise ValueError(f"Value {parsed_value} is not a valid list value") + + +def _value_to_python_type(value: str, python_type: type) -> Any: + return python_type(value) + + def _format_warning(key: str, value: str, expected_type: Optional[Any]) -> str: return f"{key}: value={value},expected_type={expected_type}" def _no_cast(row: Mapping[str, str]) -> Mapping[str, str]: return row + + +def _extract_config_format(config: FileBasedStreamConfig) -> CsvFormat: + config_format = config.format.get(config.file_type) if config.format else CsvFormat() + if not isinstance(config_format, CsvFormat): + raise ValueError(f"Invalid format config: {config_format}") + return config_format diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index 1d2079396383..48fb991f2195 100644 --- 
a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -2,13 +2,21 @@ # Copyright (c) 2023 Airbyte, Inc., all rights reserved. # +import asyncio +import csv +import io import logging -from unittest.mock import MagicMock, Mock +import unittest +from typing import Any, Dict, Generator, List, Set +from unittest.mock import Mock +from unittest import TestCase import pytest from airbyte_cdk.sources.file_based.config.csv_format import DEFAULT_FALSE_VALUES, DEFAULT_TRUE_VALUES, CsvFormat from airbyte_cdk.sources.file_based.exceptions import RecordParseError -from airbyte_cdk.sources.file_based.file_types.csv_parser import CsvParser, _cast_types +from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode +from airbyte_cdk.sources.file_based.file_types.csv_parser import _CsvReader, CsvParser +from airbyte_cdk.sources.file_based.remote_file import RemoteFile PROPERTY_TYPES = { "col1": "null", @@ -70,29 +78,226 @@ pytest.param({"col10": "x"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col10": "x"}, id="item-not-in-props-doesn't-error"), ] ) -def test_cast_to_python_type(row, true_values, false_values, expected_output): +def test_cast_to_python_type(row: Dict[str, str], true_values: Set[str], false_values: Set[str], expected_output: Dict[str, Any]) -> None: csv_format = CsvFormat(true_values=true_values, false_values=false_values) - assert _cast_types(row, PROPERTY_TYPES, csv_format, logger) == expected_output + assert CsvParser._cast_types(row, PROPERTY_TYPES, csv_format, logger) == expected_output -@pytest.mark.parametrize( - "reader_values, expected_rows", [ - pytest.param([{"col1": "1", "col2": None}], None, id="raise_exception_if_any_value_is_none"), - pytest.param([{"col1": "1", "col2": "2"}], [{"col1": "1", "col2": "2"}], id="read_no_cast"), - ] -) -def test_read_and_cast_types(reader_values, expected_rows): - reader = MagicMock() - reader.__iter__.return_value = reader_values - schema = {} - config_format = CsvFormat() - logger = Mock() +_DEFAULT_TRUE_VALUES = {"yes", "yeah", "right"} +_DEFAULT_FALSE_VALUES = {"no", "nop", "wrong"} + + +class SchemaInferrenceTestCase(TestCase): + def setUp(self) -> None: + self._config_format = CsvFormat() + self._config_format.true_values = _DEFAULT_TRUE_VALUES + self._config_format.false_values = _DEFAULT_FALSE_VALUES + self._config = Mock() + self._config.format.get.return_value = self._config_format + + self._file = Mock(spec=RemoteFile) + self._stream_reader = Mock(spec=AbstractFileBasedStreamReader) + self._logger = Mock(spec=logging.Logger) + self._csv_reader = Mock(spec=_CsvReader) + self._parser = CsvParser(self._csv_reader) + + def test_given_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None: + self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "boolean") + + def test_given_numbers_only_when_infer_schema_then_type_is_number(self) -> None: + self._test_infer_schema(["2", "90329", "2.312"], "number") + + def test_given_arrays_only_when_infer_schema_then_type_is_array(self) -> None: + self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], "array") + + def test_given_objects_only_when_infer_schema_then_type_is_object(self) -> None: + self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "object") + + def 
test_given_arrays_and_objects_only_when_infer_schema_then_type_is_object(self) -> None: + self._test_infer_schema(['["first_item", "second_item"]', '{"an_object_key": "an_object_value"}'], "object") + + def test_given_strings_and_objects_only_when_infer_schema_then_type_is_object(self) -> None: + self._test_infer_schema(['["first_item", "second_item"]', "this is a string"], "object") + + def test_given_strings_only_when_infer_schema_then_type_is_string(self) -> None: + self._test_infer_schema(["a string", "another string"], "string") + + def _test_infer_schema(self, rows: List[str], expected_type: str) -> None: + self._csv_reader.read_data.return_value = ({"header": row} for row in rows) + inferred_schema = self._infer_schema() + assert inferred_schema == {"header": {"type": expected_type}} + + def test_given_big_file_when_infer_schema_then_stop_early(self) -> None: + self._csv_reader.read_data.return_value = ({"header": row} for row in ["2" * 1_000_000] + ["this is a string"]) + inferred_schema = self._infer_schema() + # since the type is number, we know the string at the end was not considered + assert inferred_schema == {"header": {"type": "number"}} + + def _infer_schema(self): + loop = asyncio.new_event_loop() + task = loop.create_task(self._parser.infer_schema(self._config, self._file, self._stream_reader, self._logger)) + loop.run_until_complete(task) + return task.result() + + +class CsvFileBuilder: + def __init__(self) -> None: + self._prefixed_rows: List[str] = [] + self._data: List[str] = [] + + def with_prefixed_rows(self, rows: List[str]) -> 'CsvFileBuilder': + self._prefixed_rows = rows + return self + + def with_data(self, data: List[str]) -> 'CsvFileBuilder': + self._data = data + return self + + def build(self) -> io.StringIO: + return io.StringIO("\n".join(self._prefixed_rows + self._data)) + + +class CsvReaderTest(unittest.TestCase): + _CONFIG_NAME = "config_name" + + def setUp(self) -> None: + self._config_format = CsvFormat() + self._config = Mock() + self._config.name = self._CONFIG_NAME + self._config.format.get.return_value = self._config_format + + self._file = Mock(spec=RemoteFile) + self._stream_reader = Mock(spec=AbstractFileBasedStreamReader) + self._logger = Mock(spec=logging.Logger) + self._csv_reader = _CsvReader() + + def test_given_skip_rows_when_read_data_then_do_not_considered_prefixed_rows(self) -> None: + self._config_format.skip_rows_before_header = 2 + self._stream_reader.open_file.return_value = CsvFileBuilder().with_prefixed_rows(["first line", "second line"]).with_data([ + "header", + "a value", + "another value", + ]).build() + + data_generator = self._read_data() - parser = CsvParser() + assert list(data_generator) == [{"header": "a value"}, {"header": "another value"}] + + def test_given_autogenerated_headers_when_read_data_then_generate_headers_with_format_fX(self) -> None: + self._config_format.autogenerate_column_names = True + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + '0,1,2,3,4,5,6' + ]).build() + + data_generator = self._read_data() + + assert list(data_generator) == [{"f0": "0", "f1": "1", "f2": "2", "f3": "3", "f4": "4", "f5": "5", "f6": "6"}] + + def test_given_skip_rows_after_header_when_read_data_then_do_not_parse_skipped_rows(self) -> None: + self._config_format.skip_rows_after_header = 1 + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + "header1,header2", + "skipped row: important that the is no comma in this string to test if columns do not match in skipped rows", + "a 
value 1,a value 2", + "another value 1,another value 2" + ]).build() + + data_generator = self._read_data() + + assert list(data_generator) == [ + {"header1": "a value 1", "header2": "a value 2"}, + {"header1": "another value 1", "header2": "another value 2"} + ] + + def test_given_quote_delimiter_when_read_data_then_parse_properly(self) -> None: + self._config_format.delimiter = "|" + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + "header1|header2", + "a value 1|a value 2", + ]).build() + + data_generator = self._read_data() + + assert list(data_generator) == [{"header1": "a value 1", "header2": "a value 2"}] + + def test_given_quote_char_when_read_data_then_parse_properly(self) -> None: + self._config_format.quote_char = "|" + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + "header1,header2", + "|a,value,1|,|a,value,2|", + ]).build() + + data_generator = self._read_data() + + assert list(data_generator) == [{"header1": "a,value,1", "header2": "a,value,2"}] + + def test_given_escape_char_when_read_data_then_parse_properly(self) -> None: + self._config_format.escape_char = "|" + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + "header1,header2", + '"a |"value|", 1",a value 2', + ]).build() + + data_generator = self._read_data() + + assert list(data_generator) == [{"header1": 'a "value", 1', "header2": "a value 2"}] + + def test_given_double_quote_on_when_read_data_then_parse_properly(self) -> None: + self._config_format.double_quote = True + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + "header1,header2", + '1,"Text with doublequote: ""This is a text."""', + ]).build() + + data_generator = self._read_data() + + assert list(data_generator) == [{"header1": "1", "header2": 'Text with doublequote: "This is a text."'}] + + def test_given_double_quote_off_when_read_data_then_parse_properly(self) -> None: + self._config_format.double_quote = False + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + "header1,header2", + '1,"Text with doublequote: ""This is a text."""', + ]).build() + + data_generator = self._read_data() + + assert list(data_generator) == [{"header1": "1", "header2": 'Text with doublequote: "This is a text."""'}] + + def test_given_generator_closed_when_read_data_then_unregister_dialect(self) -> None: + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + "header", + "a value", + "another value", + ]).build() + + data_generator = self._read_data() + next(data_generator) + assert f"{self._CONFIG_NAME}_config_dialect" in csv.list_dialects() + data_generator.close() + assert f"{self._CONFIG_NAME}_config_dialect" not in csv.list_dialects() + + def test_given_exception_when_read_data_then_unregister_dialect(self) -> None: + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + "header", + "a value", + "too many values,value,value,value", + ]).build() + + data_generator = self._read_data() + next(data_generator) + assert f"{self._CONFIG_NAME}_config_dialect" in csv.list_dialects() - expected_rows = expected_rows - if expected_rows is None: with pytest.raises(RecordParseError): - list(parser._read_and_cast_types(reader, schema, config_format, logger)) - else: - assert expected_rows == list(parser._read_and_cast_types(reader, schema, config_format, logger)) + next(data_generator) + assert f"{self._CONFIG_NAME}_config_dialect" not in csv.list_dialects() + + def _read_data(self) -> Generator[Dict[str, str], 
None, None]: + data_generator = self._csv_reader.read_data( + self._config, + self._file, + self._stream_reader, + self._logger, + FileReadMode.READ, + ) + return data_generator From 60757c1e21b6752b53ac4b93a1d5c493daea8621 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Fri, 4 Aug 2023 15:03:25 -0400 Subject: [PATCH 02/21] [ISSUE #28893] align with pyarrow --- .../file_based/file_types/csv_parser.py | 20 ++++++++++++++----- .../file_based/file_types/test_csv_parser.py | 15 ++++++++------ 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 3b6e6f7dc915..9c7b58aadca4 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -237,6 +237,7 @@ def _cast_types(row: Dict[str, str], property_types: Dict[str, Any], config_form class _TypeInferrer: _BOOLEAN_TYPE = "boolean" + _INTEGER_TYPE = "integer" _NUMBER_TYPE = "number" _ARRAY_TYPE = "array" _OBJECT_TYPE = "object" @@ -255,17 +256,18 @@ def infer(self) -> str: if types == {self._BOOLEAN_TYPE}: return self._BOOLEAN_TYPE - elif types == {self._NUMBER_TYPE}: + elif types == {self._INTEGER_TYPE}: + return self._INTEGER_TYPE + elif types == {self._NUMBER_TYPE} or types == {self._INTEGER_TYPE, self._NUMBER_TYPE}: return self._NUMBER_TYPE - elif types == {self._ARRAY_TYPE}: - return self._ARRAY_TYPE - elif self._ARRAY_TYPE in types or self._OBJECT_TYPE in types: - return self._OBJECT_TYPE + # to keep backward compatibility with PyArrow, we will not parse types return self._STRING_TYPE def _infer_type(self, value: str) -> str: if self._is_boolean(value): return self._BOOLEAN_TYPE + elif self._is_integer(value): + return self._INTEGER_TYPE elif self._is_number(value): return self._NUMBER_TYPE elif self._is_array(value): @@ -282,6 +284,14 @@ def _is_boolean(self, value: str) -> bool: except ValueError: return False + @staticmethod + def _is_integer(value: str) -> bool: + try: + _value_to_python_type(value, int) + return True + except ValueError: + return False + @staticmethod def _is_number(value: str) -> bool: try: diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index 48fb991f2195..4d3658091c84 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -104,20 +104,23 @@ def setUp(self) -> None: def test_given_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None: self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "boolean") - def test_given_numbers_only_when_infer_schema_then_type_is_number(self) -> None: + def test_given_integers_only_when_infer_schema_then_type_is_integer(self) -> None: + self._test_infer_schema(["2", "90329", "5645"], "integer") + + def test_given_numbers_and_integers_when_infer_schema_then_type_is_number(self) -> None: self._test_infer_schema(["2", "90329", "2.312"], "number") def test_given_arrays_only_when_infer_schema_then_type_is_array(self) -> None: - self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], "array") + self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], 
"string") def test_given_objects_only_when_infer_schema_then_type_is_object(self) -> None: - self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "object") + self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "string") def test_given_arrays_and_objects_only_when_infer_schema_then_type_is_object(self) -> None: - self._test_infer_schema(['["first_item", "second_item"]', '{"an_object_key": "an_object_value"}'], "object") + self._test_infer_schema(['["first_item", "second_item"]', '{"an_object_key": "an_object_value"}'], "string") def test_given_strings_and_objects_only_when_infer_schema_then_type_is_object(self) -> None: - self._test_infer_schema(['["first_item", "second_item"]', "this is a string"], "object") + self._test_infer_schema(['["first_item", "second_item"]', "this is a string"], "string") def test_given_strings_only_when_infer_schema_then_type_is_string(self) -> None: self._test_infer_schema(["a string", "another string"], "string") @@ -128,7 +131,7 @@ def _test_infer_schema(self, rows: List[str], expected_type: str) -> None: assert inferred_schema == {"header": {"type": expected_type}} def test_given_big_file_when_infer_schema_then_stop_early(self) -> None: - self._csv_reader.read_data.return_value = ({"header": row} for row in ["2" * 1_000_000] + ["this is a string"]) + self._csv_reader.read_data.return_value = ({"header": row} for row in ["2." + "2" * 1_000_000] + ["this is a string"]) inferred_schema = self._infer_schema() # since the type is number, we know the string at the end was not considered assert inferred_schema == {"header": {"type": "number"}} From f49235d22b038c925fd54ee5f572bb6f5fd8fd51 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Fri, 4 Aug 2023 19:08:14 +0000 Subject: [PATCH 03/21] Automated Commit - Formatting Changes --- .../sources/file_based/file_types/csv_parser.py | 10 ++++++++-- .../sources/file_based/file_types/test_csv_parser.py | 4 ++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 9c7b58aadca4..412e460846e9 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -57,7 +57,11 @@ def read_data( fp.seek(0) # we assume that if we autogenerate columns, it is because we don't have headers # if a user wants to autogenerate_column_names with a CSV having headers, he can skip rows - rows_to_skip = config_format.skip_rows_before_header + (0 if config_format.autogenerate_column_names else 1) + config_format.skip_rows_after_header + rows_to_skip = ( + config_format.skip_rows_before_header + + (0 if config_format.autogenerate_column_names else 1) + + config_format.skip_rows_after_header + ) self._skip_rows(fp, rows_to_skip) reader = csv.DictReader(fp, dialect=dialect_name, fieldnames=headers) # type: ignore @@ -174,7 +178,9 @@ def _to_nullable(row: Mapping[str, str], null_values: Set[str]) -> Dict[str, Opt return nullable @staticmethod - def _cast_types(row: Dict[str, str], property_types: Dict[str, Any], config_format: CsvFormat, logger: logging.Logger) -> Dict[str, Any]: + def _cast_types( + row: Dict[str, str], property_types: Dict[str, Any], config_format: CsvFormat, logger: logging.Logger + ) -> Dict[str, Any]: """ Casts the values in the input 'row' dictionary according to the types defined in the JSON schema. 
diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index 4d3658091c84..811cb8f9fc7c 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -8,14 +8,14 @@ import logging import unittest from typing import Any, Dict, Generator, List, Set -from unittest.mock import Mock from unittest import TestCase +from unittest.mock import Mock import pytest from airbyte_cdk.sources.file_based.config.csv_format import DEFAULT_FALSE_VALUES, DEFAULT_TRUE_VALUES, CsvFormat from airbyte_cdk.sources.file_based.exceptions import RecordParseError from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode -from airbyte_cdk.sources.file_based.file_types.csv_parser import _CsvReader, CsvParser +from airbyte_cdk.sources.file_based.file_types.csv_parser import CsvParser, _CsvReader from airbyte_cdk.sources.file_based.remote_file import RemoteFile PROPERTY_TYPES = { From a3946660772937d63fe9da58c930d50d533cea9a Mon Sep 17 00:00:00 2001 From: maxi297 Date: Mon, 7 Aug 2023 10:14:50 -0400 Subject: [PATCH 04/21] [ISSUE #28893] legacy inference and infer only when needed --- .../sources/file_based/config/csv_format.py | 18 ++++ .../file_based/file_types/csv_parser.py | 59 +++++++++++-- .../file_based/config/test_csv_format.py | 20 ++++- .../file_based/file_types/test_csv_parser.py | 84 ++++++++++++++++--- 4 files changed, 162 insertions(+), 19 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py index 1fda1016a00c..c3a75e21611e 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py @@ -78,6 +78,16 @@ class CsvFormat(BaseModel): default=DEFAULT_FALSE_VALUES, description="A set of case-sensitive strings that should be interpreted as false values.", ) + infer_datatypes: bool = Field( + title="Infer Datatypes", + default=False, + description="Whether to autogenerate the schema based the file content.", + ) + infer_datatypes_legacy: bool = Field( + title="Infer Datatypes (legacy)", + default=False, + description="Whether to autogenerate the schema based the file content. 
This inferrence does not support list and objects", + ) @validator("delimiter") def validate_delimiter(cls, v: str) -> str: @@ -114,3 +124,11 @@ def validate_option_combinations(cls, values: Mapping[str, Any]) -> Mapping[str, if skip_rows_before_header > 0 and auto_generate_column_names: raise ValueError("Cannot skip rows before header and autogenerate column names at the same time.") return values + + @root_validator + def validate_option_inference(cls, values: Mapping[str, Any]) -> Mapping[str, Any]: + infer_datatypes = values.get("infer_datatypes", False) + infer_datatypes_legacy = values.get("infer_datatypes_legacy", False) + if infer_datatypes and infer_datatypes_legacy: + raise ValueError("Only one way to inference can be configured at the same time") + return values diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 412e460846e9..7c470f6d4d0c 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -5,6 +5,7 @@ import csv import json import logging +from abc import ABC, abstractmethod from collections import defaultdict from functools import partial from io import IOBase @@ -67,9 +68,9 @@ def read_data( reader = csv.DictReader(fp, dialect=dialect_name, fieldnames=headers) # type: ignore try: for row in reader: - # The row was not properly parsed if any of the values are None. This will most likely occur if there are more columns than - # headers - if None in row: + # The row was not properly parsed if any of the values are None. This will most likely occur if there are more columns + # than headers or more headers dans columns + if None in row or None in row.values(): raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD) yield row finally: @@ -119,11 +120,21 @@ async def infer_schema( stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, ) -> Dict[str, Any]: + if config.input_schema: + # FIXME: what happens if it's a string + return config.input_schema + # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual # sources will likely require one. 
Rather than modify the interface now we can wait until the real use case config_format = _extract_config_format(config) type_inferrer_by_field: Dict[str, _TypeInferrer] = defaultdict( - lambda: _TypeInferrer(config_format.true_values, config_format.false_values) + lambda: _JsonTypeInferrer( + config_format.true_values, + config_format.false_values, + config_format.null_values, + config_format.infer_datatypes and not config_format.infer_datatypes_legacy + ) if config_format.infer_datatypes or config_format.infer_datatypes_legacy + else _DisabledTypeInferrer() ) data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode) read_bytes = 0 @@ -241,7 +252,26 @@ def _cast_types( return result -class _TypeInferrer: +class _TypeInferrer(ABC): + @abstractmethod + def add_value(self, value: Any) -> None: + pass + + @abstractmethod + def infer(self) -> str: + pass + + +class _DisabledTypeInferrer(_TypeInferrer): + def add_value(self, value: Any) -> None: + pass + + def infer(self) -> str: + return "string" + + +class _JsonTypeInferrer(_TypeInferrer): + _NULL_TYPE = "null" _BOOLEAN_TYPE = "boolean" _INTEGER_TYPE = "integer" _NUMBER_TYPE = "number" @@ -249,9 +279,11 @@ class _TypeInferrer: _OBJECT_TYPE = "object" _STRING_TYPE = "string" - def __init__(self, boolean_trues: Set[str], boolean_falses: Set[str]) -> None: + def __init__(self, boolean_trues: Set[str], boolean_falses: Set[str], null_values: Set[str], allow_for_objects_and_arrays: bool) -> None: self._boolean_trues = boolean_trues self._boolean_falses = boolean_falses + self._null_values = null_values + self._allow_for_objects_and_arrays = allow_for_objects_and_arrays self._values: Set[str] = set() def add_value(self, value: Any) -> None: @@ -259,18 +291,29 @@ def add_value(self, value: Any) -> None: def infer(self) -> str: types = {self._infer_type(value) for value in self._values} + if types == {self._NULL_TYPE}: + # this is highly unusual but we will consider the column as a string + return self._STRING_TYPE + types.discard(self._NULL_TYPE) if types == {self._BOOLEAN_TYPE}: return self._BOOLEAN_TYPE elif types == {self._INTEGER_TYPE}: return self._INTEGER_TYPE elif types == {self._NUMBER_TYPE} or types == {self._INTEGER_TYPE, self._NUMBER_TYPE}: return self._NUMBER_TYPE - # to keep backward compatibility with PyArrow, we will not parse types + elif not self._allow_for_objects_and_arrays: + return self._STRING_TYPE + elif types == {self._ARRAY_TYPE}: + return self._ARRAY_TYPE + elif self._ARRAY_TYPE in types or self._OBJECT_TYPE in types: + return self._OBJECT_TYPE return self._STRING_TYPE def _infer_type(self, value: str) -> str: - if self._is_boolean(value): + if value in self._null_values: + return self._NULL_TYPE + elif self._is_boolean(value): return self._BOOLEAN_TYPE elif self._is_integer(value): return self._INTEGER_TYPE diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py index 6903f126af30..edd39964124c 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py @@ -3,6 +3,7 @@ # import pytest as pytest +from typing import Type from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat @@ -15,9 +16,26 @@ pytest.param(0, False, None, id="test_no_skip_rows_before_header_and_no_autogenerate_column_names"), ] ) -def test_csv_format(skip_rows_before_header, 
autogenerate_column_names, expected_error): +def test_csv_format_skip_rows_and_autogenerate_column_names(skip_rows_before_header, autogenerate_column_names, expected_error) -> None: if expected_error: with pytest.raises(expected_error): CsvFormat(skip_rows_before_header=skip_rows_before_header, autogenerate_column_names=autogenerate_column_names) else: CsvFormat(skip_rows_before_header=skip_rows_before_header, autogenerate_column_names=autogenerate_column_names) + + +@pytest.mark.parametrize( + "infer_datatypes, infer_datatypes_legacy, expected_error", + [ + pytest.param(True, True, ValueError, id="test_many_inferences_configured"), + pytest.param(True, False, None, id="test_infer_datatypes"), + pytest.param(False, True, None, id="test_infer_datatypes_legacy"), + pytest.param(False, False, None, id="test_no_inference"), + ] +) +def test_csv_format_inference(infer_datatypes: bool, infer_datatypes_legacy: bool, expected_error: Type[BaseException]) -> None: + if expected_error: + with pytest.raises(expected_error): + CsvFormat(infer_datatypes=infer_datatypes, infer_datatypes_legacy=infer_datatypes_legacy) + else: + CsvFormat(infer_datatypes=infer_datatypes, infer_datatypes_legacy=infer_datatypes_legacy) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index 811cb8f9fc7c..5547f1254038 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -87,12 +87,19 @@ def test_cast_to_python_type(row: Dict[str, str], true_values: Set[str], false_v _DEFAULT_FALSE_VALUES = {"no", "nop", "wrong"} -class SchemaInferrenceTestCase(TestCase): +class SchemaInferenceTestCase(TestCase): + _A_NULL_VALUE = "null" + _HEADER_NAME = "header" + def setUp(self) -> None: self._config_format = CsvFormat() self._config_format.true_values = _DEFAULT_TRUE_VALUES self._config_format.false_values = _DEFAULT_FALSE_VALUES + self._config_format.null_values = {self._A_NULL_VALUE} + self._config_format.infer_datatypes = False + self._config_format.infer_datatypes_legacy = False self._config = Mock() + self._config.input_schema = None self._config.format.get.return_value = self._config_format self._file = Mock(spec=RemoteFile) @@ -101,40 +108,82 @@ def setUp(self) -> None: self._csv_reader = Mock(spec=_CsvReader) self._parser = CsvParser(self._csv_reader) + def test_given_user_schema_defined_when_infer_schema_then_return_user_schema(self) -> None: + self._config.input_schema = {self._HEADER_NAME: {"type": "potato"}} + self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "potato") + + def test_given_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None: + self._config_format.infer_datatypes = True + self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "boolean") + + def test_given_legacy_and_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None: + self._config_format.infer_datatypes_legacy = True self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "boolean") def test_given_integers_only_when_infer_schema_then_type_is_integer(self) -> None: + self._config_format.infer_datatypes = True + self._test_infer_schema(["2", "90329", "5645"], "integer") + + def test_given_legacy_and_integers_only_when_infer_schema_then_type_is_integer(self) -> None: + self._config_format.infer_datatypes_legacy 
= True self._test_infer_schema(["2", "90329", "5645"], "integer") def test_given_numbers_and_integers_when_infer_schema_then_type_is_number(self) -> None: + self._config_format.infer_datatypes = True + self._test_infer_schema(["2", "90329", "2.312"], "number") + + def test_given_legacy_and_numbers_and_integers_when_infer_schema_then_type_is_number(self) -> None: + self._config_format.infer_datatypes_legacy = True self._test_infer_schema(["2", "90329", "2.312"], "number") def test_given_arrays_only_when_infer_schema_then_type_is_array(self) -> None: + self._config_format.infer_datatypes = True + self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], "array") + + def test_given_arrays_only_when_infer_schema_then_type_is_string(self) -> None: + self._config_format.infer_datatypes_legacy = True self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], "string") def test_given_objects_only_when_infer_schema_then_type_is_object(self) -> None: + self._config_format.infer_datatypes = True + self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "object") + + def test_given_legacy_and_objects_only_when_infer_schema_then_type_is_string(self) -> None: + self._config_format.infer_datatypes_legacy = True self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "string") def test_given_arrays_and_objects_only_when_infer_schema_then_type_is_object(self) -> None: - self._test_infer_schema(['["first_item", "second_item"]', '{"an_object_key": "an_object_value"}'], "string") + self._config_format.infer_datatypes = True + self._test_infer_schema(['["first_item", "second_item"]', '{"an_object_key": "an_object_value"}'], "object") def test_given_strings_and_objects_only_when_infer_schema_then_type_is_object(self) -> None: - self._test_infer_schema(['["first_item", "second_item"]', "this is a string"], "string") + self._config_format.infer_datatypes = True + self._test_infer_schema(['["first_item", "second_item"]', "this is a string"], "object") def test_given_strings_only_when_infer_schema_then_type_is_string(self) -> None: + self._config_format.infer_datatypes = True self._test_infer_schema(["a string", "another string"], "string") - def _test_infer_schema(self, rows: List[str], expected_type: str) -> None: - self._csv_reader.read_data.return_value = ({"header": row} for row in rows) - inferred_schema = self._infer_schema() - assert inferred_schema == {"header": {"type": expected_type}} + def test_given_a_null_value_when_infer_then_ignore_null(self) -> None: + self._config_format.infer_datatypes = True + self._test_infer_schema(["2", "90329", "5645", self._A_NULL_VALUE], "integer") + + def test_given_only_null_values_when_infer_then_type_is_string(self) -> None: + self._config_format.infer_datatypes = True + self._test_infer_schema([self._A_NULL_VALUE, self._A_NULL_VALUE, self._A_NULL_VALUE], "string") def test_given_big_file_when_infer_schema_then_stop_early(self) -> None: - self._csv_reader.read_data.return_value = ({"header": row} for row in ["2." + "2" * 1_000_000] + ["this is a string"]) + self._config_format.infer_datatypes = True + self._csv_reader.read_data.return_value = ({self._HEADER_NAME: row} for row in ["2." 
+ "2" * 1_000_000] + ["this is a string"]) inferred_schema = self._infer_schema() # since the type is number, we know the string at the end was not considered - assert inferred_schema == {"header": {"type": "number"}} + assert inferred_schema == {self._HEADER_NAME: {"type": "number"}} + + def _test_infer_schema(self, rows: List[str], expected_type: str) -> None: + self._csv_reader.read_data.return_value = ({self._HEADER_NAME: row} for row in rows) + inferred_schema = self._infer_schema() + assert inferred_schema == {self._HEADER_NAME: {"type": expected_type}} def _infer_schema(self): loop = asyncio.new_event_loop() @@ -280,7 +329,7 @@ def test_given_generator_closed_when_read_data_then_unregister_dialect(self) -> data_generator.close() assert f"{self._CONFIG_NAME}_config_dialect" not in csv.list_dialects() - def test_given_exception_when_read_data_then_unregister_dialect(self) -> None: + def test_given_too_many_values_for_columns_when_read_data_then_raise_exception_and_unregister_dialect(self) -> None: self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ "header", "a value", @@ -295,6 +344,21 @@ def test_given_exception_when_read_data_then_unregister_dialect(self) -> None: next(data_generator) assert f"{self._CONFIG_NAME}_config_dialect" not in csv.list_dialects() + def test_given_too_few_values_for_columns_when_read_data_then_raise_exception_and_unregister_dialect(self) -> None: + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ + "header1,header2,header3", + "value1,value2,value3", + "a value", + ]).build() + + data_generator = self._read_data() + next(data_generator) + assert f"{self._CONFIG_NAME}_config_dialect" in csv.list_dialects() + + with pytest.raises(RecordParseError): + next(data_generator) + assert f"{self._CONFIG_NAME}_config_dialect" not in csv.list_dialects() + def _read_data(self) -> Generator[Dict[str, str], None, None]: data_generator = self._csv_reader.read_data( self._config, From 4f9d162b86e1d16aca1f153e321f882dbc639443 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Mon, 7 Aug 2023 11:47:05 -0400 Subject: [PATCH 05/21] [ISSUE #28893] fix scenario tests --- .../file_based/file_types/csv_parser.py | 2 +- .../file_based/scenarios/csv_scenarios.py | 17 ++++++++-- .../scenarios/validation_policy_scenarios.py | 34 +++++++++---------- 3 files changed, 33 insertions(+), 20 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 7c470f6d4d0c..953c87fc70a1 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -121,7 +121,7 @@ async def infer_schema( logger: logging.Logger, ) -> Dict[str, Any]: if config.input_schema: - # FIXME: what happens if it's a string + # FIXME change type of method to Mapping return config.input_schema # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py index c5714cc570b3..b928d7445c37 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -8,7 +8,7 @@ single_csv_scenario = ( TestScenarioBuilder() - 
.set_name("single_csv_stream") + .set_name("single_csv_scenario") .set_config( { "streams": [ @@ -249,7 +249,19 @@ "type": "string" }, "uniqueItems": True - } + }, + "infer_datatypes": { + "default": False, + "description": "Whether to autogenerate the schema based the file content.", + "title": "Infer Datatypes", + "type": "boolean" + }, + "infer_datatypes_legacy": { + "default": False, + "description": "Whether to autogenerate the schema based the file content. This inferrence does not support list and objects", + "title": "Infer Datatypes (legacy)", + "type": "boolean" + }, } }, { @@ -1999,6 +2011,7 @@ "message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. stream=stream1 file=a.csv line_no=2 n_skipped=0", } ]}) + .set_expected_discover_error(SchemaInferenceError, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value) ).build() csv_escape_char_is_set_scenario = ( diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py index 55d0d40911f9..22bff3f66f13 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py @@ -12,8 +12,8 @@ "a.csv": { # The records in this file do not conform to the schema "contents": [ ("col1", "col2"), - ("val_a_11", "val_a_21"), - ("val_a_12", "val_a_22"), + ("val_a_11", "1"), + ("val_a_12", "2"), ], "last_modified": "2023-06-05T03:54:07.000Z", }, @@ -29,7 +29,7 @@ "contents": [ ("col1",), ("val_c_11",), - ("val_c_12","val_c_22"), # This record doesn't conform to the schema + ("val_c_12","val_c_22"), # This record is not parsable ("val_c_13",), ], "last_modified": "2023-06-05T03:54:07.000Z", @@ -55,9 +55,9 @@ "col1": { "type": "string", }, - # "col2": { # remove this so the record does not conform to the schema - # "type": "string", - # }, + "col2": { + "type": "number", + }, "_ab_source_file_last_modified": { "type": "string" }, @@ -100,7 +100,7 @@ "contents": [ ("col1",), ("val_aa3_11",), - ("val_aa3_12", "val_aa3_22"), # This record does not conform to the schema + ("val_aa3_12", "val_aa3_22"), # This record is not parsable ("val_aa3_13",), ], "last_modified": "2023-06-05T03:54:07.000Z", @@ -219,7 +219,7 @@ {"data": {"col1": "val_b_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, {"data": {"col1": "val_c_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_c_12", None: "val_c_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted - {"data": {"col1": "val_c_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, + # {"data": {"col1": "val_c_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # Skipped since previous record is malformed {"data": {"col1": "val_d_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "d.csv"}, "stream": "stream1"}, ] ) @@ -230,8 +230,8 @@ "message": "Records in file did not pass validation 
policy. stream=stream1 file=a.csv n_skipped=2 validation_policy=skip_record", }, { - "level": "WARN", - "message": "Records in file did not pass validation policy. stream=stream1 file=c.csv n_skipped=1 validation_policy=skip_record", + "level": "ERROR", + "message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. stream=stream1 file=c.csv line_no=2 n_skipped=0", }, ] }) @@ -268,7 +268,7 @@ {"data": {"col1": "val_aa2_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, {"data": {"col1": "val_aa3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted - {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, + # {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # Skipped since previous record is malformed {"data": {"col1": "val_aa4_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a4.csv"}, "stream": "stream1"}, {"data": {"col1": "val_bb1_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, # This record is skipped because it does not conform {"data": {"col1": "val_bb1_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, # This record is skipped because it does not conform @@ -285,8 +285,8 @@ "message": "Records in file did not pass validation policy. stream=stream1 file=a/a1.csv n_skipped=2 validation_policy=skip_record", }, { - "level": "WARN", - "message": "Records in file did not pass validation policy. stream=stream1 file=a/a3.csv n_skipped=1 validation_policy=skip_record", + "level": "ERROR", + "message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. 
stream=stream1 file=a/a3.csv line_no=2 n_skipped=0", }, { "level": "WARN", @@ -314,14 +314,14 @@ ) .set_expected_records( [ - {"data": {"col1": "val_a_11", "col2": "val_a_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_a_12", "col2": "val_a_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_a_11", "col2": "1", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_a_12", "col2": "2", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, {"data": {"col1": "val_b_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, {"data": {"col1": "val_b_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, {"data": {"col1": "val_c_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_c_12", None: "val_c_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted # {"data": {"col1": "val_c_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # No more records from this stream are emitted after we hit a parse error - # {"data": {"col1": "val_d_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "d.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_d_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "d.csv"}, "stream": "stream1"}, ] ) .set_expected_logs({ @@ -366,7 +366,7 @@ {"data": {"col1": "val_aa3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted # {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # No more records from this stream are emitted after we hit a parse error - # {"data": {"col1": "val_aa4_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a4.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa4_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a4.csv"}, "stream": "stream1"}, {"data": {"col1": "val_bb1_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, {"data": {"col1": "val_bb1_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, {"data": {"col1": "val_bb2_11", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, From 0617c823565f6e5924301aca049c5c766b0c8a51 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Mon, 7 Aug 2023 
16:40:38 -0400 Subject: [PATCH 06/21] [ISSUE #28893] using discovered schema as part of read --- ...efault_file_based_availability_strategy.py | 4 +- .../file_based/file_types/avro_parser.py | 9 +- .../file_based/file_types/csv_parser.py | 17 +- .../file_based/file_types/file_type_parser.py | 6 +- .../file_based/file_types/jsonl_parser.py | 7 +- .../file_based/file_types/parquet_parser.py | 7 +- .../stream/default_file_based_stream.py | 2 +- .../file_based/file_types/test_csv_parser.py | 5 +- .../file_based/scenarios/csv_scenarios.py | 1 - .../scenarios/validation_policy_scenarios.py | 173 +++++++++++------- 10 files changed, 147 insertions(+), 84 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py index 52563ca1e46e..f982e13c158a 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py @@ -16,6 +16,8 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy): + _WITHOUT_SCHEMA = None + def __init__(self, stream_reader: AbstractFileBasedStreamReader): self.stream_reader = stream_reader @@ -82,7 +84,7 @@ def _check_parse_record(self, stream: AbstractFileBasedStream, file: RemoteFile, parser = stream.get_parser(stream.config.file_type) try: - record = next(iter(parser.parse_records(stream.config, file, self.stream_reader, logger))) + record = next(iter(parser.parse_records(stream.config, file, self.stream_reader, logger, self._WITHOUT_SCHEMA))) except StopIteration: # The file is empty. 
We've verified that we can open it, so will # consider the connection check successful even though it means diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py index 91e1cf4eb08b..bca22b191040 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py @@ -4,13 +4,13 @@ import logging import uuid -from typing import Any, Dict, Iterable, Mapping +from typing import Any, Dict, Iterable, Mapping, Optional import fastavro from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode -from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser, Schema from airbyte_cdk.sources.file_based.remote_file import RemoteFile AVRO_TYPE_TO_JSON_TYPE = { @@ -45,7 +45,7 @@ async def infer_schema( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Dict[str, Any]: + ) -> Schema: avro_format = config.format[config.file_type] if config.format else AvroFormat() if not isinstance(avro_format, AvroFormat): raise ValueError(f"Expected ParquetFormat, got {avro_format}") @@ -60,7 +60,7 @@ async def infer_schema( field["name"]: AvroParser._convert_avro_type_to_json(avro_format, field["name"], field["type"]) for field in avro_schema["fields"] } - return json_schema + return json_schema # type: ignore @classmethod def _convert_avro_type_to_json(cls, avro_format: AvroFormat, field_name: str, avro_field: str) -> Mapping[str, Any]: @@ -130,6 +130,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, + discovered_schema: Optional[Schema], ) -> Iterable[Dict[str, Any]]: avro_format = config.format[config.file_type] if config.format else AvroFormat() if not isinstance(avro_format, AvroFormat): diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 953c87fc70a1..d12f5e0b761e 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -15,7 +15,7 @@ from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode -from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser, Schema from airbyte_cdk.sources.file_based.remote_file import RemoteFile from airbyte_cdk.sources.file_based.schema_helpers import TYPE_PYTHON_MAPPING @@ -119,10 +119,10 @@ async def infer_schema( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Dict[str, Any]: + ) -> Schema: if config.input_schema: # FIXME change type of method to Mapping - return config.input_schema + return config.input_schema # 
type: ignore # conversion to mapping is handled by pydantic and we shouldn't have a str here # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual # sources will likely require one. Rather than modify the interface now we can wait until the real use case @@ -158,10 +158,10 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, + discovered_schema: Optional[Schema], ) -> Iterable[Dict[str, Any]]: - schema: Mapping[str, Any] = config.input_schema # type: ignore config_format = _extract_config_format(config) - cast_fn = CsvParser._get_cast_function(schema, config_format, logger) + cast_fn = CsvParser._get_cast_function(discovered_schema, config_format, logger) data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode) for row in data_generator: yield CsvParser._to_nullable(cast_fn(row), config_format.null_values) @@ -206,6 +206,13 @@ def _cast_types( prop_type = property_types.get(key) cast_value: Any = value + if isinstance(prop_type, list): + prop_type_distinct = set(prop_type) + prop_type_distinct.remove("null") + if len(prop_type_distinct) != 1: + raise ValueError(f"Could not get non nullable type from {prop_type}") + (prop_type,) = prop_type_distinct + if prop_type in TYPE_PYTHON_MAPPING and prop_type is not None: _, python_type = TYPE_PYTHON_MAPPING[prop_type] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py index 41abc8de37a8..26547cf3d63b 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py @@ -4,13 +4,14 @@ import logging from abc import ABC, abstractmethod -from typing import Any, Dict, Iterable +from typing import Any, Dict, Iterable, Mapping, Optional from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode from airbyte_cdk.sources.file_based.remote_file import RemoteFile -Schema = Dict[str, str] +# Schema example: {"column1": {"type": "string"}} +Schema = Mapping[str, Dict[str, Any]] Record = Dict[str, Any] @@ -40,6 +41,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, + discovered_schema: Optional[Schema], ) -> Iterable[Record]: """ Parse and emit each record. 
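[Editor's illustration, not part of the patch series: the hunks above change parse_records to accept the schema produced during discovery and extend _cast_types so a nullable union type such as ["null", "integer"] is reduced to its single non-null type before casting. The following is a rough standalone sketch of that casting idea under those assumptions; it is not the CDK's actual implementation, and the helper names cast_row and _non_null_type are invented for this sketch.]

# Standalone sketch only -- helper names are illustrative, not from the CDK.
from typing import Any, Dict, Mapping

# Minimal subset of the JSON-type -> Python-type mapping; booleans, arrays and
# objects are left out here because the real parser handles them separately
# (e.g. with the configured true/false values and json.loads).
_PYTHON_TYPES = {"string": str, "integer": int, "number": float}


def _non_null_type(prop_type: Any) -> Any:
    # Reduce a nullable union such as ["null", "integer"] to its single non-null type,
    # mirroring the new branch added to _cast_types above.
    if isinstance(prop_type, list):
        distinct = set(prop_type) - {"null"}
        if len(distinct) != 1:
            raise ValueError(f"Could not get non nullable type from {prop_type}")
        (prop_type,) = distinct
    return prop_type


def cast_row(row: Dict[str, str], discovered_schema: Mapping[str, Mapping[str, Any]]) -> Dict[str, Any]:
    # Cast each CSV string value using the discovered schema; columns that are not
    # in the schema (or whose type is unknown) are passed through unchanged.
    result: Dict[str, Any] = {}
    for key, value in row.items():
        prop = discovered_schema.get(key)
        python_type = _PYTHON_TYPES.get(_non_null_type(prop["type"])) if prop else None
        result[key] = python_type(value) if python_type else value
    return result


if __name__ == "__main__":
    schema = {"col1": {"type": "string"}, "col2": {"type": ["null", "integer"]}}
    print(cast_row({"col1": "val_a_11", "col2": "2"}, schema))  # {'col1': 'val_a_11', 'col2': 2}

[End of editor's illustration; the patch continues below.]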
diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py index d256b72ee4e3..be595f4acc6c 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py @@ -4,11 +4,11 @@ import json import logging -from typing import Any, Dict, Iterable +from typing import Any, Dict, Iterable, Optional from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode -from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser, Schema from airbyte_cdk.sources.file_based.remote_file import RemoteFile from airbyte_cdk.sources.file_based.schema_helpers import PYTHON_TYPE_MAPPING, merge_schemas @@ -23,7 +23,7 @@ async def infer_schema( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Dict[str, Any]: + ) -> Schema: """ Infers the schema for the file by inferring the schema for each line, and merging it with the previously-inferred schema. @@ -52,6 +52,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, + discovered_schema: Optional[Schema], ) -> Iterable[Dict[str, Any]]: with stream_reader.open_file(file, self.file_read_mode, logger) as fp: for line in fp: diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py index 09d66a1f25b6..16cf63641744 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py @@ -5,14 +5,14 @@ import json import logging import os -from typing import Any, Dict, Iterable, List, Mapping +from typing import Any, Dict, Iterable, List, Mapping, Optional from urllib.parse import unquote import pyarrow as pa import pyarrow.parquet as pq from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ParquetFormat from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode -from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser, Schema from airbyte_cdk.sources.file_based.remote_file import RemoteFile from pyarrow import Scalar @@ -24,7 +24,7 @@ async def infer_schema( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Dict[str, Any]: + ) -> Schema: parquet_format = config.format[config.file_type] if config.format else ParquetFormat() if not isinstance(parquet_format, ParquetFormat): raise ValueError(f"Expected ParquetFormat, got {parquet_format}") @@ -47,6 +47,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, + discovered_schema: Optional[Schema], ) -> Iterable[Dict[str, Any]]: parquet_format = config.format[config.file_type] if config.format else ParquetFormat() if not isinstance(parquet_format, ParquetFormat): diff --git 
a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py index 76093016e2d5..e08ec7fdf45a 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py @@ -84,7 +84,7 @@ def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[Mapping n_skipped = line_no = 0 try: - for record in parser.parse_records(self.config, file, self._stream_reader, self.logger): + for record in parser.parse_records(self.config, file, self._stream_reader, self.logger, schema): line_no += 1 if self.config.schemaless: record = {"data": record} diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index 5547f1254038..24f287054c64 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -28,6 +28,7 @@ "col7": "array", "col8": "array", "col9": "array", + "col10": ["null", "string"], } logger = logging.getLogger() @@ -47,6 +48,7 @@ "col7": '[1, 2]', "col8": '["1", "2"]', "col9": '[{"a": "b"}, {"a": "c"}]', + "col10": 'asdf', }, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, @@ -60,6 +62,7 @@ "col7": [1, 2], "col8": ["1", "2"], "col9": [{"a": "b"}, {"a": "c"}], + "col10": 'asdf', }, id="cast-all-cols"), pytest.param({"col1": "1"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col1": "1"}, id="cannot-cast-to-null"), pytest.param({"col2": "1"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col2": True}, id="cast-1-to-bool"), @@ -75,7 +78,7 @@ pytest.param({"col7": "['a', 'b']"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col7": "['a', 'b']"}, id="cannot-cast-to-list-of-ints"), pytest.param({"col8": "['a', 'b']"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col8": "['a', 'b']"}, id="cannot-cast-to-list-of-strings"), pytest.param({"col9": "['a', 'b']"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col9": "['a', 'b']"}, id="cannot-cast-to-list-of-objects"), - pytest.param({"col10": "x"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col10": "x"}, id="item-not-in-props-doesn't-error"), + pytest.param({"col11": "x"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col11": "x"}, id="item-not-in-props-doesn't-error"), ] ) def test_cast_to_python_type(row: Dict[str, str], true_values: Set[str], false_values: Set[str], expected_output: Dict[str, Any]) -> None: diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py index b928d7445c37..1517abdf6885 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -594,7 +594,6 @@ }, ] ) - .set_expected_logs({"discover": [{"level": "WARN", "message": "Refusing to infer schema for all 2 files; using 1 files."}]}) .set_discovery_policy(LowInferenceLimitDiscoveryPolicy()) ).build() diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py index 22bff3f66f13..c036140d9715 100644 --- 
a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py @@ -19,9 +19,9 @@ }, "b.csv": { "contents": [ - ("col1",), - ("val_b_11",), - ("val_b_12",), + ("col1", "col2"), + ("val_b_11", "this is text that will trigger validation policy"), + ("val_b_12", "2"), ], "last_modified": "2023-06-05T03:54:07.000Z", }, @@ -29,7 +29,7 @@ "contents": [ ("col1",), ("val_c_11",), - ("val_c_12","val_c_22"), # This record is not parsable + ("val_c_12", "val_c_22"), # This record is not parsable ("val_c_13",), ], "last_modified": "2023-06-05T03:54:07.000Z", @@ -56,7 +56,7 @@ "type": "string", }, "col2": { - "type": "number", + "type": "integer", }, "_ab_source_file_last_modified": { "type": "string" @@ -80,19 +80,19 @@ TestScenarioBuilder() .set_files( { - "a/a1.csv": { # The records in this file do not conform to the schema + "a/a1.csv": { "contents": [ ("col1", "col2"), - ("val_aa1_11", "val_aa1_21"), - ("val_aa1_12", "val_aa1_22"), + ("val_aa1_11", "1"), + ("val_aa1_12", "2"), ], "last_modified": "2023-06-05T03:54:07.000Z", }, "a/a2.csv": { "contents": [ - ("col1",), - ("val_aa2_11",), - ("val_aa2_12",), + ("col1", "col2"), + ("val_aa2_11", "this is text that will trigger validation policy"), + ("val_aa2_12", "2"), ], "last_modified": "2023-06-05T03:54:07.000Z", }, @@ -115,17 +115,17 @@ "b/b1.csv": { # The records in this file do not conform to the schema "contents": [ - ("col1",), - ("val_bb1_11",), - ("val_bb1_12",), + ("col1", "col2"), + ("val_bb1_11", "1"), + ("val_bb1_12", "2"), ], "last_modified": "2023-06-05T03:54:07.000Z", }, "b/b2.csv": { "contents": [ ("col1", "col2"), - ("val_bb2_11", "val_bb2_21"), - ("val_bb2_12", "val_bb2_22"), + ("val_bb2_11", "this is text that will trigger validation policy"), + ("val_bb2_12", "2"), ], "last_modified": "2023-06-05T03:54:07.000Z", }, @@ -152,9 +152,9 @@ "col1": { "type": "string", }, - # "col2": { # remove this so the record does not conform to the schema - # "type": "string", - # }, + "col2": { + "type": "integer", + }, "_ab_source_file_last_modified": { "type": "string" }, @@ -175,9 +175,9 @@ "col1": { "type": "string", }, - # "col2": { # remove this so the record does not conform to the schema - # "type": "string", - # }, + "col2": { + "type": "integer", + }, "_ab_source_file_last_modified": { "type": "string" }, @@ -213,10 +213,10 @@ ) .set_expected_records( [ - # {"data": {"col1": "val_a_11", "col2": "val_a_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform - # {"data": {"col1": "val_a_12", "col2": "val_a_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform - {"data": {"col1": "val_b_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_b_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_a_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_a_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + # 
{"data": {"col1": "val_b_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform + {"data": {"col1": "val_b_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, {"data": {"col1": "val_c_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_c_12", None: "val_c_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted # {"data": {"col1": "val_c_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # Skipped since previous record is malformed @@ -227,12 +227,17 @@ "read": [ { "level": "WARN", - "message": "Records in file did not pass validation policy. stream=stream1 file=a.csv n_skipped=2 validation_policy=skip_record", + "message": "Records in file did not pass validation policy. stream=stream1 file=b.csv n_skipped=1 validation_policy=skip_record", }, { "level": "ERROR", "message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. stream=stream1 file=c.csv line_no=2 n_skipped=0", }, + { + # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol + "level": "WARNING", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, ] }) ).build() @@ -262,18 +267,18 @@ ) .set_expected_records( [ - # {"data": {"col1": "val_aa1_11", "col2": "val_aa1_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform - # {"data": {"col1": "val_aa1_12", "col2": "val_aa1_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform - {"data": {"col1": "val_aa2_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_aa2_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa1_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa1_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, + #{"data": {"col1": "val_aa2_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform + {"data": {"col1": "val_aa2_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, {"data": {"col1": "val_aa3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", 
"_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted # {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # Skipped since previous record is malformed {"data": {"col1": "val_aa4_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a4.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_bb1_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, # This record is skipped because it does not conform - {"data": {"col1": "val_bb1_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, # This record is skipped because it does not conform - # {"data": {"col1": "val_bb2_11", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, - # {"data": {"col1": "val_bb2_12", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb1_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb1_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, + # {"data": {"col1": "val_bb2_11", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, # This record is skipped because it does not conform + {"data": {"col1": "val_bb2_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, {"data": {"col1": "val_bb3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, {"data": {"col1": "val_bb3_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, ] @@ -282,7 +287,7 @@ "read": [ { "level": "WARN", - "message": "Records in file did not pass validation policy. stream=stream1 file=a/a1.csv n_skipped=2 validation_policy=skip_record", + "message": "Records in file did not pass validation policy. stream=stream1 file=a/a2.csv n_skipped=1 validation_policy=skip_record", }, { "level": "ERROR", @@ -290,7 +295,17 @@ }, { "level": "WARN", - "message": "Records in file did not pass validation policy. stream=stream2 file=b/b2.csv n_skipped=2 validation_policy=skip_record", + "message": "Records in file did not pass validation policy. 
stream=stream2 file=b/b2.csv n_skipped=1 validation_policy=skip_record", + }, + { + # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol + "level": "WARNING", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, + { + # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol + "level": "WARNING", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, ] }) @@ -314,10 +329,10 @@ ) .set_expected_records( [ - {"data": {"col1": "val_a_11", "col2": "1", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_a_12", "col2": "2", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_b_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_b_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_a_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_a_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_b_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform + {"data": {"col1": "val_b_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, {"data": {"col1": "val_c_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_c_12", None: "val_c_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted # {"data": {"col1": "val_c_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # No more records from this stream are emitted after we hit a parse error @@ -330,6 +345,11 @@ "level": "ERROR", "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=c.csv line_no=2 n_skipped=0", }, + { + # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol + "level": "WARNING", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, ] }) ).build() @@ -359,18 +379,18 @@ ) .set_expected_records( [ - {"data": {"col1": "val_aa1_11", "col2": "val_aa1_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_aa1_12", "col2": "val_aa1_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, - 
{"data": {"col1": "val_aa2_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_aa2_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa1_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa1_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa2_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa2_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, {"data": {"col1": "val_aa3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted - # {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # No more records from this stream are emitted after we hit a parse error + # {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # Skipped since previous record is malformed {"data": {"col1": "val_aa4_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a4.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_bb1_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, - {"data": {"col1": "val_bb1_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, - {"data": {"col1": "val_bb2_11", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, - {"data": {"col1": "val_bb2_12", "col2": "val_bb2_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb1_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb1_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb2_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb2_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, {"data": {"col1": "val_bb3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, {"data": {"col1": 
"val_bb3_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, ] @@ -381,6 +401,16 @@ "level": "ERROR", "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a/a3.csv line_no=2 n_skipped=0", }, + { + # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol + "level": "WARNING", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, + { + # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol + "level": "WARNING", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, ] }) ).build() @@ -401,14 +431,21 @@ ] } ) - .set_expected_records( - [] # No records are expected because the very first file did not conform to the schema - ) + .set_expected_records([ + {"data": {"col1": "val_a_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_a_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + # No records past that because the first record for the second file did not conform to the schema + ]) .set_expected_logs({ "read": [ { "level": "WARN", - "message": "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. stream=stream1 file=a.csv validation_policy=wait_for_discover n_skipped=0", + "message": "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. 
stream=stream1 file=b.csv validation_policy=wait_for_discover n_skipped=0", + }, + { + # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol + "level": "WARNING", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, ] }) @@ -439,18 +476,18 @@ ) .set_expected_records( [ - # {"data": {"col1": "val_aa1_11", "col2": "val_aa1_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, # The first record does not conform so we don't sync anything from this stream - # {"data": {"col1": "val_aa1_12", "col2": "val_aa1_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, - # {"data": {"col1": "val_aa2_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, - # {"data": {"col1": "val_aa2_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa1_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa1_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, + # {"data": {"col1": "val_aa2_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, + # {"data": {"col1": "val_aa2_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_aa3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_aa4_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a4.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_bb1_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, - {"data": {"col1": "val_bb1_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, - # {"data": {"col1": "val_bb2_11", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, # No more records from this stream are emitted after a nonconforming record is encountered - # {"data": {"col1": "val_bb2_12", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb1_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb1_12", "col2": 2, "_ab_source_file_last_modified": 
"2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, + # {"data": {"col1": "val_bb2_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, + # {"data": {"col1": "val_bb2_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, # {"data": {"col1": "val_bb3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, # {"data": {"col1": "val_bb3_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, ] @@ -459,12 +496,22 @@ "read": [ { "level": "WARN", - "message": "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. stream=stream1 file=a/a1.csv validation_policy=wait_for_discover n_skipped=0", + "message": "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. stream=stream1 file=a/a2.csv validation_policy=wait_for_discover n_skipped=0", }, { "level": "WARN", "message": "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. stream=stream2 file=b/b2.csv validation_policy=wait_for_discover n_skipped=0", }, + { + # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol + "level": "WARNING", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, + { + # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol + "level": "WARNING", + "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", + }, ] }) ).build() From d157aa35a478a1788ef3bb434124c5cb0323957b Mon Sep 17 00:00:00 2001 From: maxi297 Date: Tue, 8 Aug 2023 09:38:28 -0400 Subject: [PATCH 07/21] [ISSUE #28893] self-review + cleanup --- .../sources/file_based/config/csv_format.py | 4 ++-- .../file_based/file_types/avro_parser.py | 9 +++++---- .../sources/file_based/file_types/csv_parser.py | 17 ++++++++++------- .../file_based/file_types/file_type_parser.py | 9 ++++----- .../file_based/file_types/jsonl_parser.py | 12 ++++++------ .../file_based/file_types/parquet_parser.py | 7 ++++--- .../sources/file_based/schema_helpers.py | 6 +++--- .../stream/default_file_based_stream.py | 10 +++++----- .../file_based/config/test_csv_format.py | 3 ++- .../file_based/file_types/test_csv_parser.py | 1 - .../scenarios/validation_policy_scenarios.py | 4 ++-- 11 files changed, 43 insertions(+), 39 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py index c3a75e21611e..0312f1f55a0a 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py @@ -86,7 +86,7 @@ class CsvFormat(BaseModel): infer_datatypes_legacy: bool = Field( title="Infer Datatypes (legacy)", default=False, - description="Whether to autogenerate the schema based the file content. 
This inferrence does not support list and objects", + description="Whether to autogenerate the schema based the file content. This inference does not support list and objects.", ) @validator("delimiter") @@ -130,5 +130,5 @@ def validate_option_inference(cls, values: Mapping[str, Any]) -> Mapping[str, An infer_datatypes = values.get("infer_datatypes", False) infer_datatypes_legacy = values.get("infer_datatypes_legacy", False) if infer_datatypes and infer_datatypes_legacy: - raise ValueError("Only one way to inference can be configured at the same time") + raise ValueError("Only one way to infer can be configured but both infer_datatypes and infer_datatypes_legacy are enabled.") return values diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py index bca22b191040..333ab5c04169 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py @@ -10,8 +10,9 @@ from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode -from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser, Schema +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_helpers import SchemaType AVRO_TYPE_TO_JSON_TYPE = { "null": "null", @@ -45,7 +46,7 @@ async def infer_schema( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Schema: + ) -> SchemaType: avro_format = config.format[config.file_type] if config.format else AvroFormat() if not isinstance(avro_format, AvroFormat): raise ValueError(f"Expected ParquetFormat, got {avro_format}") @@ -60,7 +61,7 @@ async def infer_schema( field["name"]: AvroParser._convert_avro_type_to_json(avro_format, field["name"], field["type"]) for field in avro_schema["fields"] } - return json_schema # type: ignore + return json_schema @classmethod def _convert_avro_type_to_json(cls, avro_format: AvroFormat, field_name: str, avro_field: str) -> Mapping[str, Any]: @@ -130,7 +131,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[Schema], + discovered_schema: Optional[SchemaType], ) -> Iterable[Dict[str, Any]]: avro_format = config.format[config.file_type] if config.format else AvroFormat() if not isinstance(avro_format, AvroFormat): diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index d12f5e0b761e..20b2aed0347c 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -15,9 +15,9 @@ from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode -from 
airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser, Schema +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser from airbyte_cdk.sources.file_based.remote_file import RemoteFile -from airbyte_cdk.sources.file_based.schema_helpers import TYPE_PYTHON_MAPPING +from airbyte_cdk.sources.file_based.schema_helpers import TYPE_PYTHON_MAPPING, SchemaType DIALECT_NAME = "_config_dialect" @@ -119,7 +119,7 @@ async def infer_schema( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Schema: + ) -> SchemaType: if config.input_schema: # FIXME change type of method to Mapping return config.input_schema # type: ignore # conversion to mapping is handled by pydantic and we shouldn't have a str here @@ -132,8 +132,9 @@ async def infer_schema( config_format.true_values, config_format.false_values, config_format.null_values, - config_format.infer_datatypes and not config_format.infer_datatypes_legacy - ) if config_format.infer_datatypes or config_format.infer_datatypes_legacy + config_format.infer_datatypes and not config_format.infer_datatypes_legacy, + ) + if config_format.infer_datatypes or config_format.infer_datatypes_legacy else _DisabledTypeInferrer() ) data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode) @@ -158,7 +159,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[Schema], + discovered_schema: Optional[SchemaType], ) -> Iterable[Dict[str, Any]]: config_format = _extract_config_format(config) cast_fn = CsvParser._get_cast_function(discovered_schema, config_format, logger) @@ -286,7 +287,9 @@ class _JsonTypeInferrer(_TypeInferrer): _OBJECT_TYPE = "object" _STRING_TYPE = "string" - def __init__(self, boolean_trues: Set[str], boolean_falses: Set[str], null_values: Set[str], allow_for_objects_and_arrays: bool) -> None: + def __init__( + self, boolean_trues: Set[str], boolean_falses: Set[str], null_values: Set[str], allow_for_objects_and_arrays: bool + ) -> None: self._boolean_trues = boolean_trues self._boolean_falses = boolean_falses self._null_values = null_values diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py index 26547cf3d63b..9591612bb912 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py @@ -4,14 +4,13 @@ import logging from abc import ABC, abstractmethod -from typing import Any, Dict, Iterable, Mapping, Optional +from typing import Any, Dict, Iterable, Optional from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_helpers import SchemaType -# Schema example: {"column1": {"type": "string"}} -Schema = Mapping[str, Dict[str, Any]] Record = Dict[str, Any] @@ -28,7 +27,7 @@ async def infer_schema( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Schema: + ) -> SchemaType: """ Infer the JSON Schema for this file. 
""" @@ -41,7 +40,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[Schema], + discovered_schema: Optional[SchemaType], ) -> Iterable[Record]: """ Parse and emit each record. diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py index be595f4acc6c..19067064cc28 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py @@ -4,13 +4,13 @@ import json import logging -from typing import Any, Dict, Iterable, Optional +from typing import Any, Dict, Iterable, Mapping, Optional from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode -from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser, Schema +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser from airbyte_cdk.sources.file_based.remote_file import RemoteFile -from airbyte_cdk.sources.file_based.schema_helpers import PYTHON_TYPE_MAPPING, merge_schemas +from airbyte_cdk.sources.file_based.schema_helpers import PYTHON_TYPE_MAPPING, SchemaType, merge_schemas class JsonlParser(FileTypeParser): @@ -23,12 +23,12 @@ async def infer_schema( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Schema: + ) -> SchemaType: """ Infers the schema for the file by inferring the schema for each line, and merging it with the previously-inferred schema. 
""" - inferred_schema: Dict[str, Any] = {} + inferred_schema: Mapping[str, Any] = {} read_bytes = 0 with stream_reader.open_file(file, self.file_read_mode, logger) as fp: @@ -52,7 +52,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[Schema], + discovered_schema: Optional[SchemaType], ) -> Iterable[Dict[str, Any]]: with stream_reader.open_file(file, self.file_read_mode, logger) as fp: for line in fp: diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py index 16cf63641744..2f19069a696e 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py @@ -12,8 +12,9 @@ import pyarrow.parquet as pq from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ParquetFormat from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode -from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser, Schema +from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser from airbyte_cdk.sources.file_based.remote_file import RemoteFile +from airbyte_cdk.sources.file_based.schema_helpers import SchemaType from pyarrow import Scalar @@ -24,7 +25,7 @@ async def infer_schema( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - ) -> Schema: + ) -> SchemaType: parquet_format = config.format[config.file_type] if config.format else ParquetFormat() if not isinstance(parquet_format, ParquetFormat): raise ValueError(f"Expected ParquetFormat, got {parquet_format}") @@ -47,7 +48,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[Schema], + discovered_schema: Optional[SchemaType], ) -> Iterable[Dict[str, Any]]: parquet_format = config.format[config.file_type] if config.format else ParquetFormat() if not isinstance(parquet_format, ParquetFormat): diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_helpers.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_helpers.py index 597c919e39e6..3f7b2151653f 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_helpers.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/schema_helpers.py @@ -11,7 +11,7 @@ from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, SchemaInferenceError JsonSchemaSupportedType = Union[List[str], Literal["string"], str] -SchemaType = Dict[str, Dict[str, JsonSchemaSupportedType]] +SchemaType = Mapping[str, Mapping[str, JsonSchemaSupportedType]] schemaless_schema = {"type": "object", "properties": {"data": {"type": "object"}}} @@ -99,7 +99,7 @@ def merge_schemas(schema1: SchemaType, schema2: SchemaType) -> SchemaType: if not isinstance(t, dict) or "type" not in t or not _is_valid_type(t["type"]): raise SchemaInferenceError(FileBasedSourceError.UNRECOGNIZED_TYPE, key=k, type=t) - merged_schema: Dict[str, Any] = deepcopy(schema1) + merged_schema: Dict[str, Any] = deepcopy(schema1) # type: ignore # as of 2023-08-08, deepcopy can copy Mapping for k2, t2 in schema2.items(): t1 = merged_schema.get(k2) if t1 is None: @@ -116,7 +116,7 @@ def _is_valid_type(t: JsonSchemaSupportedType) -> 
bool: return t == "array" or get_comparable_type(t) is not None -def _choose_wider_type(key: str, t1: Dict[str, Any], t2: Dict[str, Any]) -> Dict[str, Any]: +def _choose_wider_type(key: str, t1: Mapping[str, Any], t2: Mapping[str, Any]) -> Mapping[str, Any]: if (t1["type"] == "array" or t2["type"] == "array") and t1 != t2: raise SchemaInferenceError( FileBasedSourceError.SCHEMA_INFERENCE_ERROR, diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py index e08ec7fdf45a..8ac3fd0ee44a 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py @@ -6,7 +6,7 @@ import itertools import traceback from functools import cache -from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Union +from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Set, Union from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level from airbyte_cdk.models import Type as MessageType @@ -20,7 +20,7 @@ StopSyncPerValidationPolicy, ) from airbyte_cdk.sources.file_based.remote_file import RemoteFile -from airbyte_cdk.sources.file_based.schema_helpers import merge_schemas, schemaless_schema +from airbyte_cdk.sources.file_based.schema_helpers import SchemaType, merge_schemas, schemaless_schema from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream from airbyte_cdk.sources.file_based.stream.cursor import FileBasedCursor from airbyte_cdk.sources.file_based.types import StreamSlice @@ -227,8 +227,8 @@ async def _infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]: Each file type has a corresponding `infer_schema` handler. Dispatch on file type. """ - base_schema: Dict[str, Any] = {} - pending_tasks: Set[asyncio.tasks.Task[Dict[str, Any]]] = set() + base_schema: SchemaType = {} + pending_tasks: Set[asyncio.tasks.Task[SchemaType]] = set() n_started, n_files = 0, len(files) files_iterator = iter(files) @@ -247,7 +247,7 @@ async def _infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]: return base_schema - async def _infer_file_schema(self, file: RemoteFile) -> Dict[str, Any]: + async def _infer_file_schema(self, file: RemoteFile) -> SchemaType: try: return await self.get_parser(self.config.file_type).infer_schema(self.config, file, self._stream_reader, self.logger) except Exception as exc: diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py index edd39964124c..19133f4c014c 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py @@ -2,8 +2,9 @@ # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
# -import pytest as pytest from typing import Type + +import pytest as pytest from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index 24f287054c64..0cfb837a63a1 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -115,7 +115,6 @@ def test_given_user_schema_defined_when_infer_schema_then_return_user_schema(sel self._config.input_schema = {self._HEADER_NAME: {"type": "potato"}} self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "potato") - def test_given_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None: self._config_format.infer_datatypes = True self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "boolean") diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py index c036140d9715..db17894d4273 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py @@ -269,7 +269,7 @@ [ {"data": {"col1": "val_aa1_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, {"data": {"col1": "val_aa1_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, - #{"data": {"col1": "val_aa2_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform + # {"data": {"col1": "val_aa2_11", "col2": "this is text that will trigger validation policy", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform {"data": {"col1": "val_aa2_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, {"data": {"col1": "val_aa3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted @@ -434,7 +434,7 @@ .set_expected_records([ {"data": {"col1": "val_a_11", "col2": 1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, {"data": {"col1": "val_a_12", "col2": 2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - # No records past that because the first record for the second file did not conform to the schema + # No records past that because the first record for the second file did not conform to the schema ]) .set_expected_logs({ "read": [ From 57b011f4869be34091e29b9f55187da8a50e3c88 Mon Sep 17 00:00:00 2001 From: 
maxi297 Date: Tue, 8 Aug 2023 09:40:19 -0400 Subject: [PATCH 08/21] [ISSUE #28893] fix test --- .../unit_tests/sources/file_based/scenarios/csv_scenarios.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py index 1517abdf6885..6bc3b9bb6e97 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -258,7 +258,7 @@ }, "infer_datatypes_legacy": { "default": False, - "description": "Whether to autogenerate the schema based the file content. This inferrence does not support list and objects", + "description": "Whether to autogenerate the schema based the file content. This inference does not support list and objects.", "title": "Infer Datatypes (legacy)", "type": "boolean" }, From 71cdca9a17cb9268d8ffe1f4e332b65264471ba1 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Wed, 9 Aug 2023 09:19:32 -0400 Subject: [PATCH 09/21] [ISSUE #28893] code review part #1 --- .../sources/file_based/file_types/csv_parser.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 20b2aed0347c..233d63440881 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -55,7 +55,6 @@ def read_data( with stream_reader.open_file(file, file_read_mode, logger) as fp: headers = self._get_headers(fp, config_format, dialect_name) - fp.seek(0) # we assume that if we autogenerate columns, it is because we don't have headers # if a user wants to autogenerate_column_names with a CSV having headers, he can skip rows rows_to_skip = ( @@ -78,24 +77,28 @@ def read_data( csv.unregister_dialect(dialect_name) def _get_headers(self, fp: IOBase, config_format: CsvFormat, dialect_name: str) -> List[str]: + """ + Assumes the fp is pointing to the beginning of the files and will reset it as such + """ # Note that this method assumes the dialect has already been registered if we're parsing the headers self._skip_rows(fp, config_format.skip_rows_before_header) if config_format.autogenerate_column_names: - return self._auto_generate_headers(fp, config_format, dialect_name) + headers = self._auto_generate_headers(fp, dialect_name) else: # Then read the header reader = csv.reader(fp, dialect=dialect_name) # type: ignore - return list(next(reader)) + headers = list(next(reader)) - def _auto_generate_headers(self, fp: IOBase, config_format: CsvFormat, dialect_name: str) -> List[str]: + fp.seek(0) + return headers + + def _auto_generate_headers(self, fp: IOBase, dialect_name: str) -> List[str]: """ Generates field names as [f0, f1, ...] in the same way as pyarrow's csv reader with autogenerate_column_names=True. 
See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html """ reader = csv.reader(fp, dialect=dialect_name) # type: ignore number_of_columns = len(next(reader)) # type: ignore - # Reset the file pointer to the beginning of the file so that the first row is not skipped - fp.seek(0) return [f"f{i}" for i in range(number_of_columns)] @staticmethod @@ -198,7 +201,7 @@ def _cast_types( Array and object types are only handled if they can be deserialized as JSON. - If any errors are encountered, the value will be emitted as a string._to_nullable + If any errors are encountered, the value will be emitted as a string. """ warnings = [] result = {} From f651d03879fe9d711143f16f9dee2457a0c1f539 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Wed, 9 Aug 2023 10:44:13 -0400 Subject: [PATCH 10/21] [ISSUE #28893] code review part #2 --- .../sources/file_based/config/csv_format.py | 23 +++------ .../file_based/file_types/csv_parser.py | 25 ++++++---- .../file_based/config/test_csv_format.py | 17 ------- .../file_based/file_types/test_csv_parser.py | 49 +++++++++---------- 4 files changed, 48 insertions(+), 66 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py index 77d3fc2d7674..0a5261f54383 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py @@ -17,6 +17,12 @@ class QuotingBehavior(Enum): QUOTE_NONE = "Quote None" +class InferenceType(Enum): + NONE = "None" + PRIMITIVE_TYPES_ONLY = "Primitive Types Only" + PRIMITIVE_AND_COMPLEX_TYPES = "Primitive and Complex Types" + + DEFAULT_TRUE_VALUES = ["y", "yes", "t", "true", "on", "1"] DEFAULT_FALSE_VALUES = ["n", "no", "f", "false", "off", "0"] @@ -81,16 +87,11 @@ class Config: default=DEFAULT_FALSE_VALUES, description="A set of case-sensitive strings that should be interpreted as false values.", ) - infer_datatypes: bool = Field( + inference_type: InferenceType = Field( title="Infer Datatypes", - default=False, + default=InferenceType.NONE, description="Whether to autogenerate the schema based the file content.", ) - infer_datatypes_legacy: bool = Field( - title="Infer Datatypes (legacy)", - default=False, - description="Whether to autogenerate the schema based the file content. 
This inference does not support list and objects.", - ) @validator("delimiter") def validate_delimiter(cls, v: str) -> str: @@ -127,11 +128,3 @@ def validate_option_combinations(cls, values: Mapping[str, Any]) -> Mapping[str, if skip_rows_before_header > 0 and auto_generate_column_names: raise ValueError("Cannot skip rows before header and autogenerate column names at the same time.") return values - - @root_validator - def validate_option_inference(cls, values: Mapping[str, Any]) -> Mapping[str, Any]: - infer_datatypes = values.get("infer_datatypes", False) - infer_datatypes_legacy = values.get("infer_datatypes_legacy", False) - if infer_datatypes and infer_datatypes_legacy: - raise ValueError("Only one way to infer can be configured but both infer_datatypes and infer_datatypes_legacy are enabled.") - return values diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index d0d5adef7bef..fedf207c84a4 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -11,7 +11,7 @@ from io import IOBase from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Set -from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, QuotingBehavior +from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, InferenceType, QuotingBehavior from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode @@ -38,7 +38,7 @@ def read_data( logger: logging.Logger, file_read_mode: FileReadMode, ) -> Generator[Dict[str, Any], None, None]: - config_format = config.format or CsvFormat() + config_format = _extract_format(config) # Formats are configured individually per-stream so a unique dialect should be registered for each stream. # We don't unregister the dialect because we are lazily parsing each csv file to generate records @@ -123,21 +123,21 @@ async def infer_schema( stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, ) -> SchemaType: - if config.input_schema: - # FIXME change type of method to Mapping - return config.input_schema # type: ignore # conversion to mapping is handled by pydantic and we shouldn't have a str here + input_schema = config.get_input_schema() + if input_schema: + return input_schema # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual # sources will likely require one. 
Rather than modify the interface now we can wait until the real use case - config_format = config.format or CsvFormat() + config_format = _extract_format(config) type_inferrer_by_field: Dict[str, _TypeInferrer] = defaultdict( lambda: _JsonTypeInferrer( config_format.true_values, config_format.false_values, config_format.null_values, - config_format.infer_datatypes and not config_format.infer_datatypes_legacy, + config_format.inference_type == InferenceType.PRIMITIVE_AND_COMPLEX_TYPES ) - if config_format.infer_datatypes or config_format.infer_datatypes_legacy + if config_format.inference_type != InferenceType.NONE else _DisabledTypeInferrer() ) data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode) @@ -164,7 +164,7 @@ def parse_records( logger: logging.Logger, discovered_schema: Optional[SchemaType], ) -> Iterable[Dict[str, Any]]: - config_format = config.format or CsvFormat() + config_format = _extract_format(config) cast_fn = CsvParser._get_cast_function(discovered_schema, config_format, logger) data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode) for row in data_generator: @@ -411,3 +411,10 @@ def _format_warning(key: str, value: str, expected_type: Optional[Any]) -> str: def _no_cast(row: Mapping[str, str]) -> Mapping[str, str]: return row + + +def _extract_format(config: FileBasedStreamConfig) -> CsvFormat: + config_format = config.format or CsvFormat() + if not isinstance(config_format, CsvFormat): + raise ValueError(f"Invalid format config: {config_format}") + return config_format diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py index 19133f4c014c..de85b9e3ebae 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py @@ -23,20 +23,3 @@ def test_csv_format_skip_rows_and_autogenerate_column_names(skip_rows_before_hea CsvFormat(skip_rows_before_header=skip_rows_before_header, autogenerate_column_names=autogenerate_column_names) else: CsvFormat(skip_rows_before_header=skip_rows_before_header, autogenerate_column_names=autogenerate_column_names) - - -@pytest.mark.parametrize( - "infer_datatypes, infer_datatypes_legacy, expected_error", - [ - pytest.param(True, True, ValueError, id="test_many_inferences_configured"), - pytest.param(True, False, None, id="test_infer_datatypes"), - pytest.param(False, True, None, id="test_infer_datatypes_legacy"), - pytest.param(False, False, None, id="test_no_inference"), - ] -) -def test_csv_format_inference(infer_datatypes: bool, infer_datatypes_legacy: bool, expected_error: Type[BaseException]) -> None: - if expected_error: - with pytest.raises(expected_error): - CsvFormat(infer_datatypes=infer_datatypes, infer_datatypes_legacy=infer_datatypes_legacy) - else: - CsvFormat(infer_datatypes=infer_datatypes, infer_datatypes_legacy=infer_datatypes_legacy) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index 5421c31bd086..c500bb3b264b 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -12,7 +12,7 @@ from unittest.mock import Mock import pytest -from 
airbyte_cdk.sources.file_based.config.csv_format import DEFAULT_FALSE_VALUES, DEFAULT_TRUE_VALUES, CsvFormat +from airbyte_cdk.sources.file_based.config.csv_format import DEFAULT_FALSE_VALUES, DEFAULT_TRUE_VALUES, CsvFormat, InferenceType from airbyte_cdk.sources.file_based.exceptions import RecordParseError from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode from airbyte_cdk.sources.file_based.file_types.csv_parser import CsvParser, _CsvReader @@ -99,10 +99,9 @@ def setUp(self) -> None: self._config_format.true_values = _DEFAULT_TRUE_VALUES self._config_format.false_values = _DEFAULT_FALSE_VALUES self._config_format.null_values = {self._A_NULL_VALUE} - self._config_format.infer_datatypes = False - self._config_format.infer_datatypes_legacy = False + self._config_format.inference_type = InferenceType.NONE self._config = Mock() - self._config.input_schema = None + self._config.get_input_schema.return_value = None self._config.format = self._config_format self._file = Mock(spec=RemoteFile) @@ -112,71 +111,71 @@ def setUp(self) -> None: self._parser = CsvParser(self._csv_reader) def test_given_user_schema_defined_when_infer_schema_then_return_user_schema(self) -> None: - self._config.input_schema = {self._HEADER_NAME: {"type": "potato"}} + self._config.get_input_schema.return_value = {self._HEADER_NAME: {"type": "potato"}} self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "potato") def test_given_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "boolean") - def test_given_legacy_and_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None: - self._config_format.infer_datatypes_legacy = True + def test_given_primitive_only_and_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "boolean") def test_given_integers_only_when_infer_schema_then_type_is_integer(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(["2", "90329", "5645"], "integer") - def test_given_legacy_and_integers_only_when_infer_schema_then_type_is_integer(self) -> None: - self._config_format.infer_datatypes_legacy = True + def test_given_primitive_only_and_integers_only_when_infer_schema_then_type_is_integer(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema(["2", "90329", "5645"], "integer") def test_given_numbers_and_integers_when_infer_schema_then_type_is_number(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(["2", "90329", "2.312"], "number") - def test_given_legacy_and_numbers_and_integers_when_infer_schema_then_type_is_number(self) -> None: - self._config_format.infer_datatypes_legacy = True + def test_given_primitive_only_and_numbers_and_integers_when_infer_schema_then_type_is_number(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema(["2", "90329", "2.312"], "number") def 
test_given_arrays_only_when_infer_schema_then_type_is_array(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], "array") def test_given_arrays_only_when_infer_schema_then_type_is_string(self) -> None: - self._config_format.infer_datatypes_legacy = True + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], "string") def test_given_objects_only_when_infer_schema_then_type_is_object(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "object") - def test_given_legacy_and_objects_only_when_infer_schema_then_type_is_string(self) -> None: - self._config_format.infer_datatypes_legacy = True + def test_given_primitive_only_and_objects_only_when_infer_schema_then_type_is_string(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "string") def test_given_arrays_and_objects_only_when_infer_schema_then_type_is_object(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(['["first_item", "second_item"]', '{"an_object_key": "an_object_value"}'], "object") def test_given_strings_and_objects_only_when_infer_schema_then_type_is_object(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(['["first_item", "second_item"]', "this is a string"], "object") def test_given_strings_only_when_infer_schema_then_type_is_string(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(["a string", "another string"], "string") def test_given_a_null_value_when_infer_then_ignore_null(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(["2", "90329", "5645", self._A_NULL_VALUE], "integer") def test_given_only_null_values_when_infer_then_type_is_string(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema([self._A_NULL_VALUE, self._A_NULL_VALUE, self._A_NULL_VALUE], "string") def test_given_big_file_when_infer_schema_then_stop_early(self) -> None: - self._config_format.infer_datatypes = True + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._csv_reader.read_data.return_value = ({self._HEADER_NAME: row} for row in ["2." 
+ "2" * 1_000_000] + ["this is a string"]) inferred_schema = self._infer_schema() # since the type is number, we know the string at the end was not considered From a573a898095a2f123a3c39e715b9b3936ea1ba69 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Wed, 9 Aug 2023 10:47:27 -0400 Subject: [PATCH 11/21] Fix test --- .../sources/file_based/config/csv_format.py | 4 ++-- .../file_based/scenarios/csv_scenarios.py | 20 +++++++++---------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py index 0a5261f54383..bc9afb4bc721 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py @@ -88,9 +88,9 @@ class Config: description="A set of case-sensitive strings that should be interpreted as false values.", ) inference_type: InferenceType = Field( - title="Infer Datatypes", + title="Inference Type", default=InferenceType.NONE, - description="Whether to autogenerate the schema based the file content.", + description="How to infer the types of the columns.", ) @validator("delimiter") diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py index 0429a38e10fd..45543775c627 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -133,17 +133,15 @@ ], "type": "string" }, - "infer_datatypes": { - "default": False, - "description": "Whether to autogenerate the schema based the file content.", - "title": "Infer Datatypes", - "type": "boolean" - }, - "infer_datatypes_legacy": { - "default": False, - "description": "Whether to autogenerate the schema based the file content. 
This inference does not support list and objects.", - "title": "Infer Datatypes (legacy)", - "type": "boolean" + "inference_type": { + "default": "None", + "description": "How to infer the types of the columns.", + "title": "Inference Type", + "enum": [ + "None", + "Primitive Types Only", + "Primitive and Complex Types", + ] }, "delimiter": { "title": "Delimiter", From 0ce37e58ab7de607ff3d4de0ddb6d19e0a7703f8 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Wed, 9 Aug 2023 10:48:20 -0400 Subject: [PATCH 12/21] formatcdk --- .../airbyte_cdk/sources/file_based/file_types/csv_parser.py | 2 +- .../unit_tests/sources/file_based/config/test_csv_format.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index fedf207c84a4..9d6ce17420d3 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -135,7 +135,7 @@ async def infer_schema( config_format.true_values, config_format.false_values, config_format.null_values, - config_format.inference_type == InferenceType.PRIMITIVE_AND_COMPLEX_TYPES + config_format.inference_type == InferenceType.PRIMITIVE_AND_COMPLEX_TYPES, ) if config_format.inference_type != InferenceType.NONE else _DisabledTypeInferrer() diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py index de85b9e3ebae..e04c42b6858c 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/config/test_csv_format.py @@ -2,8 +2,6 @@ # Copyright (c) 2023 Airbyte, Inc., all rights reserved. # -from typing import Type - import pytest as pytest from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat From 82db6c3f435e67c04d294de69177078df4a69cf2 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Wed, 9 Aug 2023 15:04:52 -0400 Subject: [PATCH 13/21] [ISSUE #28893] code review --- .../airbyte_cdk/sources/file_based/config/csv_format.py | 2 +- .../airbyte_cdk/sources/file_based/file_types/avro_parser.py | 2 +- .../airbyte_cdk/sources/file_based/file_types/csv_parser.py | 4 ++-- .../sources/file_based/file_types/file_type_parser.py | 4 ++-- .../airbyte_cdk/sources/file_based/file_types/jsonl_parser.py | 2 +- .../sources/file_based/file_types/parquet_parser.py | 2 +- .../unit_tests/sources/file_based/scenarios/csv_scenarios.py | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py index bc9afb4bc721..c7d016288da5 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py @@ -90,7 +90,7 @@ class Config: inference_type: InferenceType = Field( title="Inference Type", default=InferenceType.NONE, - description="How to infer the types of the columns.", + description="How to infer the types of the columns. 
If none, inference default to strings.", ) @validator("delimiter") diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py index d9a351ce8fdf..ff168355c54d 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/avro_parser.py @@ -131,7 +131,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[SchemaType], + discovered_schema: Optional[Mapping[str, SchemaType]], ) -> Iterable[Dict[str, Any]]: avro_format = config.format or AvroFormat() if not isinstance(avro_format, AvroFormat): diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 9d6ce17420d3..42f2a911767c 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -162,7 +162,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[SchemaType], + discovered_schema: Optional[Mapping[str, SchemaType]], ) -> Iterable[Dict[str, Any]]: config_format = _extract_format(config) cast_fn = CsvParser._get_cast_function(discovered_schema, config_format, logger) @@ -177,7 +177,7 @@ def file_read_mode(self) -> FileReadMode: @staticmethod def _get_cast_function( - schema: Optional[Mapping[str, Any]], config_format: CsvFormat, logger: logging.Logger + schema: Optional[Mapping[str, SchemaType]], config_format: CsvFormat, logger: logging.Logger ) -> Callable[[Mapping[str, str]], Mapping[str, str]]: # Only cast values if the schema is provided if schema: diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py index 9591612bb912..0e52fb5e04df 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py @@ -4,7 +4,7 @@ import logging from abc import ABC, abstractmethod -from typing import Any, Dict, Iterable, Optional +from typing import Any, Dict, Iterable, Mapping, Optional from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode @@ -40,7 +40,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[SchemaType], + discovered_schema: Optional[Mapping[str, SchemaType]], ) -> Iterable[Record]: """ Parse and emit each record. 
diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py index 19067064cc28..609e071a3868 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py @@ -52,7 +52,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[SchemaType], + discovered_schema: Optional[Mapping[str, SchemaType]], ) -> Iterable[Dict[str, Any]]: with stream_reader.open_file(file, self.file_read_mode, logger) as fp: for line in fp: diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py index 465758551786..fa771e446d9b 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py @@ -48,7 +48,7 @@ def parse_records( file: RemoteFile, stream_reader: AbstractFileBasedStreamReader, logger: logging.Logger, - discovered_schema: Optional[SchemaType], + discovered_schema: Optional[Mapping[str, SchemaType]], ) -> Iterable[Dict[str, Any]]: parquet_format = config.format or ParquetFormat() if not isinstance(parquet_format, ParquetFormat): diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py index 45543775c627..0991e083b596 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -135,7 +135,7 @@ }, "inference_type": { "default": "None", - "description": "How to infer the types of the columns.", + "description": "How to infer the types of the columns. If none, inference default to strings.", "title": "Inference Type", "enum": [ "None", From 3027a4fcfe3adc3be40eb649f77e03c1ab528704 Mon Sep 17 00:00:00 2001 From: maxi297 Date: Wed, 9 Aug 2023 16:19:22 -0400 Subject: [PATCH 14/21] FIX test log level --- .../scenarios/validation_policy_scenarios.py | 27 +++++-------- .../sources/file_based/test_scenarios.py | 38 ++++++++++--------- 2 files changed, 29 insertions(+), 36 deletions(-) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py index 2db77996ed79..3efb1f81d8b2 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py @@ -234,8 +234,7 @@ "message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. stream=stream1 file=c.csv line_no=2 n_skipped=0", }, { - # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol - "level": "WARNING", + "level": "WARN", "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, ] @@ -298,13 +297,11 @@ "message": "Records in file did not pass validation policy. 
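To illustrate the schema mappings being threaded through these parser signatures, here is a minimal, hypothetical sketch of how two per-file inference results could be combined with merge_schemas from schema_helpers. The column names and types below are invented for illustration only, and the expected output assumes the widening rules shown earlier (integer widens to number); this is not part of the patch.

from airbyte_cdk.sources.file_based.schema_helpers import merge_schemas

# Hypothetical per-file inference results. The shape follows SchemaType:
# a mapping of column name -> {"type": <json schema type>}.
schema_from_first_file = {"col1": {"type": "integer"}, "col2": {"type": "string"}}
schema_from_second_file = {"col1": {"type": "number"}, "col3": {"type": "boolean"}}

# merge_schemas unions the keys and, when the same key appears with two
# different primitive types, keeps the wider one via _choose_wider_type.
merged = merge_schemas(schema_from_first_file, schema_from_second_file)
# Expected result, assuming integer widens to number:
# {"col1": {"type": "number"}, "col2": {"type": "string"}, "col3": {"type": "boolean"}}

This mirrors how DefaultFileBasedStream._infer_schema appears to fold each file's inferred schema into base_schema when discovering a stream's schema across multiple files.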
stream=stream2 file=b/b2.csv n_skipped=1 validation_policy=skip_record", }, { - # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol - "level": "WARNING", + "level": "WARN", "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, { - # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol - "level": "WARNING", + "level": "WARN", "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, ] @@ -346,8 +343,7 @@ "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=c.csv line_no=2 n_skipped=0", }, { - # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol - "level": "WARNING", + "level": "WARN", "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, ] @@ -402,13 +398,11 @@ "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a/a3.csv line_no=2 n_skipped=0", }, { - # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol - "level": "WARNING", + "level": "WARN", "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, { - # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol - "level": "WARNING", + "level": "WARN", "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, ] @@ -443,8 +437,7 @@ "message": "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. stream=stream1 file=b.csv validation_policy=Wait for Discover n_skipped=0", }, { - # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol - "level": "WARNING", + "level": "WARN", "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, ] @@ -503,13 +496,11 @@ "message": "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. 
stream=stream2 file=b/b2.csv validation_policy=Wait for Discover n_skipped=0", }, { - # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol - "level": "WARNING", + "level": "WARN", "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, { - # FIXME using logging.logger return level WARNING and not WARN which does not align with the airbyte protocol - "level": "WARNING", + "level": "WARN", "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer", }, ] diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py index 24fdfaff5cad..9a82efed038a 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py @@ -11,6 +11,7 @@ from _pytest.capture import CaptureFixture from _pytest.reports import ExceptionInfo from airbyte_cdk.entrypoint import launch +from airbyte_cdk.logger import AirbyteLogFormatter from airbyte_cdk.models import SyncMode from freezegun import freeze_time from pytest import LogCaptureFixture @@ -217,7 +218,7 @@ def test_discover(capsys: CaptureFixture[str], tmp_path: PosixPath, scenario: Te _verify_expected_logs(logs, discover_logs) -read_scenarios = discover_scenarios + [ +read_scenarios = [ emit_record_scenario_multi_stream, emit_record_scenario_single_stream, skip_record_scenario_multi_stream, @@ -230,6 +231,7 @@ def test_discover(capsys: CaptureFixture[str], tmp_path: PosixPath, scenario: Te @pytest.mark.parametrize("scenario", read_scenarios, ids=[s.name for s in read_scenarios]) @freeze_time("2023-06-09T00:00:00Z") def test_read(capsys: CaptureFixture[str], caplog: LogCaptureFixture, tmp_path: PosixPath, scenario: TestScenario) -> None: + caplog.handler.setFormatter(AirbyteLogFormatter()) if scenario.incremental_scenario_config: run_test_read_incremental(capsys, caplog, tmp_path, scenario) else: @@ -369,23 +371,23 @@ def discover(capsys: CaptureFixture[str], tmp_path: PosixPath, scenario: TestSce def read(capsys: CaptureFixture[str], caplog: LogCaptureFixture, tmp_path: PosixPath, scenario: TestScenario) -> Dict[str, Any]: - launch( - scenario.source, - [ - "read", - "--config", - make_file(tmp_path / "config.json", scenario.config), - "--catalog", - make_file(tmp_path / "catalog.json", scenario.configured_catalog(SyncMode.full_refresh)), - ], - ) - captured = capsys.readouterr().out.splitlines() - logs = caplog.records - return { - "records": [msg for msg in (json.loads(line) for line in captured) if msg["type"] == "RECORD"], - "logs": [msg["log"] for msg in (json.loads(line) for line in captured) if msg["type"] == "LOG"] - + [{"level": log.levelname, "message": log.message} for log in logs], - } + with caplog.handler.stream as logger_stream: + launch( + scenario.source, + [ + "read", + "--config", + make_file(tmp_path / "config.json", scenario.config), + "--catalog", + make_file(tmp_path / "catalog.json", scenario.configured_catalog(SyncMode.full_refresh)), + ], + ) + captured = capsys.readouterr().out.splitlines() + logger_stream.getvalue().split("\n")[:-1] + + return { + "records": [msg for msg in (json.loads(line) for line in captured) if msg["type"] == "RECORD"], + "logs": [msg["log"] for msg in (json.loads(line) for line in captured) if msg["type"] == 
"LOG"], + } def read_with_state( From ac91730a1a48747c97343f8cc24e80cf67b05dec Mon Sep 17 00:00:00 2001 From: maxi297 Date: Wed, 9 Aug 2023 16:29:23 -0400 Subject: [PATCH 15/21] Re-adding failing tests --- .../scenarios/user_input_schema_scenarios.py | 12 ++++++------ .../unit_tests/sources/file_based/test_scenarios.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py index a84f5f9acef1..33c8587a04d2 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py @@ -218,11 +218,11 @@ 'message': 'Records in file did not pass validation policy. stream=stream1 file=a.csv n_skipped=2 validation_policy=skip_record', }, { - 'level': 'WARNING', + 'level': "WARN", 'message': 'Could not cast the value to the expected type.: col1: value=val11,expected_type=integer', }, { - 'level': 'WARNING', + 'level': "WARN", 'message': 'Could not cast the value to the expected type.: col1: value=val21,expected_type=integer', }, ] @@ -553,11 +553,11 @@ .set_expected_logs({ "read": [ { - 'level': 'WARNING', + 'level': "WARN", 'message': 'Could not cast the value to the expected type.: col2: value=val12b,expected_type=integer', }, { - 'level': 'WARNING', + 'level': "WARN", 'message': 'Could not cast the value to the expected type.: col2: value=val22b,expected_type=integer', }, ] @@ -696,11 +696,11 @@ 'message': 'Records in file did not pass validation policy. stream=stream2 file=b.csv n_skipped=2 validation_policy=skip_record', }, { - 'level': 'WARNING', + 'level': "WARN", 'message': 'Could not cast the value to the expected type.: col2: value=val12b,expected_type=integer', }, { - 'level': 'WARNING', + 'level': "WARN", 'message': 'Could not cast the value to the expected type.: col2: value=val22b,expected_type=integer', }, ] diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py index 9a82efed038a..d4947b2cc020 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py @@ -218,7 +218,7 @@ def test_discover(capsys: CaptureFixture[str], tmp_path: PosixPath, scenario: Te _verify_expected_logs(logs, discover_logs) -read_scenarios = [ +read_scenarios = discover_scenarios + [ emit_record_scenario_multi_stream, emit_record_scenario_single_stream, skip_record_scenario_multi_stream, From f1a60bab04e24f54a4d5724655d048d379f60e3c Mon Sep 17 00:00:00 2001 From: maxi297 Date: Thu, 10 Aug 2023 14:16:44 -0400 Subject: [PATCH 16/21] [ISSUE #28893] improve inferrence to consider multiple types per value --- .../file_based/file_types/csv_parser.py | 51 +++++++++++-------- .../file_based/file_types/test_csv_parser.py | 8 ++- .../scenarios/validation_policy_scenarios.py | 4 +- 3 files changed, 38 insertions(+), 25 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index 42f2a911767c..7129cd77c743 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -303,41 +303,50 @@ def 
add_value(self, value: Any) -> None: self._values.add(value) def infer(self) -> str: - types = {self._infer_type(value) for value in self._values} - if types == {self._NULL_TYPE}: + types_by_value = {value: self._infer_type(value) for value in self._values} + types_excluding_null_values = [types for types in types_by_value.values() if self._NULL_TYPE not in types] + if not types_excluding_null_values: # this is highly unusual but we will consider the column as a string return self._STRING_TYPE - types.discard(self._NULL_TYPE) - if types == {self._BOOLEAN_TYPE}: + types = set.intersection(*types_excluding_null_values) + if self._BOOLEAN_TYPE in types: return self._BOOLEAN_TYPE - elif types == {self._INTEGER_TYPE}: + elif self._INTEGER_TYPE in types: return self._INTEGER_TYPE - elif types == {self._NUMBER_TYPE} or types == {self._INTEGER_TYPE, self._NUMBER_TYPE}: + elif self._NUMBER_TYPE in types: return self._NUMBER_TYPE elif not self._allow_for_objects_and_arrays: return self._STRING_TYPE - elif types == {self._ARRAY_TYPE}: + elif self._ARRAY_TYPE in types: return self._ARRAY_TYPE - elif self._ARRAY_TYPE in types or self._OBJECT_TYPE in types: + elif any(self._OBJECT_TYPE in _type for _type in types_by_value.values()): return self._OBJECT_TYPE return self._STRING_TYPE - def _infer_type(self, value: str) -> str: + def _infer_type(self, value: str) -> Set[str]: + inferred_types = set() + if value in self._null_values: - return self._NULL_TYPE - elif self._is_boolean(value): - return self._BOOLEAN_TYPE - elif self._is_integer(value): - return self._INTEGER_TYPE + inferred_types.add(self._NULL_TYPE) + if self._is_boolean(value): + inferred_types.add(self._BOOLEAN_TYPE) + if self._is_integer(value): + inferred_types.add(self._INTEGER_TYPE) + inferred_types.add(self._NUMBER_TYPE) elif self._is_number(value): - return self._NUMBER_TYPE - elif self._is_array(value): - return self._ARRAY_TYPE - elif self._is_object(value): - return self._OBJECT_TYPE - else: - return self._STRING_TYPE + inferred_types.add(self._NUMBER_TYPE) + + # if it can be infered as a primitive, it can't be a complex type so we'll avoid json parsing + if not inferred_types: + if self._is_array(value): + inferred_types.add(self._ARRAY_TYPE) + inferred_types.add(self._OBJECT_TYPE) + elif self._is_object(value): + inferred_types.add(self._OBJECT_TYPE) + + inferred_types.add(self._STRING_TYPE) + return inferred_types def _is_boolean(self, value: str) -> bool: try: diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index c500bb3b264b..819e993d36c7 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -86,8 +86,8 @@ def test_cast_to_python_type(row: Dict[str, str], true_values: Set[str], false_v assert CsvParser._cast_types(row, PROPERTY_TYPES, csv_format, logger) == expected_output -_DEFAULT_TRUE_VALUES = {"yes", "yeah", "right"} -_DEFAULT_FALSE_VALUES = {"no", "nop", "wrong"} +_DEFAULT_TRUE_VALUES = {"1", "yes", "yeah", "right"} +_DEFAULT_FALSE_VALUES = {"0", "no", "nop", "wrong"} class SchemaInferenceTestCase(TestCase): @@ -126,6 +126,10 @@ def test_given_integers_only_when_infer_schema_then_type_is_integer(self) -> Non self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES self._test_infer_schema(["2", "90329", "5645"], "integer") + def 
test_given_integer_overlap_with_bool_value_only_when_infer_schema_then_type_is_integer(self) -> None: + self._config_format.inference_type = InferenceType.PRIMITIVE_AND_COMPLEX_TYPES + self._test_infer_schema(["1", "90329", "5645"], "integer") # here, "1" is also considered a boolean + def test_given_primitive_only_and_integers_only_when_infer_schema_then_type_is_integer(self) -> None: self._config_format.inference_type = InferenceType.PRIMITIVE_TYPES_ONLY self._test_infer_schema(["2", "90329", "5645"], "integer") diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py index 3efb1f81d8b2..fb2abf648e1e 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py @@ -9,7 +9,7 @@ TestScenarioBuilder() .set_files( { - "a.csv": { # The records in this file do not conform to the schema + "a.csv": { "contents": [ ("col1", "col2"), ("val_a_11", "1"), @@ -17,7 +17,7 @@ ], "last_modified": "2023-06-05T03:54:07.000Z", }, - "b.csv": { + "b.csv": { # The records in this file do not conform to the schema "contents": [ ("col1", "col2"), ("val_b_11", "this is text that will trigger validation policy"), From c9e20049ab911d2a73c17b592fa337803b48376c Mon Sep 17 00:00:00 2001 From: maxi297 Date: Thu, 10 Aug 2023 19:54:41 +0000 Subject: [PATCH 17/21] Automated Commit - Formatting Changes --- .../sources/file_based/file_types/test_csv_parser.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index 85718cf8648e..aa080d06c228 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -9,8 +9,7 @@ import unittest from datetime import datetime from typing import Any, Dict, Generator, List, Set -from unittest import TestCase -from unittest import mock +from unittest import TestCase, mock from unittest.mock import Mock import pytest From e6a0b4dd2eacccfe2925c0ff9c96252f702858ce Mon Sep 17 00:00:00 2001 From: brianjlai Date: Thu, 10 Aug 2023 18:51:36 -0700 Subject: [PATCH 18/21] add file adapters for avro, csv, jsonl, and parquet --- .../source_s3/v4/legacy_config_transformer.py | 41 ++++- .../v4/test_legacy_config_transformer.py | 161 ++++++++++++++++-- 2 files changed, 188 insertions(+), 14 deletions(-) diff --git a/airbyte-integrations/connectors/source-s3/source_s3/v4/legacy_config_transformer.py b/airbyte-integrations/connectors/source-s3/source_s3/v4/legacy_config_transformer.py index 8406b682f249..6acd996d4cac 100644 --- a/airbyte-integrations/connectors/source-s3/source_s3/v4/legacy_config_transformer.py +++ b/airbyte-integrations/connectors/source-s3/source_s3/v4/legacy_config_transformer.py @@ -3,9 +3,13 @@ # from datetime import datetime -from typing import Any, List, Mapping +from typing import Any, List, Mapping, Union from source_s3.source import SourceS3Spec +from source_s3.source_files_abstract.formats.avro_spec import AvroFormat +from source_s3.source_files_abstract.formats.csv_spec import CsvFormat +from source_s3.source_files_abstract.formats.jsonl_spec import JsonlFormat +from source_s3.source_files_abstract.formats.parquet_spec 
import ParquetFormat SECONDS_FORMAT = "%Y-%m-%dT%H:%M:%SZ" MICROS_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" @@ -27,7 +31,6 @@ def convert(cls, legacy_config: SourceS3Spec) -> Mapping[str, Any]: "file_type": legacy_config.format.filetype, "globs": cls._create_globs(legacy_config.path_pattern, legacy_config.provider.path_prefix), "validation_policy": "Emit Record", - # todo: add formats on a per-type basis in follow up PRs } ], } @@ -42,6 +45,8 @@ def convert(cls, legacy_config: SourceS3Spec) -> Mapping[str, Any]: transformed_config["endpoint"] = legacy_config.provider.endpoint if legacy_config.user_schema and legacy_config.user_schema != "{}": transformed_config["streams"][0]["input_schema"] = legacy_config.user_schema + if legacy_config.format: + transformed_config["streams"][0]["format"] = cls._transform_file_format(legacy_config.format) return transformed_config @@ -55,6 +60,36 @@ def _create_globs(cls, path_pattern: str, path_prefix: str) -> List[str]: def _transform_seconds_to_micros(cls, datetime_str: str) -> str: try: parsed_datetime = datetime.strptime(datetime_str, SECONDS_FORMAT) - return datetime.strftime(parsed_datetime, MICROS_FORMAT) + return parsed_datetime.strftime(MICROS_FORMAT) except ValueError as e: raise e + + @classmethod + def _transform_file_format(cls, format_options: Union[CsvFormat, ParquetFormat, AvroFormat, JsonlFormat]) -> Mapping[str, Any]: + if isinstance(format_options, AvroFormat): + return {"filetype": "avro"} + elif isinstance(format_options, CsvFormat): + csv_options = { + "filetype": "csv", + "delimiter": format_options.delimiter, + "quote_char": format_options.quote_char, + "double_quote": format_options.double_quote, + "null_values": ["", "null", "NULL", "N/A", "NA", "NaN", "None"], + "true_values": ["y", "yes", "t", "true", "on", "1"], + "false_values": ["n", "no", "f", "false", "off", "0"], + "inference_type": "Primitive Types Only" if format_options.infer_datatypes else "None", + "strings_can_be_null": True, + } + + if format_options.escape_char: + csv_options["escape_char"] = format_options.escape_char + if format_options.encoding: + csv_options["encoding"] = format_options.encoding + return csv_options + elif isinstance(format_options, JsonlFormat): + return {"filetype": "jsonl"} + elif isinstance(format_options, ParquetFormat): + return {"filetype": "parquet"} + else: + # This should never happen because it would fail schema validation + raise ValueError(f"Format filetype {format_options} is not a supported file type") diff --git a/airbyte-integrations/connectors/source-s3/unit_tests/v4/test_legacy_config_transformer.py b/airbyte-integrations/connectors/source-s3/unit_tests/v4/test_legacy_config_transformer.py index e0b5dd777be1..09d81674a99b 100644 --- a/airbyte-integrations/connectors/source-s3/unit_tests/v4/test_legacy_config_transformer.py +++ b/airbyte-integrations/connectors/source-s3/unit_tests/v4/test_legacy_config_transformer.py @@ -24,14 +24,9 @@ }, "format": { - "filetype": "csv", - "delimiter": "^", - "quote_char": "|", - "escape_char": "!", - "double_quote": True, - "quoting_behavior": "Quote All" + "filetype": "avro", }, - "path_pattern": "**/*.csv", + "path_pattern": "**/*.avro", "schema": '{"col1": "string", "col2": "integer"}' }, { @@ -43,10 +38,13 @@ "streams": [ { "name": "test_data", - "file_type": "csv", - "globs": ["a_folder/**/*.csv"], + "file_type": "avro", + "globs": ["a_folder/**/*.avro"], "validation_policy": "Emit Record", "input_schema": '{"col1": "string", "col2": "integer"}', + "format": { + "filetype": "avro" + } } ] } @@ 
-62,7 +60,7 @@ "format": { "filetype": "avro", }, - "path_pattern": "**/*.csv", + "path_pattern": "**/*.avro", }, { "bucket": "test_bucket", @@ -70,8 +68,11 @@ { "name": "test_data", "file_type": "avro", - "globs": ["**/*.csv"], + "globs": ["**/*.avro"], "validation_policy": "Emit Record", + "format": { + "filetype": "avro" + } } ] } @@ -84,3 +85,141 @@ def test_convert_legacy_config(legacy_config, expected_config): actual_config = LegacyConfigTransformer.convert(parsed_legacy_config) assert actual_config == expected_config + + +@pytest.mark.parametrize( + "file_type,legacy_format_config,expected_format_config", + [ + pytest.param( + "csv", + { + "filetype": "csv", + "delimiter": "&", + "infer_datatypes": False, + "quote_char": "^", + "escape_char": "$", + "encoding": "ansi", + "double_quote": False, + "newlines_in_values": True, + "blocksize": 20000, + }, + { + "filetype": "csv", + "delimiter": "&", + "quote_char": "^", + "escape_char": "$", + "encoding": "ansi", + "double_quote": False, + "null_values": ["", "null", "NULL", "N/A", "NA", "NaN", "None"], + "true_values": ["y", "yes", "t", "true", "on", "1"], + "false_values": ["n", "no", "f", "false", "off", "0"], + "inference_type": "None", + "strings_can_be_null": True, + } + , id="test_csv_all_legacy_options_set"), + pytest.param( + "csv", + { + "filetype": "csv", + "delimiter": "&", + "quote_char": "^", + "double_quote": True, + "newlines_in_values": False, + }, + { + "filetype": "csv", + "delimiter": "&", + "quote_char": "^", + "encoding": "utf8", + "double_quote": True, + "null_values": ["", "null", "NULL", "N/A", "NA", "NaN", "None"], + "true_values": ["y", "yes", "t", "true", "on", "1"], + "false_values": ["n", "no", "f", "false", "off", "0"], + "inference_type": "Primitive Types Only", + "strings_can_be_null": True, + } + , id="test_csv_only_required_options"), + pytest.param( + "csv", + {}, + { + "filetype": "csv", + "delimiter": ",", + "quote_char": "\"", + "encoding": "utf8", + "double_quote": True, + "null_values": ["", "null", "NULL", "N/A", "NA", "NaN", "None"], + "true_values": ["y", "yes", "t", "true", "on", "1"], + "false_values": ["n", "no", "f", "false", "off", "0"], + "inference_type": "Primitive Types Only", + "strings_can_be_null": True, + } + , id="test_csv_empty_format"), + pytest.param( + "jsonl", + { + "filetype": "jsonl", + "newlines_in_values": True, + "unexpected_field_behavior": "ignore", + "block_size": 0, + }, + { + "filetype": "jsonl" + } + , id="test_jsonl_format"), + pytest.param( + "parquet", + { + "filetype": "parquet", + "columns": ["test"], + "batch_size": 65536, + "buffer_size": 100, + }, + { + "filetype": "parquet" + } + , id="test_parquet_format"), + pytest.param( + "avro", + { + "filetype": "avro", + }, + { + "filetype": "avro" + } + , id="test_avro_format"), + ] +) +def test_convert_file_format(file_type, legacy_format_config, expected_format_config): + legacy_config = { + "dataset": "test_data", + "provider": { + "storage": "S3", + "bucket": "test_bucket", + "aws_access_key_id": "some_access_key", + "aws_secret_access_key": "some_secret", + + }, + "format": legacy_format_config, + "path_pattern": f"**/*.{file_type}", + } + + expected_config = { + "bucket": "test_bucket", + "aws_access_key_id": "some_access_key", + "aws_secret_access_key": "some_secret", + "streams": [ + { + "name": "test_data", + "file_type": file_type, + "globs": [f"**/*.{file_type}"], + "validation_policy": "Emit Record", + "format": expected_format_config + } + ] + } + + parsed_legacy_config = SourceS3Spec(**legacy_config) + 
actual_config = LegacyConfigTransformer.convert(parsed_legacy_config) + + assert actual_config == expected_config From 2f827c0334e36750edd61582ddbd40955af53d98 Mon Sep 17 00:00:00 2001 From: brianjlai Date: Thu, 10 Aug 2023 22:56:21 -0700 Subject: [PATCH 19/21] fix try catch --- .../source-s3/source_s3/v4/legacy_config_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/source-s3/source_s3/v4/legacy_config_transformer.py b/airbyte-integrations/connectors/source-s3/source_s3/v4/legacy_config_transformer.py index 6acd996d4cac..607cc2e068de 100644 --- a/airbyte-integrations/connectors/source-s3/source_s3/v4/legacy_config_transformer.py +++ b/airbyte-integrations/connectors/source-s3/source_s3/v4/legacy_config_transformer.py @@ -62,7 +62,7 @@ def _transform_seconds_to_micros(cls, datetime_str: str) -> str: parsed_datetime = datetime.strptime(datetime_str, SECONDS_FORMAT) return parsed_datetime.strftime(MICROS_FORMAT) except ValueError as e: - raise e + raise ValueError("Timestamp could not be parsed when transforming legacy connector config") from e @classmethod def _transform_file_format(cls, format_options: Union[CsvFormat, ParquetFormat, AvroFormat, JsonlFormat]) -> Mapping[str, Any]: From 868a597e3f63dac777abbbdb20783eb30319f8ff Mon Sep 17 00:00:00 2001 From: brianjlai Date: Fri, 11 Aug 2023 15:07:48 -0700 Subject: [PATCH 20/21] pr feedback with a few additional default options set --- .../source_s3/v4/legacy_config_transformer.py | 22 +++++- .../v4/test_legacy_config_transformer.py | 67 ++++++++++++++----- 2 files changed, 69 insertions(+), 20 deletions(-) diff --git a/airbyte-integrations/connectors/source-s3/source_s3/v4/legacy_config_transformer.py b/airbyte-integrations/connectors/source-s3/source_s3/v4/legacy_config_transformer.py index 607cc2e068de..d7ec5c7e955d 100644 --- a/airbyte-integrations/connectors/source-s3/source_s3/v4/legacy_config_transformer.py +++ b/airbyte-integrations/connectors/source-s3/source_s3/v4/legacy_config_transformer.py @@ -2,8 +2,9 @@ # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
# +import json from datetime import datetime -from typing import Any, List, Mapping, Union +from typing import Any, List, Mapping, Optional, Union from source_s3.source import SourceS3Spec from source_s3.source_files_abstract.formats.avro_spec import AvroFormat @@ -69,6 +70,9 @@ def _transform_file_format(cls, format_options: Union[CsvFormat, ParquetFormat, if isinstance(format_options, AvroFormat): return {"filetype": "avro"} elif isinstance(format_options, CsvFormat): + additional_reader_options = cls.parse_config_options_str("additional_reader_options", format_options.additional_reader_options) + advanced_options = cls.parse_config_options_str("advanced_options", format_options.advanced_options) + csv_options = { "filetype": "csv", "delimiter": format_options.delimiter, @@ -78,13 +82,19 @@ def _transform_file_format(cls, format_options: Union[CsvFormat, ParquetFormat, "true_values": ["y", "yes", "t", "true", "on", "1"], "false_values": ["n", "no", "f", "false", "off", "0"], "inference_type": "Primitive Types Only" if format_options.infer_datatypes else "None", - "strings_can_be_null": True, + "strings_can_be_null": additional_reader_options.get("strings_can_be_null", False), } if format_options.escape_char: csv_options["escape_char"] = format_options.escape_char if format_options.encoding: csv_options["encoding"] = format_options.encoding + if "skip_rows" in advanced_options: + csv_options["skip_rows_before_header"] = advanced_options["skip_rows"] + if "skip_rows_after_names" in advanced_options: + csv_options["skip_rows_after_header"] = advanced_options["skip_rows_after_names"] + if "autogenerate_column_names" in advanced_options: + csv_options["autogenerate_column_names"] = advanced_options["autogenerate_column_names"] return csv_options elif isinstance(format_options, JsonlFormat): return {"filetype": "jsonl"} @@ -93,3 +103,11 @@ def _transform_file_format(cls, format_options: Union[CsvFormat, ParquetFormat, else: # This should never happen because it would fail schema validation raise ValueError(f"Format filetype {format_options} is not a supported file type") + + @classmethod + def parse_config_options_str(cls, options_field: str, options_value: Optional[str]) -> Mapping[str, Any]: + options_str = options_value or "{}" + try: + return json.loads(options_str) + except json.JSONDecodeError as error: + raise ValueError(f"Malformed {options_field} config json: {error}. 
Please ensure that it is a valid JSON.") diff --git a/airbyte-integrations/connectors/source-s3/unit_tests/v4/test_legacy_config_transformer.py b/airbyte-integrations/connectors/source-s3/unit_tests/v4/test_legacy_config_transformer.py index 09d81674a99b..aabbca08da71 100644 --- a/airbyte-integrations/connectors/source-s3/unit_tests/v4/test_legacy_config_transformer.py +++ b/airbyte-integrations/connectors/source-s3/unit_tests/v4/test_legacy_config_transformer.py @@ -88,7 +88,7 @@ def test_convert_legacy_config(legacy_config, expected_config): @pytest.mark.parametrize( - "file_type,legacy_format_config,expected_format_config", + "file_type,legacy_format_config,expected_format_config, expected_error", [ pytest.param( "csv", @@ -101,6 +101,8 @@ def test_convert_legacy_config(legacy_config, expected_config): "encoding": "ansi", "double_quote": False, "newlines_in_values": True, + "additional_reader_options": "{\"strings_can_be_null\": true}", + "advanced_options": "{\"skip_rows\": 3, \"skip_rows_after_names\": 5, \"autogenerate_column_names\": true}", "blocksize": 20000, }, { @@ -115,8 +117,12 @@ def test_convert_legacy_config(legacy_config, expected_config): "false_values": ["n", "no", "f", "false", "off", "0"], "inference_type": "None", "strings_can_be_null": True, - } - , id="test_csv_all_legacy_options_set"), + "skip_rows_before_header": 3, + "skip_rows_after_header": 5, + "autogenerate_column_names": True, + }, + None, + id="test_csv_all_legacy_options_set"), pytest.param( "csv", { @@ -136,9 +142,10 @@ def test_convert_legacy_config(legacy_config, expected_config): "true_values": ["y", "yes", "t", "true", "on", "1"], "false_values": ["n", "no", "f", "false", "off", "0"], "inference_type": "Primitive Types Only", - "strings_can_be_null": True, - } - , id="test_csv_only_required_options"), + "strings_can_be_null": False, + }, + None, + id="test_csv_only_required_options"), pytest.param( "csv", {}, @@ -152,9 +159,26 @@ def test_convert_legacy_config(legacy_config, expected_config): "true_values": ["y", "yes", "t", "true", "on", "1"], "false_values": ["n", "no", "f", "false", "off", "0"], "inference_type": "Primitive Types Only", - "strings_can_be_null": True, - } - , id="test_csv_empty_format"), + "strings_can_be_null": False, + }, + None, + id="test_csv_empty_format"), + pytest.param( + "csv", + { + "additional_reader_options": "{\"not_valid\": \"at all}", + }, + None, + ValueError, + id="test_malformed_additional_reader_options"), + pytest.param( + "csv", + { + "advanced_options": "{\"not_valid\": \"at all}", + }, + None, + ValueError, + id="test_malformed_advanced_options"), pytest.param( "jsonl", { @@ -165,8 +189,9 @@ def test_convert_legacy_config(legacy_config, expected_config): }, { "filetype": "jsonl" - } - , id="test_jsonl_format"), + }, + None, + id="test_jsonl_format"), pytest.param( "parquet", { @@ -177,8 +202,9 @@ def test_convert_legacy_config(legacy_config, expected_config): }, { "filetype": "parquet" - } - , id="test_parquet_format"), + }, + None, + id="test_parquet_format"), pytest.param( "avro", { @@ -186,11 +212,12 @@ def test_convert_legacy_config(legacy_config, expected_config): }, { "filetype": "avro" - } - , id="test_avro_format"), + }, + None, + id="test_avro_format"), ] ) -def test_convert_file_format(file_type, legacy_format_config, expected_format_config): +def test_convert_file_format(file_type, legacy_format_config, expected_format_config, expected_error): legacy_config = { "dataset": "test_data", "provider": { @@ -220,6 +247,10 @@ def 
test_convert_file_format(file_type, legacy_format_config, expected_format_co } parsed_legacy_config = SourceS3Spec(**legacy_config) - actual_config = LegacyConfigTransformer.convert(parsed_legacy_config) - assert actual_config == expected_config + if expected_error: + with pytest.raises(expected_error): + LegacyConfigTransformer.convert(parsed_legacy_config) + else: + actual_config = LegacyConfigTransformer.convert(parsed_legacy_config) + assert actual_config == expected_config From bbfbd80cc66dbf1071075add4bb2c6fa87a02fb6 Mon Sep 17 00:00:00 2001 From: brianjlai Date: Mon, 14 Aug 2023 15:19:33 -0700 Subject: [PATCH 21/21] fix things from the rebase of master --- .../default_file_based_availability_strategy.py | 3 --- .../sources/file_based/file_types/test_csv_parser.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py index 244763d6c901..2b81d44be66e 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py @@ -18,8 +18,6 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy): - _WITHOUT_SCHEMA = None - def __init__(self, stream_reader: AbstractFileBasedStreamReader): self.stream_reader = stream_reader @@ -84,7 +82,6 @@ def _check_parse_record(self, stream: "AbstractFileBasedStream", file: RemoteFil parser = stream.get_parser(stream.config.file_type) try: - record = next(iter(parser.parse_records(stream.config, file, self.stream_reader, logger, discovered_schema=None))) except StopIteration: # The file is empty. We've verified that we can open it, so will diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index 9b2882f9b9c2..7690fe3f3b1e 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -412,6 +412,18 @@ def test_given_generator_closed_when_read_data_then_unregister_dialect(self) -> assert f"{self._CONFIG_NAME}_config_dialect" not in csv.list_dialects() def test_given_too_many_values_for_columns_when_read_data_then_raise_exception_and_unregister_dialect(self) -> None: + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header", + "a value", + "too many values,value,value,value", + ] + ) + .build() + ) + data_generator = self._read_data() next(data_generator) assert f"{self._CONFIG_NAME}_config_dialect" in csv.list_dialects()
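For reference, the CSV branch of the transformer introduced in this patch series can be summarized with a small standalone sketch. This is not the connector's actual class: LegacyCsvOptions is a hypothetical stub standing in for the relevant fields of the legacy CsvFormat spec, and the fixed null_values/true_values/false_values lists are omitted for brevity. The sketch shows how the legacy JSON-string options (additional_reader_options, advanced_options) are decoded and remapped onto the new per-stream CSV keys:

    import json
    from dataclasses import dataclass
    from typing import Any, Dict, Mapping, Optional


    @dataclass
    class LegacyCsvOptions:
        # Hypothetical stand-in for the legacy CsvFormat fields used below.
        delimiter: str = ","
        quote_char: str = '"'
        escape_char: Optional[str] = None
        encoding: Optional[str] = "utf8"
        double_quote: bool = True
        infer_datatypes: bool = True
        additional_reader_options: Optional[str] = None
        advanced_options: Optional[str] = None


    def _parse_options_str(field_name: str, value: Optional[str]) -> Mapping[str, Any]:
        # Legacy options arrive as JSON strings; malformed JSON is surfaced as a
        # config error at transform time instead of failing later during a sync.
        try:
            return json.loads(value or "{}")
        except json.JSONDecodeError as error:
            raise ValueError(f"Malformed {field_name} config json: {error}") from error


    def transform_csv_format(options: LegacyCsvOptions) -> Mapping[str, Any]:
        reader_options = _parse_options_str("additional_reader_options", options.additional_reader_options)
        advanced_options = _parse_options_str("advanced_options", options.advanced_options)

        csv_options: Dict[str, Any] = {
            "filetype": "csv",
            "delimiter": options.delimiter,
            "quote_char": options.quote_char,
            "double_quote": options.double_quote,
            "inference_type": "Primitive Types Only" if options.infer_datatypes else "None",
            "strings_can_be_null": reader_options.get("strings_can_be_null", False),
        }
        if options.escape_char:
            csv_options["escape_char"] = options.escape_char
        if options.encoding:
            csv_options["encoding"] = options.encoding
        # The pyarrow-era advanced options map onto the new per-stream CSV keys.
        if "skip_rows" in advanced_options:
            csv_options["skip_rows_before_header"] = advanced_options["skip_rows"]
        if "skip_rows_after_names" in advanced_options:
            csv_options["skip_rows_after_header"] = advanced_options["skip_rows_after_names"]
        if "autogenerate_column_names" in advanced_options:
            csv_options["autogenerate_column_names"] = advanced_options["autogenerate_column_names"]
        return csv_options


    if __name__ == "__main__":
        legacy = LegacyCsvOptions(
            delimiter="&",
            infer_datatypes=False,
            additional_reader_options='{"strings_can_be_null": true}',
            advanced_options='{"skip_rows": 3, "skip_rows_after_names": 5}',
        )
        print(transform_csv_format(legacy))
        # -> {'filetype': 'csv', 'delimiter': '&', ..., 'inference_type': 'None',
        #     'strings_can_be_null': True, 'encoding': 'utf8',
        #     'skip_rows_before_header': 3, 'skip_rows_after_header': 5}

Keeping the JSON decoding in a dedicated helper means a malformed options string fails fast during config transformation with a clear error message, which is the behavior the new parametrized test cases assert via pytest.raises(ValueError).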