diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py
index 3b6e6f7dc915..9c7b58aadca4 100644
--- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py
+++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py
@@ -237,6 +237,7 @@ def _cast_types(row: Dict[str, str], property_types: Dict[str, Any], config_form
 
 class _TypeInferrer:
     _BOOLEAN_TYPE = "boolean"
+    _INTEGER_TYPE = "integer"
     _NUMBER_TYPE = "number"
     _ARRAY_TYPE = "array"
     _OBJECT_TYPE = "object"
@@ -255,17 +256,18 @@ def infer(self) -> str:
 
         if types == {self._BOOLEAN_TYPE}:
             return self._BOOLEAN_TYPE
-        elif types == {self._NUMBER_TYPE}:
+        elif types == {self._INTEGER_TYPE}:
+            return self._INTEGER_TYPE
+        elif types == {self._NUMBER_TYPE} or types == {self._INTEGER_TYPE, self._NUMBER_TYPE}:
             return self._NUMBER_TYPE
-        elif types == {self._ARRAY_TYPE}:
-            return self._ARRAY_TYPE
-        elif self._ARRAY_TYPE in types or self._OBJECT_TYPE in types:
-            return self._OBJECT_TYPE
+        # to keep backward compatibility with PyArrow, we will not parse types
         return self._STRING_TYPE
 
     def _infer_type(self, value: str) -> str:
         if self._is_boolean(value):
             return self._BOOLEAN_TYPE
+        elif self._is_integer(value):
+            return self._INTEGER_TYPE
         elif self._is_number(value):
             return self._NUMBER_TYPE
         elif self._is_array(value):
@@ -282,6 +284,14 @@ def _is_boolean(self, value: str) -> bool:
         except ValueError:
             return False
 
+    @staticmethod
+    def _is_integer(value: str) -> bool:
+        try:
+            _value_to_python_type(value, int)
+            return True
+        except ValueError:
+            return False
+
     @staticmethod
     def _is_number(value: str) -> bool:
         try:
diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py
index 48fb991f2195..4d3658091c84 100644
--- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py
+++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py
@@ -104,20 +104,23 @@ def setUp(self) -> None:
     def test_given_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None:
         self._test_infer_schema(list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES)), "boolean")
 
-    def test_given_numbers_only_when_infer_schema_then_type_is_number(self) -> None:
+    def test_given_integers_only_when_infer_schema_then_type_is_integer(self) -> None:
+        self._test_infer_schema(["2", "90329", "5645"], "integer")
+
+    def test_given_numbers_and_integers_when_infer_schema_then_type_is_number(self) -> None:
         self._test_infer_schema(["2", "90329", "2.312"], "number")
 
     def test_given_arrays_only_when_infer_schema_then_type_is_array(self) -> None:
-        self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], "array")
+        self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], "string")
 
     def test_given_objects_only_when_infer_schema_then_type_is_object(self) -> None:
-        self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "object")
+        self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "string")
 
     def test_given_arrays_and_objects_only_when_infer_schema_then_type_is_object(self) -> None:
-        self._test_infer_schema(['["first_item", "second_item"]', '{"an_object_key": "an_object_value"}'], "object")
+        self._test_infer_schema(['["first_item", "second_item"]', '{"an_object_key": "an_object_value"}'], "string")
 
     def test_given_strings_and_objects_only_when_infer_schema_then_type_is_object(self) -> None:
-        self._test_infer_schema(['["first_item", "second_item"]', "this is a string"], "object")
+        self._test_infer_schema(['["first_item", "second_item"]', "this is a string"], "string")
 
     def test_given_strings_only_when_infer_schema_then_type_is_string(self) -> None:
         self._test_infer_schema(["a string", "another string"], "string")
@@ -128,7 +131,7 @@ def _test_infer_schema(self, rows: List[str], expected_type: str) -> None:
         assert inferred_schema == {"header": {"type": expected_type}}
 
     def test_given_big_file_when_infer_schema_then_stop_early(self) -> None:
-        self._csv_reader.read_data.return_value = ({"header": row} for row in ["2" * 1_000_000] + ["this is a string"])
+        self._csv_reader.read_data.return_value = ({"header": row} for row in ["2." + "2" * 1_000_000] + ["this is a string"])
         inferred_schema = self._infer_schema()
         # since the type is number, we know the string at the end was not considered
         assert inferred_schema == {"header": {"type": "number"}}