Skip to content

Commit

Permalink
[ISSUE #28893] align with pyarrow
Browse files Browse the repository at this point in the history
  • Loading branch information
maxi297 committed Aug 4, 2023
1 parent eaba483 commit 60757c1
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,7 @@ def _cast_types(row: Dict[str, str], property_types: Dict[str, Any], config_form

class _TypeInferrer:
_BOOLEAN_TYPE = "boolean"
_INTEGER_TYPE = "integer"
_NUMBER_TYPE = "number"
_ARRAY_TYPE = "array"
_OBJECT_TYPE = "object"
Expand All @@ -255,17 +256,18 @@ def infer(self) -> str:

if types == {self._BOOLEAN_TYPE}:
return self._BOOLEAN_TYPE
elif types == {self._NUMBER_TYPE}:
elif types == {self._INTEGER_TYPE}:
return self._INTEGER_TYPE
elif types == {self._NUMBER_TYPE} or types == {self._INTEGER_TYPE, self._NUMBER_TYPE}:
return self._NUMBER_TYPE
elif types == {self._ARRAY_TYPE}:
return self._ARRAY_TYPE
elif self._ARRAY_TYPE in types or self._OBJECT_TYPE in types:
return self._OBJECT_TYPE
# to keep backward compatibility with PyArrow, we will not parse types
return self._STRING_TYPE

def _infer_type(self, value: str) -> str:
if self._is_boolean(value):
return self._BOOLEAN_TYPE
elif self._is_integer(value):
return self._INTEGER_TYPE
elif self._is_number(value):
return self._NUMBER_TYPE
elif self._is_array(value):
Expand All @@ -282,6 +284,14 @@ def _is_boolean(self, value: str) -> bool:
except ValueError:
return False

@staticmethod
def _is_integer(value: str) -> bool:
    """Tell whether ``value`` can be converted to ``int``.

    The conversion is delegated to ``_value_to_python_type(value, int)``.
    Returns ``True`` when that call completes and ``False`` when it raises
    ``ValueError``; any other exception propagates to the caller.
    """
    try:
        _value_to_python_type(value, int)
    except ValueError:
        return False
    return True

@staticmethod
def _is_number(value: str) -> bool:
try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,20 +104,23 @@ def setUp(self) -> None:
def test_given_booleans_only_when_infer_schema_then_type_is_boolean(self) -> None:
    # Rows are the union of _DEFAULT_TRUE_VALUES and _DEFAULT_FALSE_VALUES;
    # the helper asserts that the inferred type for the column is "boolean".
    boolean_rows = list(_DEFAULT_TRUE_VALUES.union(_DEFAULT_FALSE_VALUES))
    self._test_infer_schema(boolean_rows, "boolean")

def test_given_numbers_only_when_infer_schema_then_type_is_number(self) -> None:
def test_given_integers_only_when_infer_schema_then_type_is_integer(self) -> None:
    # Every row is a whole-number literal, so the expected column type is "integer".
    integer_rows = ["2", "90329", "5645"]
    self._test_infer_schema(integer_rows, "integer")

def test_given_numbers_and_integers_when_infer_schema_then_type_is_number(self) -> None:
    # Two whole-number literals plus one decimal literal: the expected column type is "number".
    mixed_numeric_rows = ["2", "90329", "2.312"]
    self._test_infer_schema(mixed_numeric_rows, "number")

def test_given_arrays_only_when_infer_schema_then_type_is_array(self) -> None:
# Feeds two JSON-array-looking strings to the schema inference helper.
# NOTE(review): the two calls below use identical rows but expect different
# types ("array" vs "string"), so they cannot both pass. This looks like the
# pre-change and post-change lines of a diff shown together — confirm which
# expectation is current (and whether the test name still matches it).
self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], "array")
self._test_infer_schema(['["first_item", "second_item"]', '["first_item_again", "second_item_again"]'], "string")

def test_given_objects_only_when_infer_schema_then_type_is_object(self) -> None:
# Feeds two JSON-object-looking strings to the schema inference helper.
# NOTE(review): the two calls below use identical rows but expect different
# types ("object" vs "string"), so they cannot both pass. This looks like the
# pre-change and post-change lines of a diff shown together — confirm which
# expectation is current (and whether the test name still matches it).
self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "object")
self._test_infer_schema(['{"object1_key": 1}', '{"object2_key": 2}'], "string")

def test_given_arrays_and_objects_only_when_infer_schema_then_type_is_object(self) -> None:
# Feeds one JSON-array-looking string and one JSON-object-looking string to
# the schema inference helper.
# NOTE(review): the two calls below use identical rows but expect different
# types ("object" vs "string"), so they cannot both pass. This looks like the
# pre-change and post-change lines of a diff shown together — confirm which
# expectation is current (and whether the test name still matches it).
self._test_infer_schema(['["first_item", "second_item"]', '{"an_object_key": "an_object_value"}'], "object")
self._test_infer_schema(['["first_item", "second_item"]', '{"an_object_key": "an_object_value"}'], "string")

def test_given_strings_and_objects_only_when_infer_schema_then_type_is_object(self) -> None:
# Feeds one JSON-array-looking string and one plain string to the schema
# inference helper (note the first row is an array literal, although the
# test name mentions objects).
# NOTE(review): the two calls below use identical rows but expect different
# types ("object" vs "string"), so they cannot both pass. This looks like the
# pre-change and post-change lines of a diff shown together — confirm which
# expectation is current (and whether the test name still matches it).
self._test_infer_schema(['["first_item", "second_item"]', "this is a string"], "object")
self._test_infer_schema(['["first_item", "second_item"]', "this is a string"], "string")

def test_given_strings_only_when_infer_schema_then_type_is_string(self) -> None:
self._test_infer_schema(["a string", "another string"], "string")
Expand All @@ -128,7 +131,7 @@ def _test_infer_schema(self, rows: List[str], expected_type: str) -> None:
assert inferred_schema == {"header": {"type": expected_type}}

def test_given_big_file_when_infer_schema_then_stop_early(self) -> None:
self._csv_reader.read_data.return_value = ({"header": row} for row in ["2" * 1_000_000] + ["this is a string"])
self._csv_reader.read_data.return_value = ({"header": row} for row in ["2." + "2" * 1_000_000] + ["this is a string"])
inferred_schema = self._infer_schema()
# since the type is number, we know the string at the end was not considered
assert inferred_schema == {"header": {"type": "number"}}
Expand Down

0 comments on commit 60757c1

Please sign in to comment.