Skip to content

Commit

Permalink
fix concats for extension array for old versions of pyarrow (#944)
Browse files Browse the repository at this point in the history
* Fix `from_arrow` for Extension ChunkedArrays for pyarrow <= 7.0
* Fix tests for list dtype comparisons, since the comparison checks the field
name
  • Loading branch information
samster25 authored May 20, 2023
1 parent 2f0546b commit daf26c0
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 15 deletions.
8 changes: 7 additions & 1 deletion daft/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,13 @@ def from_arrow(array: pa.Array | pa.ChunkedArray, name: str = "arrow_series") ->
return Series._from_pyseries(pys)
elif isinstance(array, pa.ChunkedArray):
array = ensure_chunked_array(array)
combined_array = array.combine_chunks()
arr_type = array.type
if isinstance(arr_type, pa.BaseExtensionType):
combined_storage_array = array.cast(arr_type.storage_type).combine_chunks()
combined_array = arr_type.wrap_array(combined_storage_array)
else:
combined_array = array.combine_chunks()

pys = PySeries.from_arrow(name, combined_array)
return Series._from_pyseries(pys)
else:
Expand Down
2 changes: 1 addition & 1 deletion src/datatypes/dtype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ impl Display for DataType {
// `f` is a buffer, and this method must write the formatted string into it
fn fmt(&self, f: &mut Formatter) -> Result {
match self {
DataType::List(nested) => write!(f, "List[{}]", nested.dtype),
DataType::List(nested) => write!(f, "List[{}:{}]", nested.name, nested.dtype),
DataType::FixedSizeList(inner, size) => {
write!(f, "FixedSizeList[{}; {}]", inner.dtype, size)
}
Expand Down
5 changes: 4 additions & 1 deletion tests/series/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,10 @@ def test_series_concat_extension_type(uuid_ext_type, chunks) -> None:
concated_arrow = concated.to_arrow()
assert isinstance(concated_arrow.type, UuidType)
assert concated_arrow.type == uuid_ext_type
assert concated_arrow == pa.concat_arrays(ext_arrays)

expected = uuid_ext_type.wrap_array(pa.concat_arrays(storage_arrays))

assert concated_arrow == expected


@pytest.mark.parametrize("chunks", [1, 2, 3, 10])
Expand Down
39 changes: 27 additions & 12 deletions tests/table/test_from_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,9 +262,9 @@ def test_from_pydict_arrow_extension_array(uuid_ext_type) -> None:
assert "a" in daft_table.column_names()
# Although Daft will internally represent the binary storage array as a large_binary array,
# it should be cast back to the ingress extension type.
result = daft_table.to_arrow()["a"].combine_chunks()
result = daft_table.to_arrow()["a"]
assert result.type == uuid_ext_type
assert result == arrow_arr
assert result.to_pylist() == arrow_arr.to_pylist()


def test_from_pydict_arrow_deeply_nested() -> None:
Expand All @@ -276,7 +276,8 @@ def test_from_pydict_arrow_deeply_nested() -> None:
# Perform the expected Daft cast, where each list array is cast to a large list array and
# the string array is cast to a large string array.
expected = pa.array(
data, type=pa.struct([("a", pa.large_list(pa.struct([("b", pa.large_list(pa.large_string()))])))])
data,
type=pa.struct([("a", pa.large_list(pa.field("a", pa.struct([("b", pa.large_list(pa.large_string()))]))))]),
)
assert daft_table.to_arrow()["a"].combine_chunks() == expected

Expand Down Expand Up @@ -434,9 +435,9 @@ def test_from_arrow_extension_array(uuid_ext_type) -> None:
assert "a" in daft_table.column_names()
# Although Daft will internally represent the binary storage array as a large_binary array,
# it should be cast back to the ingress extension type.
result = daft_table.to_arrow()["a"].combine_chunks()
result = daft_table.to_arrow()["a"]
assert result.type == uuid_ext_type
assert result == arrow_arr
assert result.to_pylist() == arrow_arr.to_pylist()


def test_from_arrow_deeply_nested() -> None:
Expand All @@ -448,8 +449,19 @@ def test_from_arrow_deeply_nested() -> None:
# Perform the expected Daft cast, where each list array is cast to a large list array and
# the string array is cast to a large string array.
expected = pa.array(
data, type=pa.struct([("a", pa.large_list(pa.struct([("b", pa.large_list(pa.large_string()))])))])
data,
type=pa.struct(
[
(
"a",
pa.large_list(
pa.field("a", pa.struct([("b", pa.large_list(pa.field("item", pa.large_string())))]))
),
)
]
),
)

assert daft_table.to_arrow()["a"].combine_chunks() == expected


Expand All @@ -472,18 +484,20 @@ def test_nested_list_dates(levels: int) -> None:
data = [datetime.date.today(), datetime.date.today()]
for _ in range(levels):
data = [data, data]
table = Table.from_pydict({"data": data})
back_again = table.get_column("data")
table = Table.from_pydict({"item": data})
back_again = table.get_column("item")

dtype = back_again.datatype()

expected_dtype = DataType.date()
expected_arrow_type = pa.date32()
for _ in range(levels):
expected_dtype = DataType.list("item", expected_dtype)
expected_arrow_type = pa.large_list(expected_arrow_type)
expected_arrow_type = pa.large_list(pa.field("item", expected_arrow_type))

assert dtype == expected_dtype
assert back_again.to_arrow() == pa.array(data, type=expected_arrow_type)

assert back_again.to_arrow().type == expected_arrow_type
assert back_again.to_pylist() == data


Expand All @@ -508,7 +522,7 @@ def test_nested_fixed_size_list_dates(levels: int) -> None:
dtype = back_again.datatype()

assert dtype == expected_dtype
assert back_again.to_arrow() == pa.array(data, type=expected_arrow_type)
assert back_again.to_arrow().type == expected_arrow_type
assert back_again.to_pylist() == data


Expand All @@ -528,5 +542,6 @@ def test_nested_struct_dates(levels: int) -> None:
expected_dtype = DataType.struct({"data": expected_dtype})
expected_arrow_type = pa.struct([("data", expected_arrow_type)])
assert dtype == expected_dtype
assert back_again.to_arrow() == pa.array(data, type=expected_arrow_type)

assert back_again.to_arrow().type == expected_arrow_type
assert back_again.to_pylist() == data

0 comments on commit daf26c0

Please sign in to comment.