Skip to content

Commit

Permalink
Remove dead code for pyarrow < 15.0.0 (#7023)
Browse files Browse the repository at this point in the history
* Remove dead code related to pa.concat_tables

* Remove dead code related to pa.FixedSizeListArray
Loading branch information
albertvillanova committed Aug 13, 2024
1 parent 0cdd324 commit e3b33eb
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 68 deletions.
5 changes: 1 addition & 4 deletions src/datasets/packaged_modules/webdataset/webdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,7 @@ def _split_generators(self, dl_manager):
pa.Table.from_pylist(cast_to_python_objects([example], only_1d_for_numpy=True))
for example in first_examples
]
if datasets.config.PYARROW_VERSION.major < 14:
inferred_arrow_schema = pa.concat_tables(pa_tables, promote=True).schema
else:
inferred_arrow_schema = pa.concat_tables(pa_tables, promote_options="default").schema
inferred_arrow_schema = pa.concat_tables(pa_tables, promote_options="default").schema
features = datasets.Features.from_arrow_schema(inferred_arrow_schema)

# Set Image types
Expand Down
79 changes: 15 additions & 64 deletions src/datasets/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import pyarrow.compute as pc
import pyarrow.types

from . import config
from .utils.logging import get_logger


Expand Down Expand Up @@ -1320,22 +1319,16 @@ def __setstate__(self, state):
if schema is not None and table.schema != schema:
# We fix the columns by concatenating with an empty table with the right columns
empty_table = pa.Table.from_batches([], schema=schema)
# we set promote=True to fill missing columns with null values
if config.PYARROW_VERSION.major < 14:
table = pa.concat_tables([table, empty_table], promote=True)
else:
table = pa.concat_tables([table, empty_table], promote_options="default")
# We set promote_options="default" to fill missing columns with null values
table = pa.concat_tables([table, empty_table], promote_options="default")
ConcatenationTable.__init__(self, table, blocks=blocks)

@staticmethod
def _concat_blocks(blocks: List[Union[TableBlock, pa.Table]], axis: int = 0) -> pa.Table:
pa_tables = [table.table if hasattr(table, "table") else table for table in blocks]
if axis == 0:
# we set promote=True to fill missing columns with null values
if config.PYARROW_VERSION.major < 14:
return pa.concat_tables(pa_tables, promote=True)
else:
return pa.concat_tables(pa_tables, promote_options="default")
# We set promote_options="default" to fill missing columns with null values
return pa.concat_tables(pa_tables, promote_options="default")
elif axis == 1:
for i, table in enumerate(pa_tables):
if i == 0:
Expand Down Expand Up @@ -1906,17 +1899,9 @@ def array_cast(
else:
array = pc.list_slice(array, 0, pa_type.list_size, return_fixed_size_list=True)
array_values = array.values
if config.PYARROW_VERSION.major < 15:
return pa.Array.from_buffers(
pa_type,
len(array),
[array.is_valid().buffers()[1]],
children=[_c(array_values, pa_type.value_type)],
)
else:
return pa.FixedSizeListArray.from_arrays(
_c(array_values, pa_type.value_type), pa_type.list_size, mask=array.is_null()
)
return pa.FixedSizeListArray.from_arrays(
_c(array_values, pa_type.value_type), pa_type.list_size, mask=array.is_null()
)
else:
array_values = array.values[
array.offset * pa_type.list_size : (array.offset + len(array)) * pa_type.list_size
Expand All @@ -1932,17 +1917,9 @@ def array_cast(
array_values = array.values[
array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size
]
if config.PYARROW_VERSION.major < 15:
return pa.Array.from_buffers(
pa_type,
len(array),
[array.is_valid().buffers()[1]],
children=[_c(array_values, pa_type.value_type)],
)
else:
return pa.FixedSizeListArray.from_arrays(
_c(array_values, pa_type.value_type), pa_type.list_size, mask=array.is_null()
)
return pa.FixedSizeListArray.from_arrays(
_c(array_values, pa_type.value_type), pa_type.list_size, mask=array.is_null()
)
elif pa.types.is_list(pa_type):
array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
return pa.ListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type), mask=array.is_null())
Expand Down Expand Up @@ -2055,17 +2032,9 @@ def cast_array_to_feature(
array = pc.list_slice(array, 0, feature.length, return_fixed_size_list=True)
array_values = array.values
casted_array_values = _c(array_values, feature.feature)
if config.PYARROW_VERSION.major < 15:
return pa.Array.from_buffers(
pa.list_(casted_array_values.type, feature.length),
len(array),
[array.is_valid().buffers()[1]],
children=[casted_array_values],
)
else:
return pa.FixedSizeListArray.from_arrays(
casted_array_values, feature.length, mask=array.is_null()
)
return pa.FixedSizeListArray.from_arrays(
casted_array_values, feature.length, mask=array.is_null()
)
else:
array_values = array.values[
array.offset * feature.length : (array.offset + len(array)) * feature.length
Expand All @@ -2091,17 +2060,7 @@ def cast_array_to_feature(
array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size
]
casted_array_values = _c(array_values, feature.feature)
if config.PYARROW_VERSION.major < 15:
return pa.Array.from_buffers(
pa.list_(casted_array_values.type, feature.length),
len(array),
[array.is_valid().buffers()[1]],
children=[casted_array_values],
)
else:
return pa.FixedSizeListArray.from_arrays(
casted_array_values, feature.length, mask=array.is_null()
)
return pa.FixedSizeListArray.from_arrays(casted_array_values, feature.length, mask=array.is_null())
else:
array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
return pa.ListArray.from_arrays(array_offsets, _c(array.values, feature.feature), mask=array.is_null())
Expand Down Expand Up @@ -2176,15 +2135,7 @@ def embed_array_storage(array: pa.Array, feature: "FeatureType"):
array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size
]
embedded_array_values = _e(array_values, feature.feature)
if config.PYARROW_VERSION.major < 15:
return pa.Array.from_buffers(
pa.list_(array_values.type, feature.length),
len(array),
[array.is_valid().buffers()[1]],
children=[embedded_array_values],
)
else:
return pa.FixedSizeListArray.from_arrays(embedded_array_values, feature.length, mask=array.is_null())
return pa.FixedSizeListArray.from_arrays(embedded_array_values, feature.length, mask=array.is_null())
if not isinstance(feature, (Sequence, dict, list, tuple)):
return array
raise TypeError(f"Couldn't embed array of type\n{_short_str(array.type)}\nwith\n{_short_str(feature)}")
Expand Down

0 comments on commit e3b33eb

Please sign in to comment.