fix KeyError raised by add_files when parquet file does not have column stats #1354

Merged 5 commits on Nov 25, 2024
Changes from 3 commits
4 changes: 2 additions & 2 deletions pyiceberg/io/pyarrow.py
@@ -2397,8 +2397,8 @@ def data_file_statistics_from_parquet_metadata(
split_offsets.sort()

for field_id in invalidate_col:
-del col_aggs[field_id]
-del null_value_counts[field_id]
+col_aggs.pop(field_id, None)
+null_value_counts.pop(field_id, None)

return DataFileStatistics(
record_count=parquet_metadata.num_rows,
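
For context, the behavioural difference the fix relies on can be shown with plain Python (not PyIceberg code): del requires the key to be present, while dict.pop with a default handles a missing key silently, which is exactly the situation when a Parquet file carries no statistics for a column. The field ids below are illustrative only.

col_aggs = {1: "aggregate-for-field-1"}

col_aggs.pop(2, None)   # missing key: returns the default (None), no exception
try:
    del col_aggs[2]     # missing key: raises KeyError, which add_files surfaced before this fix
except KeyError:
    pass
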
41 changes: 39 additions & 2 deletions tests/io/test_pyarrow_stats.py
@@ -81,7 +81,9 @@ class TestStruct:
y: Optional[float]


-def construct_test_table() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
+def construct_test_table(
+    write_statistics: bool | List[str] = True,
+) -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
table_metadata = {
"format-version": 2,
"location": "s3://bucket/test/location",
@@ -169,7 +171,9 @@ def construct_test_table() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
metadata_collector: List[Any] = []

with pa.BufferOutputStream() as f:
-with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector) as writer:
+with pq.ParquetWriter(
+    f, table.schema, metadata_collector=metadata_collector, write_statistics=write_statistics
+) as writer:
writer.write_table(table)

return metadata_collector[0], table_metadata
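
For reference, PyArrow's write_statistics option accepts either a boolean or a list of column names, which is what lets the test fixture produce a file with statistics for only one column. A minimal standalone sketch (the file path and column names are illustrative, not taken from the PR):

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"strings": ["a", "z"], "ints": [1, 2]})
pq.write_table(table, "/tmp/partial_stats.parquet", write_statistics=["strings"])

md = pq.read_metadata("/tmp/partial_stats.parquet")
print(md.row_group(0).column(0).is_stats_set)  # True: "strings" has statistics
print(md.row_group(0).column(1).is_stats_set)  # False: "ints" does not
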
@@ -681,6 +685,39 @@ def test_stats_types(table_schema_nested: Schema) -> None:
]


def test_read_missing_statistics() -> None:
# write statistics only for the "strings" column
metadata, table_metadata = construct_test_table(write_statistics=["strings"])

# expect only "strings" column to have statistics in metadata
assert metadata.row_group(0).column(0).is_stats_set is True
assert metadata.row_group(0).column(0).statistics is not None

# expect all other columns to have no statistics
for r in range(metadata.num_row_groups):
for pos in range(1, metadata.num_columns):
assert metadata.row_group(r).column(pos).is_stats_set is False
assert metadata.row_group(r).column(pos).statistics is None

schema = get_current_schema(table_metadata)
statistics = data_file_statistics_from_parquet_metadata(
parquet_metadata=metadata,
stats_columns=compute_statistics_plan(schema, table_metadata.properties),
parquet_column_mapping=parquet_path_to_id_mapping(schema),
)

datafile = DataFile(**statistics.to_serialized_dict())

# expect only "strings" column values to be reflected in the
# upper_bound, lower_bound and null_value_counts props of datafile
assert len(datafile.lower_bounds) == 1
assert datafile.lower_bounds[1].decode() == "aaaaaaaaaaaaaaaa"
assert len(datafile.upper_bounds) == 1
assert datafile.upper_bounds[1].decode() == "zzzzzzzzzzzzzzz{"
assert len(datafile.null_value_counts) == 1
assert datafile.null_value_counts[1] == 1


# This is commented out for now because write_to_dataset drops the partition
# columns making it harder to calculate the mapping from the column index to
# datatype id
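
Finally, a rough sketch of the end-to-end situation this PR addresses, assuming an existing Iceberg table whose schema matches the file; the catalog name, table identifier, and file path below are placeholders rather than anything from the PR. Before this change, add_files could raise KeyError while building column statistics for a Parquet file written without them; with pop(..., None) the missing entries are simply skipped.

import pyarrow as pa
import pyarrow.parquet as pq
from pyiceberg.catalog import load_catalog

# Hypothetical catalog and table, for illustration only.
catalog = load_catalog("default")
tbl = catalog.load_table("db.events")

# Write a Parquet file with column statistics disabled entirely.
pq.write_table(pa.table({"strings": ["a", "b"]}), "/tmp/no_stats.parquet", write_statistics=False)

# Previously this call could fail with KeyError inside
# data_file_statistics_from_parquet_metadata; after this fix it succeeds.
tbl.add_files(file_paths=["/tmp/no_stats.parquet"])
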