
fix KeyError raised by add_files when parquet file does not have column stats #1354

Merged
merged 5 commits into from Nov 25, 2024
Changes from 2 commits
4 changes: 2 additions & 2 deletions pyiceberg/io/pyarrow.py
@@ -2397,8 +2397,8 @@ def data_file_statistics_from_parquet_metadata(
     split_offsets.sort()

     for field_id in invalidate_col:
-        del col_aggs[field_id]
-        del null_value_counts[field_id]
+        col_aggs.pop(field_id, None)
+        null_value_counts.pop(field_id, None)

     return DataFileStatistics(
         record_count=parquet_metadata.num_rows,
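For context, a minimal sketch of why the change above avoids the error; the field IDs and empty dicts below are hypothetical stand-ins, not the real call site. When a Parquet file is written without column statistics, some field IDs never get entries in these dicts, so an unconditional del raises KeyError:

col_aggs = {}            # no aggregates were collected for any column
null_value_counts = {}   # no null counts were collected either
invalidate_col = {1, 2}  # field IDs flagged for invalidation

for field_id in invalidate_col:
    # del col_aggs[field_id] would raise KeyError because the key was never added;
    # pop with a default is simply a no-op when the key is absent.
    col_aggs.pop(field_id, None)
    null_value_counts.pop(field_id, None)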
67 changes: 67 additions & 0 deletions tests/io/test_pyarrow_stats.py
@@ -681,6 +681,73 @@ def test_stats_types(table_schema_nested: Schema) -> None:
]


def construct_test_table_without_stats() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
    table_metadata = {
        "format-version": 2,
        "location": "s3://bucket/test/location",
        "last-column-id": 7,
        "current-schema-id": 0,
        "schemas": [
            {
                "type": "struct",
                "schema-id": 0,
                "fields": [
                    {"id": 1, "name": "strings", "required": False, "type": "string"},
                    {"id": 2, "name": "floats", "required": False, "type": "float"}
                ]
            }
        ],
        "default-spec-id": 0,
        "partition-specs": [{"spec-id": 0, "fields": []}],
        "properties": {},
    }

    table_metadata = TableMetadataUtil.parse_obj(table_metadata)
    arrow_schema = schema_to_pyarrow(table_metadata.schemas[0])
    _strings = ["zzzzzzzzzzzzzzzzzzzz", "rrrrrrrrrrrrrrrrrrrr", None, "aaaaaaaaaaaaaaaaaaaa"]
    _floats = [3.14, math.nan, 1.69, 100]

    table = pa.Table.from_pydict(
        {
            "strings": _strings,
            "floats": _floats
        },
        schema=arrow_schema,
    )

    metadata_collector: List[Any] = []

    with pa.BufferOutputStream() as f:
        with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector, write_statistics=False) as writer:
            writer.write_table(table)

    return metadata_collector[0], table_metadata


def test_is_stats_set_false() -> None:
    metadata, table_metadata = construct_test_table_without_stats()
    schema = get_current_schema(table_metadata)
    statistics = data_file_statistics_from_parquet_metadata(
        parquet_metadata=metadata,
        stats_columns=compute_statistics_plan(schema, table_metadata.properties),
        parquet_column_mapping=parquet_path_to_id_mapping(schema),
    )
    datafile = DataFile(**statistics.to_serialized_dict())

    # assert attributes except for column_aggregates and null_value_counts are present
    assert datafile.record_count == 4

    assert len(datafile.column_sizes) == 2
    assert datafile.column_sizes[1] > 0
    assert datafile.column_sizes[2] > 0

    assert len(datafile.nan_value_counts) == 0

    assert datafile.split_offsets is not None
    assert len(datafile.split_offsets) == 1
    assert datafile.split_offsets[0] == 4


# This is commented out for now because write_to_dataset drops the partition
# columns making it harder to calculate the mapping from the column index to
# datatype id
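For reference, a hedged sketch of how the failing path was typically reached through the public API; the catalog name, table identifier, and file path below are placeholders, and the statistics-free Parquet file mirrors the test helper above:

import pyarrow as pa
import pyarrow.parquet as pq
from pyiceberg.catalog import load_catalog

# Write a Parquet file without column statistics, as the test helper above does.
arrow_table = pa.Table.from_pydict({"strings": ["a", None], "floats": [1.0, 2.0]})
pq.write_table(arrow_table, "/tmp/no_stats.parquet", write_statistics=False)

# Placeholder catalog and table names; before this fix, add_files raised a
# KeyError while building DataFileStatistics for such a file.
catalog = load_catalog("default")
tbl = catalog.load_table("db.my_table")
tbl.add_files(file_paths=["/tmp/no_stats.parquet"])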