From fe20d4dd88322a5f8a994cd5dd4aa33642c1e475 Mon Sep 17 00:00:00 2001
From: Binayak
Date: Thu, 21 Nov 2024 10:37:22 +0800
Subject: [PATCH 1/5] fix KeyError, by switching del to pop

---
 pyiceberg/io/pyarrow.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 9ab198106..23aec2d35 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -2397,8 +2397,8 @@ def data_file_statistics_from_parquet_metadata(
         split_offsets.sort()

     for field_id in invalidate_col:
-        del col_aggs[field_id]
-        del null_value_counts[field_id]
+        col_aggs.pop(field_id, None)
+        null_value_counts.pop(field_id, None)

     return DataFileStatistics(
         record_count=parquet_metadata.num_rows,

From 6de54c61b7f289930ec3f95fd19ead16e619c8d0 Mon Sep 17 00:00:00 2001
From: Binayak
Date: Thu, 21 Nov 2024 16:39:06 +0800
Subject: [PATCH 2/5] added unit test

---
 tests/io/test_pyarrow_stats.py | 67 ++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/tests/io/test_pyarrow_stats.py b/tests/io/test_pyarrow_stats.py
index 41f1432db..0959a6260 100644
--- a/tests/io/test_pyarrow_stats.py
+++ b/tests/io/test_pyarrow_stats.py
@@ -681,6 +681,73 @@ def test_stats_types(table_schema_nested: Schema) -> None:
     ]


+def construct_test_table_without_stats() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
+    table_metadata = {
+        "format-version": 2,
+        "location": "s3://bucket/test/location",
+        "last-column-id": 7,
+        "current-schema-id": 0,
+        "schemas": [
+            {
+                "type": "struct",
+                "schema-id": 0,
+                "fields": [
+                    {"id": 1, "name": "strings", "required": False, "type": "string"},
+                    {"id": 2, "name": "floats", "required": False, "type": "float"}
+                ]
+            }
+        ],
+        "default-spec-id": 0,
+        "partition-specs": [{"spec-id": 0, "fields": []}],
+        "properties": {},
+    }
+
+    table_metadata = TableMetadataUtil.parse_obj(table_metadata)
+    arrow_schema = schema_to_pyarrow(table_metadata.schemas[0])
+    _strings = ["zzzzzzzzzzzzzzzzzzzz", "rrrrrrrrrrrrrrrrrrrr", None, "aaaaaaaaaaaaaaaaaaaa"]
+    _floats = [3.14, math.nan, 1.69, 100]
+
+    table = pa.Table.from_pydict(
+        {
+            "strings": _strings,
+            "floats": _floats
+        },
+        schema=arrow_schema,
+    )
+
+    metadata_collector: List[Any] = []
+
+    with pa.BufferOutputStream() as f:
+        with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector, write_statistics=False) as writer:
+            writer.write_table(table)
+
+    return metadata_collector[0], table_metadata
+
+
+def test_is_stats_set_false() -> None:
+    metadata, table_metadata = construct_test_table_without_stats()
+    schema = get_current_schema(table_metadata)
+    statistics = data_file_statistics_from_parquet_metadata(
+        parquet_metadata=metadata,
+        stats_columns=compute_statistics_plan(schema, table_metadata.properties),
+        parquet_column_mapping=parquet_path_to_id_mapping(schema),
+    )
+    datafile = DataFile(**statistics.to_serialized_dict())
+
+    # assert attributes except for column_aggregates and null_value_counts are present
+    assert datafile.record_count == 4
+
+    assert len(datafile.column_sizes) == 2
+    assert datafile.column_sizes[1] > 0
+    assert datafile.column_sizes[2] > 0
+
+    assert len(datafile.nan_value_counts) == 0
+
+    assert datafile.split_offsets is not None
+    assert len(datafile.split_offsets) == 1
+    assert datafile.split_offsets[0] == 4
+
+
 # This is commented out for now because write_to_dataset drops the partition
 # columns making it harder to calculate the mapping from the column index to
 # datatype id
From 5c676b83056b45f525926b6684f26140a3faccd8 Mon Sep 17 00:00:00 2001
From: Binayak
Date: Fri, 22 Nov 2024 14:04:29 +0800
Subject: [PATCH 3/5] update test

---
 tests/io/test_pyarrow_stats.py | 82 +++++++++++-----------------------
 1 file changed, 26 insertions(+), 56 deletions(-)

diff --git a/tests/io/test_pyarrow_stats.py b/tests/io/test_pyarrow_stats.py
index 0959a6260..f1c092991 100644
--- a/tests/io/test_pyarrow_stats.py
+++ b/tests/io/test_pyarrow_stats.py
@@ -81,7 +81,9 @@ class TestStruct:
     y: Optional[float]


-def construct_test_table() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
+def construct_test_table(
+    write_statistics: bool | List[str] = True,
+) -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
     table_metadata = {
         "format-version": 2,
         "location": "s3://bucket/test/location",
@@ -169,7 +171,9 @@ def construct_test_table() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, Tabl
     metadata_collector: List[Any] = []

     with pa.BufferOutputStream() as f:
-        with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector) as writer:
+        with pq.ParquetWriter(
+            f, table.schema, metadata_collector=metadata_collector, write_statistics=write_statistics
+        ) as writer:
             writer.write_table(table)

     return metadata_collector[0], table_metadata
@@ -681,71 +685,37 @@ def test_stats_types(table_schema_nested: Schema) -> None:
     ]


-def construct_test_table_without_stats() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
-    table_metadata = {
-        "format-version": 2,
-        "location": "s3://bucket/test/location",
-        "last-column-id": 7,
-        "current-schema-id": 0,
-        "schemas": [
-            {
-                "type": "struct",
-                "schema-id": 0,
-                "fields": [
-                    {"id": 1, "name": "strings", "required": False, "type": "string"},
-                    {"id": 2, "name": "floats", "required": False, "type": "float"}
-                ]
-            }
-        ],
-        "default-spec-id": 0,
-        "partition-specs": [{"spec-id": 0, "fields": []}],
-        "properties": {},
-    }
+def test_read_missing_statistics() -> None:
+    # write statistics only for the "strings" column
+    metadata, table_metadata = construct_test_table(write_statistics=["strings"])

-    table_metadata = TableMetadataUtil.parse_obj(table_metadata)
-    arrow_schema = schema_to_pyarrow(table_metadata.schemas[0])
-    _strings = ["zzzzzzzzzzzzzzzzzzzz", "rrrrrrrrrrrrrrrrrrrr", None, "aaaaaaaaaaaaaaaaaaaa"]
-    _floats = [3.14, math.nan, 1.69, 100]
+    # expect only "strings" column to have statistics in metadata
+    assert metadata.row_group(0).column(0).is_stats_set is True
+    assert metadata.row_group(0).column(0).statistics is not None

-    table = pa.Table.from_pydict(
-        {
-            "strings": _strings,
-            "floats": _floats
-        },
-        schema=arrow_schema,
-    )
-
-    metadata_collector: List[Any] = []
+    # expect all other columns to have no statistics
+    for r in range(metadata.num_row_groups):
+        for pos in range(1, metadata.num_columns):
+            assert metadata.row_group(r).column(pos).is_stats_set is False
+            assert metadata.row_group(r).column(pos).statistics is None

-    with pa.BufferOutputStream() as f:
-        with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector, write_statistics=False) as writer:
-            writer.write_table(table)
-
-    return metadata_collector[0], table_metadata
-
-
-def test_is_stats_set_false() -> None:
-    metadata, table_metadata = construct_test_table_without_stats()
     schema = get_current_schema(table_metadata)
     statistics = data_file_statistics_from_parquet_metadata(
         parquet_metadata=metadata,
         stats_columns=compute_statistics_plan(schema, table_metadata.properties),
         parquet_column_mapping=parquet_path_to_id_mapping(schema),
     )
-    datafile = DataFile(**statistics.to_serialized_dict())

-    # assert attributes except for column_aggregates and null_value_counts are present
-    assert datafile.record_count == 4
-
-    assert len(datafile.column_sizes) == 2
-    assert datafile.column_sizes[1] > 0
-    assert datafile.column_sizes[2] > 0
-
-    assert len(datafile.nan_value_counts) == 0
+    datafile = DataFile(**statistics.to_serialized_dict())

-    assert datafile.split_offsets is not None
-    assert len(datafile.split_offsets) == 1
-    assert datafile.split_offsets[0] == 4
+    # expect only "strings" column values to be reflected in the
+    # upper_bound, lower_bound and null_value_counts props of datafile
+    assert len(datafile.lower_bounds) == 1
+    assert datafile.lower_bounds[1].decode() == "aaaaaaaaaaaaaaaa"
+    assert len(datafile.upper_bounds) == 1
+    assert datafile.upper_bounds[1].decode() == "zzzzzzzzzzzzzzz{"
+    assert len(datafile.null_value_counts) == 1
+    assert datafile.null_value_counts[1] == 1


 # This is commented out for now because write_to_dataset drops the partition

From 939d325693e5a484bed0377310d49f04eb1fef31 Mon Sep 17 00:00:00 2001
From: Binayak
Date: Sat, 23 Nov 2024 10:41:35 +0800
Subject: [PATCH 4/5] fix python 3.9 compatibility, and refactor test

---
 tests/io/test_pyarrow_stats.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/tests/io/test_pyarrow_stats.py b/tests/io/test_pyarrow_stats.py
index f1c092991..241dd800a 100644
--- a/tests/io/test_pyarrow_stats.py
+++ b/tests/io/test_pyarrow_stats.py
@@ -82,7 +82,7 @@ class TestStruct:


 def construct_test_table(
-    write_statistics: bool | List[str] = True,
+    write_statistics: Union[bool, List[str]] = True,
 ) -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
     table_metadata = {
         "format-version": 2,
@@ -690,14 +690,15 @@ def test_read_missing_statistics() -> None:
     metadata, table_metadata = construct_test_table(write_statistics=["strings"])

     # expect only "strings" column to have statistics in metadata
-    assert metadata.row_group(0).column(0).is_stats_set is True
-    assert metadata.row_group(0).column(0).statistics is not None
-
-    # expect all other columns to have no statistics
+    # and all other columns to have no statistics
     for r in range(metadata.num_row_groups):
-        for pos in range(1, metadata.num_columns):
-            assert metadata.row_group(r).column(pos).is_stats_set is False
-            assert metadata.row_group(r).column(pos).statistics is None
+        for pos in range(metadata.num_columns):
+            if metadata.row_group(r).column(pos).path_in_schema == "strings":
+                assert metadata.row_group(r).column(pos).is_stats_set is True
+                assert metadata.row_group(r).column(pos).statistics is not None
+            else:
+                assert metadata.row_group(r).column(pos).is_stats_set is False
+                assert metadata.row_group(r).column(pos).statistics is None

From e26ef5007c2c69576d948e3e6fab54b210ae1fea Mon Sep 17 00:00:00 2001
From: Binayak
Date: Mon, 25 Nov 2024 13:49:18 +0800
Subject: [PATCH 5/5] update test

---
 tests/io/test_pyarrow_stats.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/io/test_pyarrow_stats.py b/tests/io/test_pyarrow_stats.py
index 241dd800a..788891711 100644
--- a/tests/io/test_pyarrow_stats.py
+++ b/tests/io/test_pyarrow_stats.py
@@ -711,12 +711,13 @@ def test_read_missing_statistics() -> None:

     # expect only "strings" column values to be reflected in the
     # upper_bound, lower_bound and null_value_counts props of datafile
+    string_col_idx = 1
     assert len(datafile.lower_bounds) == 1
-    assert datafile.lower_bounds[1].decode() == "aaaaaaaaaaaaaaaa"
+    assert datafile.lower_bounds[string_col_idx].decode() == "aaaaaaaaaaaaaaaa"
     assert len(datafile.upper_bounds) == 1
-    assert datafile.upper_bounds[1].decode() == "zzzzzzzzzzzzzzz{"
+    assert datafile.upper_bounds[string_col_idx].decode() == "zzzzzzzzzzzzzzz{"
     assert len(datafile.null_value_counts) == 1
-    assert datafile.null_value_counts[1] == 1
+    assert datafile.null_value_counts[string_col_idx] == 1


 # This is commented out for now because write_to_dataset drops the partition
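
Note on PATCH 1/5 (illustrative, not part of the patch series): when a Parquet file is written without statistics, a field id can end up in invalidate_col even though col_aggs and null_value_counts never received an entry for it, so `del` raises KeyError while `dict.pop(field_id, None)` simply skips the missing key. A minimal sketch with made-up values:

    # Sketch only: field 2 has no collected statistics, so it never got an entry.
    col_aggs = {1: "aggregate-for-field-1"}
    null_value_counts = {1: 0}
    invalidate_col = {1, 2}  # field 2 must still be invalidated

    for field_id in invalidate_col:
        col_aggs.pop(field_id, None)           # no-op when the key is absent
        null_value_counts.pop(field_id, None)

    # With the previous code, `del col_aggs[2]` would raise KeyError here.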
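
Background on the expected bounds in test_read_missing_statistics (illustrative, not part of the patch series): Iceberg's default metrics mode is truncate(16), so string bounds are cut to 16 characters; a lower bound can be used truncated as-is, while a truncated upper bound has its last character incremented so it still sorts at or above every actual value. That is why the 20-character inputs from construct_test_table yield "aaaaaaaaaaaaaaaa" and "zzzzzzzzzzzzzzz{" ("{" is the character after "z" in ASCII). A rough sketch of the idea, not PyIceberg's actual implementation (which also handles overflow of the last character):

    # Rough sketch of truncate(16) bound handling; illustrative only.
    def truncate_lower_bound(value: str, width: int = 16) -> str:
        return value[:width]

    def truncate_upper_bound(value: str, width: int = 16) -> str:
        if len(value) <= width:
            return value
        truncated = value[:width]
        # Bump the last character so the bound still sorts above the real maximum.
        return truncated[:-1] + chr(ord(truncated[-1]) + 1)

    assert truncate_lower_bound("a" * 20) == "a" * 16
    assert truncate_upper_bound("z" * 20) == "z" * 15 + "{"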