From fe20d4dd88322a5f8a994cd5dd4aa33642c1e475 Mon Sep 17 00:00:00 2001
From: Binayak
Date: Thu, 21 Nov 2024 10:37:22 +0800
Subject: [PATCH 1/5] fix KeyError, by switching del to pop

---
 pyiceberg/io/pyarrow.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 9ab198106..23aec2d35 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -2397,8 +2397,8 @@ def data_file_statistics_from_parquet_metadata(
         split_offsets.sort()

     for field_id in invalidate_col:
-        del col_aggs[field_id]
-        del null_value_counts[field_id]
+        col_aggs.pop(field_id, None)
+        null_value_counts.pop(field_id, None)

     return DataFileStatistics(
         record_count=parquet_metadata.num_rows,

From 6de54c61b7f289930ec3f95fd19ead16e619c8d0 Mon Sep 17 00:00:00 2001
From: Binayak
Date: Thu, 21 Nov 2024 16:39:06 +0800
Subject: [PATCH 2/5] added unit test

---
 tests/io/test_pyarrow_stats.py | 67 ++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/tests/io/test_pyarrow_stats.py b/tests/io/test_pyarrow_stats.py
index 41f1432db..0959a6260 100644
--- a/tests/io/test_pyarrow_stats.py
+++ b/tests/io/test_pyarrow_stats.py
@@ -681,6 +681,73 @@ def test_stats_types(table_schema_nested: Schema) -> None:
     ]


+def construct_test_table_without_stats() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
+    table_metadata = {
+        "format-version": 2,
+        "location": "s3://bucket/test/location",
+        "last-column-id": 7,
+        "current-schema-id": 0,
+        "schemas": [
+            {
+                "type": "struct",
+                "schema-id": 0,
+                "fields": [
+                    {"id": 1, "name": "strings", "required": False, "type": "string"},
+                    {"id": 2, "name": "floats", "required": False, "type": "float"}
+                ]
+            }
+        ],
+        "default-spec-id": 0,
+        "partition-specs": [{"spec-id": 0, "fields": []}],
+        "properties": {},
+    }
+
+    table_metadata = TableMetadataUtil.parse_obj(table_metadata)
+    arrow_schema = schema_to_pyarrow(table_metadata.schemas[0])
+    _strings = ["zzzzzzzzzzzzzzzzzzzz", "rrrrrrrrrrrrrrrrrrrr", None, "aaaaaaaaaaaaaaaaaaaa"]
+    _floats = [3.14, math.nan, 1.69, 100]
+
+    table = pa.Table.from_pydict(
+        {
+            "strings": _strings,
+            "floats": _floats
+        },
+        schema=arrow_schema,
+    )
+
+    metadata_collector: List[Any] = []
+
+    with pa.BufferOutputStream() as f:
+        with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector, write_statistics=False) as writer:
+            writer.write_table(table)
+
+    return metadata_collector[0], table_metadata
+
+
+def test_is_stats_set_false() -> None:
+    metadata, table_metadata = construct_test_table_without_stats()
+    schema = get_current_schema(table_metadata)
+    statistics = data_file_statistics_from_parquet_metadata(
+        parquet_metadata=metadata,
+        stats_columns=compute_statistics_plan(schema, table_metadata.properties),
+        parquet_column_mapping=parquet_path_to_id_mapping(schema),
+    )
+    datafile = DataFile(**statistics.to_serialized_dict())
+
+    # assert attributes except for column_aggregates and null_value_counts are present
+    assert datafile.record_count == 4
+
+    assert len(datafile.column_sizes) == 2
+    assert datafile.column_sizes[1] > 0
+    assert datafile.column_sizes[2] > 0
+
+    assert len(datafile.nan_value_counts) == 0
+
+    assert datafile.split_offsets is not None
+    assert len(datafile.split_offsets) == 1
+    assert datafile.split_offsets[0] == 4
+
+
 # This is commented out for now because write_to_dataset drops the partition
 # columns making it harder to calculate the mapping from the column index to
 # datatype id
From 5c676b83056b45f525926b6684f26140a3faccd8 Mon Sep 17 00:00:00 2001
From: Binayak
Date: Fri, 22 Nov 2024 14:04:29 +0800
Subject: [PATCH 3/5] update test

---
 tests/io/test_pyarrow_stats.py | 82 +++++++++++-----------------------
 1 file changed, 26 insertions(+), 56 deletions(-)

diff --git a/tests/io/test_pyarrow_stats.py b/tests/io/test_pyarrow_stats.py
index 0959a6260..f1c092991 100644
--- a/tests/io/test_pyarrow_stats.py
+++ b/tests/io/test_pyarrow_stats.py
@@ -81,7 +81,9 @@ class TestStruct:
     y: Optional[float]


-def construct_test_table() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
+def construct_test_table(
+    write_statistics: bool | List[str] = True,
+) -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
     table_metadata = {
         "format-version": 2,
         "location": "s3://bucket/test/location",
@@ -169,7 +171,9 @@ def construct_test_table() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, Tabl
     metadata_collector: List[Any] = []

     with pa.BufferOutputStream() as f:
-        with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector) as writer:
+        with pq.ParquetWriter(
+            f, table.schema, metadata_collector=metadata_collector, write_statistics=write_statistics
+        ) as writer:
             writer.write_table(table)

     return metadata_collector[0], table_metadata
@@ -681,71 +685,37 @@ def test_stats_types(table_schema_nested: Schema) -> None:
     ]


-def construct_test_table_without_stats() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
-    table_metadata = {
-        "format-version": 2,
-        "location": "s3://bucket/test/location",
-        "last-column-id": 7,
-        "current-schema-id": 0,
-        "schemas": [
-            {
-                "type": "struct",
-                "schema-id": 0,
-                "fields": [
-                    {"id": 1, "name": "strings", "required": False, "type": "string"},
-                    {"id": 2, "name": "floats", "required": False, "type": "float"}
-                ]
-            }
-        ],
-        "default-spec-id": 0,
-        "partition-specs": [{"spec-id": 0, "fields": []}],
-        "properties": {},
-    }
+def test_read_missing_statistics() -> None:
+    # write statistics only for the "strings" column
+    metadata, table_metadata = construct_test_table(write_statistics=["strings"])

-    table_metadata = TableMetadataUtil.parse_obj(table_metadata)
-    arrow_schema = schema_to_pyarrow(table_metadata.schemas[0])
-    _strings = ["zzzzzzzzzzzzzzzzzzzz", "rrrrrrrrrrrrrrrrrrrr", None, "aaaaaaaaaaaaaaaaaaaa"]
-    _floats = [3.14, math.nan, 1.69, 100]
+    # expect only "strings" column to have statistics in metadata
+    assert metadata.row_group(0).column(0).is_stats_set is True
+    assert metadata.row_group(0).column(0).statistics is not None

-    table = pa.Table.from_pydict(
-        {
-            "strings": _strings,
-            "floats": _floats
-        },
-        schema=arrow_schema,
-    )
-
-    metadata_collector: List[Any] = []
+    # expect all other columns to have no statistics
+    for r in range(metadata.num_row_groups):
+        for pos in range(1, metadata.num_columns):
+            assert metadata.row_group(r).column(pos).is_stats_set is False
+            assert metadata.row_group(r).column(pos).statistics is None

-    with pa.BufferOutputStream() as f:
-        with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector, write_statistics=False) as writer:
-            writer.write_table(table)
-
-    return metadata_collector[0], table_metadata
-
-
-def test_is_stats_set_false() -> None:
-    metadata, table_metadata = construct_test_table_without_stats()
     schema = get_current_schema(table_metadata)
     statistics = data_file_statistics_from_parquet_metadata(
         parquet_metadata=metadata,
         stats_columns=compute_statistics_plan(schema, table_metadata.properties),
         parquet_column_mapping=parquet_path_to_id_mapping(schema),
     )
-    datafile = DataFile(**statistics.to_serialized_dict())

-    # assert attributes except for column_aggregates and null_value_counts are present
-    assert datafile.record_count == 4
-
-    assert len(datafile.column_sizes) == 2
-    assert datafile.column_sizes[1] > 0
-    assert datafile.column_sizes[2] > 0
-
-    assert len(datafile.nan_value_counts) == 0
+    datafile = DataFile(**statistics.to_serialized_dict())

-    assert datafile.split_offsets is not None
-    assert len(datafile.split_offsets) == 1
-    assert datafile.split_offsets[0] == 4
+    # expect only "strings" column values to be reflected in the
+    # upper_bound, lower_bound and null_value_counts props of datafile
+    assert len(datafile.lower_bounds) == 1
+    assert datafile.lower_bounds[1].decode() == "aaaaaaaaaaaaaaaa"
+    assert len(datafile.upper_bounds) == 1
+    assert datafile.upper_bounds[1].decode() == "zzzzzzzzzzzzzzz{"
+    assert len(datafile.null_value_counts) == 1
+    assert datafile.null_value_counts[1] == 1


 # This is commented out for now because write_to_dataset drops the partition

From 939d325693e5a484bed0377310d49f04eb1fef31 Mon Sep 17 00:00:00 2001
From: Binayak
Date: Sat, 23 Nov 2024 10:41:35 +0800
Subject: [PATCH 4/5] fix python 3.9 compatibility, and refactor test

---
 tests/io/test_pyarrow_stats.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/tests/io/test_pyarrow_stats.py b/tests/io/test_pyarrow_stats.py
index f1c092991..241dd800a 100644
--- a/tests/io/test_pyarrow_stats.py
+++ b/tests/io/test_pyarrow_stats.py
@@ -82,7 +82,7 @@ class TestStruct:


 def construct_test_table(
-    write_statistics: bool | List[str] = True,
+    write_statistics: Union[bool, List[str]] = True,
 ) -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
     table_metadata = {
         "format-version": 2,
@@ -690,14 +690,15 @@ def test_read_missing_statistics() -> None:
     metadata, table_metadata = construct_test_table(write_statistics=["strings"])

     # expect only "strings" column to have statistics in metadata
-    assert metadata.row_group(0).column(0).is_stats_set is True
-    assert metadata.row_group(0).column(0).statistics is not None
-
-    # expect all other columns to have no statistics
+    # and all other columns to have no statistics
     for r in range(metadata.num_row_groups):
-        for pos in range(1, metadata.num_columns):
-            assert metadata.row_group(r).column(pos).is_stats_set is False
-            assert metadata.row_group(r).column(pos).statistics is None
+        for pos in range(metadata.num_columns):
+            if metadata.row_group(r).column(pos).path_in_schema == "strings":
+                assert metadata.row_group(r).column(pos).is_stats_set is True
+                assert metadata.row_group(r).column(pos).statistics is not None
+            else:
+                assert metadata.row_group(r).column(pos).is_stats_set is False
+                assert metadata.row_group(r).column(pos).statistics is None

From e26ef5007c2c69576d948e3e6fab54b210ae1fea Mon Sep 17 00:00:00 2001
From: Binayak
Date: Mon, 25 Nov 2024 13:49:18 +0800
Subject: [PATCH 5/5] update test

---
 tests/io/test_pyarrow_stats.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/io/test_pyarrow_stats.py b/tests/io/test_pyarrow_stats.py
index 241dd800a..788891711 100644
--- a/tests/io/test_pyarrow_stats.py
+++ b/tests/io/test_pyarrow_stats.py
@@ -711,12 +711,13 @@ def test_read_missing_statistics() -> None:

     # expect only "strings" column values to be reflected in the
     # upper_bound, lower_bound and null_value_counts props of datafile
+    string_col_idx = 1
     assert len(datafile.lower_bounds) == 1
-    assert datafile.lower_bounds[1].decode() == "aaaaaaaaaaaaaaaa"
+    assert datafile.lower_bounds[string_col_idx].decode() == "aaaaaaaaaaaaaaaa"
     assert len(datafile.upper_bounds) == 1
-    assert datafile.upper_bounds[1].decode() == "zzzzzzzzzzzzzzz{"
+    assert datafile.upper_bounds[string_col_idx].decode() == "zzzzzzzzzzzzzzz{"
     assert len(datafile.null_value_counts) == 1
-    assert datafile.null_value_counts[1] == 1
+    assert datafile.null_value_counts[string_col_idx] == 1


 # This is commented out for now because write_to_dataset drops the partition
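
Note on PATCH 1/5 (illustrative, not part of the patch series): when a Parquet file is written without statistics, a field id can end up in invalidate_col even though col_aggs and null_value_counts never received an entry for it, so `del` raises KeyError while `dict.pop(field_id, None)` simply skips the missing key. A minimal sketch with made-up values:

    # Sketch only: field 2 has no collected statistics, so it never got an entry.
    col_aggs = {1: "aggregate-for-field-1"}
    null_value_counts = {1: 0}
    invalidate_col = {1, 2}  # field 2 must still be invalidated

    for field_id in invalidate_col:
        col_aggs.pop(field_id, None)           # no-op when the key is absent
        null_value_counts.pop(field_id, None)

    # With the previous code, `del col_aggs[2]` would raise KeyError here.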
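
Background on the expected bounds in test_read_missing_statistics (illustrative, not part of the patch series): Iceberg's default metrics mode is truncate(16), so string bounds are cut to 16 characters; a lower bound can be used truncated as-is, while a truncated upper bound has its last character incremented so it still sorts at or above every actual value. That is why the 20-character inputs from construct_test_table yield "aaaaaaaaaaaaaaaa" and "zzzzzzzzzzzzzzz{" ("{" is the character after "z" in ASCII). A rough sketch of the idea, not PyIceberg's actual implementation (which also handles overflow of the last character):

    # Rough sketch of truncate(16) bound handling; illustrative only.
    def truncate_lower_bound(value: str, width: int = 16) -> str:
        return value[:width]

    def truncate_upper_bound(value: str, width: int = 16) -> str:
        if len(value) <= width:
            return value
        truncated = value[:width]
        # Bump the last character so the bound still sorts above the real maximum.
        return truncated[:-1] + chr(ord(truncated[-1]) + 1)

    assert truncate_lower_bound("a" * 20) == "a" * 16
    assert truncate_upper_bound("z" * 20) == "z" * 15 + "{"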