From c93445976d856300aa12804343541bf370d47729 Mon Sep 17 00:00:00 2001 From: Max Piskunov Date: Mon, 13 Jan 2025 23:47:49 +0000 Subject: [PATCH 1/2] feat(python, rust): add statistics_enabled to ColumnProperties Signed-off-by: Max Piskunov --- python/deltalake/table.py | 8 +++++++- python/src/lib.rs | 20 +++++++++++++++++++- python/tests/test_writerproperties.py | 2 ++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/python/deltalake/table.py b/python/deltalake/table.py index d4e4dd192e..caafd2eb21 100644 --- a/python/deltalake/table.py +++ b/python/deltalake/table.py @@ -217,6 +217,7 @@ class ColumnProperties: def __init__( self, dictionary_enabled: Optional[bool] = None, + statistics_enabled: Optional[Literal["NONE", "CHUNK", "PAGE"]] = None, max_statistics_size: Optional[int] = None, bloom_filter_properties: Optional[BloomFilterProperties] = None, ): @@ -224,15 +225,20 @@ def __init__( Args: dictionary_enabled: Enable dictionary encoding for the column. + statistics_enabled: Statistics level for the column. max_statistics_size: Maximum size of statistics for the column. bloom_filter_properties: Bloom Filter Properties for the column. """ self.dictionary_enabled = dictionary_enabled + self.statistics_enabled = statistics_enabled self.max_statistics_size = max_statistics_size self.bloom_filter_properties = bloom_filter_properties def __str__(self) -> str: - return f"dictionary_enabled: {self.dictionary_enabled}, max_statistics_size: {self.max_statistics_size}, bloom_filter_properties: {self.bloom_filter_properties}" + return ( + f"dictionary_enabled: {self.dictionary_enabled}, statistics_enabled: {self.statistics_enabled}, " + f"max_statistics_size: {self.max_statistics_size}, bloom_filter_properties: {self.bloom_filter_properties}" + ) @dataclass(init=True) diff --git a/python/src/lib.rs b/python/src/lib.rs index 8ea08158e8..a4551bf641 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -52,7 +52,7 @@ use deltalake::operations::vacuum::VacuumBuilder; use deltalake::operations::{collect_sendable_stream, CustomExecuteHandler}; use deltalake::parquet::basic::Compression; use deltalake::parquet::errors::ParquetError; -use deltalake::parquet::file::properties::WriterProperties; +use deltalake::parquet::file::properties::{EnabledStatistics, WriterProperties}; use deltalake::partitions::PartitionFilter; use deltalake::protocol::{DeltaOperation, SaveMode}; use deltalake::storage::{IORuntime, ObjectStoreRef}; @@ -1566,6 +1566,13 @@ fn set_writer_properties(writer_properties: PyWriterProperties) -> DeltaResult DeltaResult, + pub statistics_enabled: Option, pub max_statistics_size: Option, pub bloom_filter_properties: Option, } diff --git a/python/tests/test_writerproperties.py b/python/tests/test_writerproperties.py index 30c25548ad..a9a8db5868 100644 --- a/python/tests/test_writerproperties.py +++ b/python/tests/test_writerproperties.py @@ -28,6 +28,7 @@ def test_writer_properties_all_filled(): column_properties={ "a": ColumnProperties( dictionary_enabled=True, + statistics_enabled="CHUNK", max_statistics_size=40, bloom_filter_properties=BloomFilterProperties( set_bloom_filter_enabled=True, fpp=0.2, ndv=30 @@ -35,6 +36,7 @@ def test_writer_properties_all_filled(): ), "b": ColumnProperties( dictionary_enabled=True, + statistics_enabled="PAGE", max_statistics_size=400, bloom_filter_properties=BloomFilterProperties( set_bloom_filter_enabled=False, fpp=0.2, ndv=30 From 92f5cac35f1d82fc40d7432e2dd50d5af4be6557 Mon Sep 17 00:00:00 2001 From: Max Piskunov Date: Tue, 14 Jan 2025 17:08:08 +0000 Subject: [PATCH 2/2] remove max_statistics_size from ColumnProperties Signed-off-by: Max Piskunov --- python/deltalake/table.py | 5 +---- python/src/lib.rs | 4 ---- python/tests/test_writerproperties.py | 2 -- 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/python/deltalake/table.py b/python/deltalake/table.py index caafd2eb21..f8357c3700 100644 --- a/python/deltalake/table.py +++ b/python/deltalake/table.py @@ -218,7 +218,6 @@ def __init__( self, dictionary_enabled: Optional[bool] = None, statistics_enabled: Optional[Literal["NONE", "CHUNK", "PAGE"]] = None, - max_statistics_size: Optional[int] = None, bloom_filter_properties: Optional[BloomFilterProperties] = None, ): """Create a Column Properties instance for the Rust parquet writer: @@ -226,18 +225,16 @@ def __init__( Args: dictionary_enabled: Enable dictionary encoding for the column. statistics_enabled: Statistics level for the column. - max_statistics_size: Maximum size of statistics for the column. bloom_filter_properties: Bloom Filter Properties for the column. """ self.dictionary_enabled = dictionary_enabled self.statistics_enabled = statistics_enabled - self.max_statistics_size = max_statistics_size self.bloom_filter_properties = bloom_filter_properties def __str__(self) -> str: return ( f"dictionary_enabled: {self.dictionary_enabled}, statistics_enabled: {self.statistics_enabled}, " - f"max_statistics_size: {self.max_statistics_size}, bloom_filter_properties: {self.bloom_filter_properties}" + f"bloom_filter_properties: {self.bloom_filter_properties}" ) diff --git a/python/src/lib.rs b/python/src/lib.rs index a4551bf641..b91874616d 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -1573,9 +1573,6 @@ fn set_writer_properties(writer_properties: PyWriterProperties) -> DeltaResult, pub statistics_enabled: Option, - pub max_statistics_size: Option, pub bloom_filter_properties: Option, } diff --git a/python/tests/test_writerproperties.py b/python/tests/test_writerproperties.py index a9a8db5868..822ad82c02 100644 --- a/python/tests/test_writerproperties.py +++ b/python/tests/test_writerproperties.py @@ -29,7 +29,6 @@ def test_writer_properties_all_filled(): "a": ColumnProperties( dictionary_enabled=True, statistics_enabled="CHUNK", - max_statistics_size=40, bloom_filter_properties=BloomFilterProperties( set_bloom_filter_enabled=True, fpp=0.2, ndv=30 ), @@ -37,7 +36,6 @@ def test_writer_properties_all_filled(): "b": ColumnProperties( dictionary_enabled=True, statistics_enabled="PAGE", - max_statistics_size=400, bloom_filter_properties=BloomFilterProperties( set_bloom_filter_enabled=False, fpp=0.2, ndv=30 ),