From 298cd909869e49f1a3d6b993b5ccd218fdff0009 Mon Sep 17 00:00:00 2001
From: Omkar P <45419097+omkar-foss@users.noreply.github.com>
Date: Thu, 22 Aug 2024 22:12:59 +0530
Subject: [PATCH] feat: public method to get partitions for DeltaTable (#2671)

This adds a public method `partitions()` to the `DeltaTable` class to get
properly formatted partitions (list of dicts) for the table. Also provides
an option to return partitions as a list of tuples, and proxies the
partition filters to Rust `get_active_partitions()`. This also adds
supporting tests for this feature.
---
 python/deltalake/table.py       | 31 ++++++++++++++
 python/tests/test_table_read.py | 72 +++++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+)

diff --git a/python/deltalake/table.py b/python/deltalake/table.py
index 9628bff104..37756c62f1 100644
--- a/python/deltalake/table.py
+++ b/python/deltalake/table.py
@@ -510,6 +510,37 @@ def version(self) -> int:
         """
         return self._table.version()
 
+    def partitions(
+        self,
+        partition_filters: Optional[List[Tuple[str, str, Any]]] = None,
+        as_tuple_list: bool = False,
+    ) -> "List[Dict[str, str]] | List[Tuple[Tuple[str, str], ...]]":
+        """
+        Returns the active partitions of the table, as a list of dicts by default.
+        Example: `[{'day': '1', 'month': '1', 'year': '2020'}, ...]`
+
+        Args:
+            partition_filters: The partition filters that will be used for getting the matched partitions, defaults to `None` (no filtering).
+            as_tuple_list: If `True`, returns each partition as a tuple of `(column, value)` pairs instead of a dict.
+                Example: `[(("day", "5"), ("month", "4"), ("year", "2021")), ...]`
+
+        Returns:
+            One entry per partition, with its columns ordered by name. Empty partitions
+            are skipped, so a table that yields only empty partitions gives `[]`.
+        """
+        # NOTE: the return annotation is quoted on purpose. Left unquoted, the `|`
+        # between two `typing` generics is evaluated when the method is defined and
+        # raises `TypeError` on Python < 3.10 unless annotations are postponed.
+        partitions: List[Any] = []
+        for partition in self._table.get_active_partitions(partition_filters):
+            if not partition:
+                continue
+            # Sort by column name only, so both output shapes are deterministic
+            # and values never have to be compared with each other.
+            ordered = sorted(partition, key=lambda pair: pair[0])
+            partitions.append(tuple(ordered) if as_tuple_list else dict(ordered))
+        return partitions
+
     def files(
         self, partition_filters: Optional[List[Tuple[str, str, Any]]] = None
     ) -> List[str]:
diff --git a/python/tests/test_table_read.py b/python/tests/test_table_read.py
index cc36fc0274..3d9b7a27fe 100644
--- a/python/tests/test_table_read.py
+++ b/python/tests/test_table_read.py
@@ -839,6 +839,78 @@ def test_encode_partition_value(input_value: Any, expected: str) -> None:
     assert encode_partition_value(input_value) == expected
 
 
+def test_partitions_partitioned_table():
+    table_path = "../crates/test/tests/data/delta-0.8.0-partitioned"
+    dt = DeltaTable(table_path)
+    expected = [
+        {"year": "2020", "month": "2", "day": "5"},
+        {"year": "2021", "month": "12", "day": "4"},
+        {"year": "2020", "month": "2", "day": "3"},
+        {"year": "2021", "month": "4", "day": "5"},
+        {"year": "2020", "month": "1", "day": "1"},
+        {"year": "2021", "month": "12", "day": "20"},
+    ]
+    actual = dt.partitions()
+    for partition in expected:
+        assert partition in actual
+
+
+def test_partitions_tuples_partitioned_table():
+    table_path = "../crates/test/tests/data/delta-0.8.0-partitioned"
+    dt = DeltaTable(table_path)
+    expected = [
+        (("day", "5"), ("month", "2"), ("year", "2020")),
+        (("day", "1"), ("month", "1"), ("year", "2020")),
+        (("day", "5"), ("month", "4"), ("year", "2021")),
+        (("day", "3"), ("month", "2"), ("year", "2020")),
+        (("day", "20"), ("month", "12"), ("year", "2021")),
+        (("day", "4"), ("month", "12"), ("year", "2021")),
+    ]
+    actual = dt.partitions(as_tuple_list=True)
+    assert len(expected) == len(actual)
+    for partition in expected:
+        assert partition in actual
+
+
+def test_partitions_filtering_partitioned_table():
+    table_path = "../crates/test/tests/data/delta-0.8.0-partitioned"
+    dt = DeltaTable(table_path)
+    expected = [
+        (("day", "5"), ("month", "4"), ("year", "2021")),
+        (("day", "20"), ("month", "12"), ("year", "2021")),
+        (("day", "4"), ("month", "12"), ("year", "2021")),
+    ]
+    partition_filters = [("year", ">=", "2021")]
+    actual = dt.partitions(partition_filters=partition_filters, as_tuple_list=True)
+    assert len(expected) == len(actual)
+    for partition in expected:
+        assert partition in actual
+
+
+def test_partitions_special_partitioned_table():
+    table_path = "../crates/test/tests/data/delta-0.8.0-special-partition"
+    dt = DeltaTable(table_path)
+
+    # Partitions as list of dicts (default).
+    expected_dict = [{"x": "A/A"}, {"x": "B B"}]
+    actual_dict = dt.partitions()
+    for partition in expected_dict:
+        assert partition in actual_dict
+
+    # Partitions as list of tuples of (column, value) pairs.
+    expected_tuple = [(("x", "B B"),), (("x", "A/A"),)]
+    actual_tuple = dt.partitions(as_tuple_list=True)
+    assert len(expected_tuple) == len(actual_tuple)
+    for partition in expected_tuple:
+        assert partition in actual_tuple
+
+
+def test_partitions_unpartitioned_table():
+    table_path = "../crates/test/tests/data/simple_table"
+    dt = DeltaTable(table_path)
+    assert len(dt.partitions()) == 0
+
+
 def test_read_table_last_checkpoint_not_updated():
     dt = DeltaTable("../crates/test/tests/data/table_failed_last_checkpoint_update")