diff --git a/python/deltalake/table.py b/python/deltalake/table.py index 06607ccaad..35917864bb 100644 --- a/python/deltalake/table.py +++ b/python/deltalake/table.py @@ -334,9 +334,18 @@ def to_pyarrow_dataset( ) ] - return FileSystemDataset( - fragments, self.schema().to_pyarrow(), format, filesystem - ) + schema = self.schema().to_pyarrow() + + dictionary_columns = format.read_options.dictionary_columns or set() + if dictionary_columns: + for index, field in enumerate(schema): + if field.name in dictionary_columns: + dict_field = field.with_type( + pyarrow.dictionary(pyarrow.int32(), field.type) + ) + schema = schema.set(index, dict_field) + + return FileSystemDataset(fragments, schema, format, filesystem) def to_pyarrow_table( self, diff --git a/python/stubs/pyarrow/__init__.pyi b/python/stubs/pyarrow/__init__.pyi index c6142647e8..dde6a48050 100644 --- a/python/stubs/pyarrow/__init__.pyi +++ b/python/stubs/pyarrow/__init__.pyi @@ -18,9 +18,11 @@ type_for_alias: Any date32: Any date64: Any decimal128: Any +int32: Any float16: Any float32: Any float64: Any +dictionary: Any py_buffer: Callable[[bytes], Any] NativeFile: Any diff --git a/python/tests/test_table_read.py b/python/tests/test_table_read.py index 8a0d3499e4..3543a8bc52 100644 --- a/python/tests/test_table_read.py +++ b/python/tests/test_table_read.py @@ -160,6 +160,20 @@ def test_read_table_with_column_subset(): ) +def test_read_table_as_category(): + table_path = "../rust/tests/data/delta-0.8.0-partitioned" + dt = DeltaTable(table_path) + + assert dt.schema().to_pyarrow().field("value").type == pa.string() + + read_options = ds.ParquetReadOptions(dictionary_columns={"value"}) + + data = dt.to_pyarrow_dataset(parquet_read_options=read_options).to_table() + + assert data.schema.field("value").type == pa.dictionary(pa.int32(), pa.string()) + assert data.schema.field("day").type == pa.string() + + def test_read_table_with_filter(): table_path = "../rust/tests/data/delta-0.8.0-partitioned" dt = DeltaTable(table_path)