Skip to content

Commit

Permalink
Allow for reading columns as dictionaries using to_pyarrow_dataset (#941)
Browse files Browse the repository at this point in the history

# Description
When passing `parquet_read_options` to `to_pyarrow_dataset`, it is now
possible to set `dictionary_columns` on the `ParquetReadOptions` to control
which columns are dictionary-encoded as they are read.

# Related Issue(s)
- closes #938 
<!---
For example:

- closes #106
--->

# Documentation

<!---
Share links to useful documentation
--->
  • Loading branch information
Kuhlwein authored Nov 17, 2022
1 parent 8ad897c commit ddd0744
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 3 deletions.
15 changes: 12 additions & 3 deletions python/deltalake/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,9 +334,18 @@ def to_pyarrow_dataset(
)
]

return FileSystemDataset(
fragments, self.schema().to_pyarrow(), format, filesystem
)
schema = self.schema().to_pyarrow()

dictionary_columns = format.read_options.dictionary_columns or set()
if dictionary_columns:
for index, field in enumerate(schema):
if field.name in dictionary_columns:
dict_field = field.with_type(
pyarrow.dictionary(pyarrow.int32(), field.type)
)
schema = schema.set(index, dict_field)

return FileSystemDataset(fragments, schema, format, filesystem)

def to_pyarrow_table(
self,
Expand Down
2 changes: 2 additions & 0 deletions python/stubs/pyarrow/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,11 @@ type_for_alias: Any
date32: Any
date64: Any
decimal128: Any
int32: Any
float16: Any
float32: Any
float64: Any
dictionary: Any

py_buffer: Callable[[bytes], Any]
NativeFile: Any
Expand Down
14 changes: 14 additions & 0 deletions python/tests/test_table_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,20 @@ def test_read_table_with_column_subset():
)


def test_read_table_as_category():
    """Check that ``dictionary_columns`` dictionary-encodes only the named columns.

    The "value" column is requested as a dictionary column and must come back
    as ``dictionary<int32, string>``; the "day" column is not requested and
    must remain a plain string.
    """
    dt = DeltaTable("../rust/tests/data/delta-0.8.0-partitioned")

    # Baseline: the table's own schema reports "value" as a plain string.
    value_field = dt.schema().to_pyarrow().field("value")
    assert value_field.type == pa.string()

    dataset = dt.to_pyarrow_dataset(
        parquet_read_options=ds.ParquetReadOptions(dictionary_columns={"value"})
    )
    result_schema = dataset.to_table().schema

    assert result_schema.field("value").type == pa.dictionary(
        pa.int32(), pa.string()
    )
    # Columns that were not listed keep their original type.
    assert result_schema.field("day").type == pa.string()


def test_read_table_with_filter():
table_path = "../rust/tests/data/delta-0.8.0-partitioned"
dt = DeltaTable(table_path)
Expand Down

0 comments on commit ddd0744

Please sign in to comment.