Skip to content

Commit

Permalink
Support date32 and decimal stats in write_deltalake (#659)
Browse files Browse the repository at this point in the history
* feat: Add support for writing date32 and decimal stats

Writing these stats is only supported starting with PyArrow version 8.0.0.

* fix: Make packaging explicit dependency

* fix: Install packaging in lint task

* fix: Make packaging a dev-only dependency
  • Loading branch information
wjones127 authored Jun 27, 2022
1 parent 45f7bad commit 5bfd89d
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 15 deletions.
13 changes: 10 additions & 3 deletions python/deltalake/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
else:
_has_pandas = True

PYARROW_MAJOR_VERSION = int(pa.__version__.split(".", maxsplit=1)[0])


class DeltaTableProtocolError(PyDeltaTableError):
pass
Expand Down Expand Up @@ -338,10 +340,15 @@ def iter_groups(metadata: Any) -> Iterator[Any]:
.column(column_idx)
.statistics.logical_type.type
)
#
if logical_type not in ["STRING", "INT", "TIMESTAMP", "NONE"]:

if PYARROW_MAJOR_VERSION < 8 and logical_type not in [
"STRING",
"INT",
"TIMESTAMP",
"NONE",
]:
continue
# import pdb; pdb.set_trace()

stats["minValues"][name] = min(
group.column(column_idx).statistics.min
for group in iter_groups(metadata)
Expand Down
1 change: 1 addition & 0 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ devel = [
"mypy",
"black",
"isort",
"packaging>=20",
"pytest",
"pytest-mock",
"pytest-cov",
Expand Down
1 change: 1 addition & 0 deletions python/stubs/pyarrow/__init__.pyi
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Any, Callable

__version__: str
Schema: Any
Table: Any
RecordBatch: Any
Expand Down
15 changes: 8 additions & 7 deletions python/tests/test_table_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from datetime import datetime
from threading import Barrier, Thread

from packaging import version

try:
import pandas as pd
except ModuleNotFoundError:
Expand Down Expand Up @@ -183,14 +185,13 @@ def test_read_table_with_stats():
data = dataset.to_table(filter=filter_expr)
assert data.num_rows == 0

# TODO(wjones127): Enable these tests once C++ Arrow implements is_null and is_valid
# simplification. Blocked on: https://issues.apache.org/jira/browse/ARROW-12659

# filter_expr = ds.field("cases").is_null()
# assert len(list(dataset.get_fragments(filter=filter_expr))) == 0
# PyArrow added support for is_null and is_valid simplification in 8.0.0
if version.parse(pa.__version__).major >= 8:
filter_expr = ds.field("cases").is_null()
assert len(list(dataset.get_fragments(filter=filter_expr))) == 0

# data = dataset.to_table(filter=filter_expr)
# assert data.num_rows == 0
data = dataset.to_table(filter=filter_expr)
assert data.num_rows == 0


def test_vacuum_dry_run_simple_table():
Expand Down
16 changes: 11 additions & 5 deletions python/tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pyarrow as pa
import pyarrow.compute as pc
import pytest
from packaging import version
from pyarrow._dataset_parquet import ParquetReadOptions
from pyarrow.dataset import ParquetFileFormat
from pyarrow.lib import RecordBatchReader
Expand Down Expand Up @@ -333,14 +334,16 @@ def test_writer_stats(existing_table: DeltaTable, sample_data: pa.Table):
"float64": 0.0,
"bool": False,
"binary": "0",
# TODO: Writer needs special decoding for decimal and date32.
#'decimal': '10.000',
# "date32": '2022-01-01',
"timestamp": "2022-01-01T00:00:00",
"struct.x": 0,
"struct.y": "0",
"list.list.item": 0,
}
# PyArrow added support for decimal and date32 min-value statistics in 8.0.0
if version.parse(pa.__version__).major >= 8:
expected_mins["decimal"] = "10.000"
expected_mins["date32"] = "2022-01-01"

assert stats["minValues"] == expected_mins

expected_maxs = {
Expand All @@ -353,13 +356,16 @@ def test_writer_stats(existing_table: DeltaTable, sample_data: pa.Table):
"float64": 4.0,
"bool": True,
"binary": "4",
#'decimal': '40.000',
# "date32": '2022-01-04',
"timestamp": "2022-01-01T04:00:00",
"struct.x": 4,
"struct.y": "4",
"list.list.item": 4,
}
# PyArrow added support for decimal and date32 max-value statistics in 8.0.0
if version.parse(pa.__version__).major >= 8:
expected_maxs["decimal"] = "14.000"
expected_maxs["date32"] = "2022-01-05"

assert stats["maxValues"] == expected_maxs


Expand Down

0 comments on commit 5bfd89d

Please sign in to comment.