diff --git a/.github/workflows/python_build.yml b/.github/workflows/python_build.yml index 1da748b3a3..eb5c562da2 100644 --- a/.github/workflows/python_build.yml +++ b/.github/workflows/python_build.yml @@ -102,7 +102,10 @@ jobs: pip install virtualenv virtualenv venv source venv/bin/activate - make develop + make develop + + - name: Download Data Acceptance Tests (DAT) files + run: make setup-dat - name: Run tests run: | diff --git a/python/.gitignore b/python/.gitignore index 96132d7999..e1e978f0a6 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -15,3 +15,6 @@ __pycache__/ docs/build *.so + +# dat data +dat-data \ No newline at end of file diff --git a/python/Makefile b/python/Makefile index db72b4f20f..e8416f67da 100644 --- a/python/Makefile +++ b/python/Makefile @@ -3,6 +3,7 @@ VENV := venv MATURIN_VERSION := $(shell grep 'requires =' pyproject.toml | cut -d= -f2- | tr -d '[ "]') PACKAGE_VERSION := $(shell grep version Cargo.toml | head -n 1 | awk '{print $$3}' | tr -d '"' ) +DAT_VERSION := 0.0.2 .PHONY: setup-venv setup-venv: ## Setup the virtualenv @@ -14,6 +15,16 @@ setup: ## Setup the requirements $(info --- Setup dependencies ---) pip install "$(MATURIN_VERSION)" +.PHONY: setup-dat +setup-dat: ## Download DAT test files + mkdir -p dat-data + rm -rf dat-data/v$(DAT_VERSION) + curl -L --silent --output dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz \ + https://github.com/delta-incubator/dat/releases/download/v$(DAT_VERSION)/deltalake-dat-v$(DAT_VERSION).tar.gz + tar --no-same-permissions -xzf dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz + mv out dat-data/v$(DAT_VERSION) + rm dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz + .PHONY: build build: setup ## Build Python binding of delta-rs $(info --- Build Python binding ---) diff --git a/python/tests/data_acceptance/__init__.py b/python/tests/data_acceptance/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/tests/data_acceptance/test_reader.py 
"""Data Acceptance Tests (DAT) for the Delta Lake Python reader.

Discovers the DAT fixtures downloaded by ``make setup-dat`` under
``dat-data/v<DAT_VERSION>/reader_tests/generated`` and, for each generated
case and table version, compares the table read through ``DeltaTable``
against the expected Parquet content shipped with the fixture.
"""

import json
from pathlib import Path
from typing import Any, Dict, NamedTuple, Optional

import pyarrow as pa
import pyarrow.parquet as pq
import pytest

from deltalake import DeltaTable


class ReadCase(NamedTuple):
    """One (table, version) pair to validate against its expected content."""

    # Case directory containing the "delta" table and "expected" outputs.
    root: Path
    # Explicit table version to load, or None for the latest version.
    version: Optional[int]
    # Parsed test_case_info.json for the case (name, description, ...).
    case_info: Dict[str, Any]
    # Parsed table_version_metadata.json for this specific version.
    version_metadata: Dict[str, Any]


cases = []

# Must stay in sync with DAT_VERSION in python/Makefile (setup-dat target).
dat_version = "0.0.2"
reader_case_path = Path("dat-data") / f"v{dat_version}" / "reader_tests" / "generated"

if not reader_case_path.exists():
    pytest.skip(
        "DAT test data not present. Run make setup-dat to download them.",
        allow_module_level=True,
    )

for path in reader_case_path.iterdir():
    if path.is_dir():
        with open(path / "test_case_info.json") as f:
            metadata = json.load(f)

        for version_path in (path / "expected").iterdir():
            # BUG FIX: derive the version from the expected subdirectory
            # name ("latest" or "vN"), not from the case directory name
            # (previously `path.name`, which made every expected output
            # resolve to "latest").
            if version_path.name.startswith("v"):
                version = int(version_path.name[1:])
            else:
                version = None
            with open(version_path / "table_version_metadata.json") as f:
                version_metadata = json.load(f)

            cases.append(ReadCase(path, version, metadata, version_metadata))

# Known failures, keyed by case name; skipped (not xfailed) with the reason
# shown in the pytest report.
failing_cases = {
    "multi_partitioned_2": "Waiting for PyArrow 11.0.0 for decimal cast support (#1078)",
    "nested_types": "Waiting for PyArrow 11.0.0 so we can ignore internal field names in equality",
    "multi_partitioned": "Escaped characters in data file paths aren't yet handled (#1079)",
    "no_stats": "We don't yet support files without stats (#582)",
}


@pytest.mark.parametrize(
    "case", cases, ids=lambda case: f"{case.case_info['name']} (version={case.version})"
)
def test_dat(case: ReadCase):
    """Validate one DAT case: protocol versions, then full table content."""
    root, version, case_info, version_metadata = case

    if case_info["name"] in failing_cases:
        msg = failing_cases[case_info["name"]]
        pytest.skip(msg)

    # Get Delta Table path
    delta_root = root / "delta"

    # Load table
    dt = DeltaTable(str(delta_root), version=version)

    # Compare protocol versions (hoisted: protocol() was called three times).
    protocol = dt.protocol()
    assert protocol.min_reader_version == version_metadata["min_reader_version"]
    assert protocol.min_writer_version == version_metadata["min_writer_version"]

    # If supported protocol version, try to read, load parquet, and compare
    if protocol.min_reader_version <= 1:
        version_dir = "latest" if version is None else f"v{version}"
        parquet_root = root / "expected" / version_dir / "table_content"
        # int96 timestamps in the fixtures are coerced to microseconds to
        # match what DeltaTable.to_pyarrow_table() produces.
        expected = pq.read_table(parquet_root, coerce_int96_timestamp_unit="us")
        actual = dt.to_pyarrow_table()
        assert_tables_equal(expected, actual)
    else:
        # We should raise an error when attempting to read too advanced protocol
        with pytest.raises(Exception):
            dt.to_pyarrow_table()


def assert_tables_equal(first: pa.Table, second: pa.Table) -> None:
    """Assert two Arrow tables hold the same schema and rows, ignoring row order."""
    assert first.schema == second.schema
    # Sort both tables by every column so row order does not affect equality.
    sort_keys = [(col, "ascending") for col in first.column_names]
    first_sorted = first.sort_by(sort_keys)
    second_sorted = second.sort_by(sort_keys)
    assert first_sorted == second_sorted