From 0f8c36087e5c04ee7d4cbdfe5de5a66bddc29e6f Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 31 Oct 2022 20:47:24 -0700 Subject: [PATCH 1/7] test: draft Data Acceptance Tests --- .gitmodules | 3 + dat | 1 + python/tests/data_acceptance/__init__.py | 0 python/tests/data_acceptance/test_reader.py | 61 +++++++++++++++++++++ 4 files changed, 65 insertions(+) create mode 100644 .gitmodules create mode 160000 dat create mode 100644 python/tests/data_acceptance/__init__.py create mode 100644 python/tests/data_acceptance/test_reader.py diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000..b966ce62f8 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "dat"] + path = dat + url = https://github.com/delta-incubator/dat.git diff --git a/dat b/dat new file mode 160000 index 0000000000..e7798ddd9c --- /dev/null +++ b/dat @@ -0,0 +1 @@ +Subproject commit e7798ddd9cf4138d22e102361f08a9dcf92fb539 diff --git a/python/tests/data_acceptance/__init__.py b/python/tests/data_acceptance/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/tests/data_acceptance/test_reader.py b/python/tests/data_acceptance/test_reader.py new file mode 100644 index 0000000000..b980480cab --- /dev/null +++ b/python/tests/data_acceptance/test_reader.py @@ -0,0 +1,61 @@ +from typing import NamedTuple, Dict, Any +from pathlib import Path +import json + +import pyarrow as pa +import pyarrow.parquet as pq +import pytest + +from deltalake import DeltaTable + +class ReadCase(NamedTuple): + root: Path + metadata: Dict[str, Any] + +cases = [] + +project_root = Path("../dat") +for path in (project_root / "out" / "tables" / "generated").iterdir(): + if path.is_dir(): + with open(path / "table-metadata.json") as f: + metadata = json.load(f) + cases.append(ReadCase(path, metadata)) + +# TODO: external-tables should be added to cases as well + +@pytest.mark.parametrize("case", cases) +def test_dat(case: ReadCase): + root, metadata = case + + # Get Delta Table path + delta_root = root / "delta" + + # Load table + dt = DeltaTable(str(delta_root)) + + # Compare protocol versions + # TODO: this is incorrect in dat + # assert dt.protocol().min_reader_version == metadata["reader_protocol_version"] + assert dt.protocol().min_writer_version == metadata["writer_protocol_version"] + + # Perhaps? + # assert dt.version == metadata["current_version"] + + # If supported protocol version, try to read, load parquet, and compare + if dt.protocol().min_reader_version <= 1: + parquet_root = root / "parquet" + expected = pq.read_table(parquet_root) + actual = dt.to_pyarrow_table() + assert_tables_equal(expected, actual) + else: + # We should raise an error when attempting to read too advanced protocol + with pytest.raises(Exception): + dt.to_pyarrow_table() + + +def assert_tables_equal(first: pa.Table, second: pa.Table) -> None: + assert first.schema == second.schema + sort_keys = [(col, "ascending") for col in first.column_names] + first_sorted = first.sort_by(sort_keys) + second_sorted = second.sort_by(sort_keys) + assert first_sorted == second_sorted From 783437c3ada6b0166887a5aede6fe56a22894d76 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 11 Jan 2023 20:35:38 -0800 Subject: [PATCH 2/7] feat: setup DAT files download --- .gitmodules | 3 -- dat | 1 - python/.gitignore | 3 ++ python/Makefile | 11 ++++ python/tests/data_acceptance/test_reader.py | 58 ++++++++++++++------- 5 files changed, 53 insertions(+), 23 deletions(-) delete mode 160000 dat diff --git a/.gitmodules b/.gitmodules index b966ce62f8..e69de29bb2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "dat"] - path = dat - url = https://github.com/delta-incubator/dat.git diff --git a/dat b/dat deleted file mode 160000 index e7798ddd9c..0000000000 --- a/dat +++ /dev/null @@ -1 +0,0 @@ -Subproject commit e7798ddd9cf4138d22e102361f08a9dcf92fb539 diff --git a/python/.gitignore b/python/.gitignore index 96132d7999..e1e978f0a6 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -15,3 +15,6 @@ __pycache__/ docs/build *.so + +# dat data +dat-data \ No newline at end of file diff --git a/python/Makefile b/python/Makefile index db72b4f20f..59d89051a6 100644 --- a/python/Makefile +++ b/python/Makefile @@ -3,6 +3,7 @@ VENV := venv MATURIN_VERSION := $(shell grep 'requires =' pyproject.toml | cut -d= -f2- | tr -d '[ "]') PACKAGE_VERSION := $(shell grep version Cargo.toml | head -n 1 | awk '{print $$3}' | tr -d '"' ) +DAT_VERSION := 0.0.1 .PHONY: setup-venv setup-venv: ## Setup the virtualenv @@ -14,6 +15,16 @@ setup: ## Setup the requirements $(info --- Setup dependencies ---) pip install "$(MATURIN_VERSION)" +.PHONY: setup-dat +setup-dat: ## Download DAT test files + mkdir -p dat-data + rm -rf dat-data/v$(DAT_VERSION) + curl -L --silent --output dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz \ + https://github.com/delta-incubator/dat/releases/download/v$(DAT_VERSION)/deltalake-dat-v$(DAT_VERSION).tar.gz + -tar -xzf dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz + mv out dat-data/v$(DAT_VERSION) + rm dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz + .PHONY: build build: setup ## Build Python binding of delta-rs $(info --- Build Python binding ---) diff --git a/python/tests/data_acceptance/test_reader.py b/python/tests/data_acceptance/test_reader.py index b980480cab..8ab1e8bf50 100644 --- a/python/tests/data_acceptance/test_reader.py +++ b/python/tests/data_acceptance/test_reader.py @@ -1,6 +1,6 @@ -from typing import NamedTuple, Dict, Any -from pathlib import Path import json +from pathlib import Path +from typing import Any, Dict, NamedTuple, Optional import pyarrow as pa import pyarrow.parquet as pq @@ -8,42 +8,62 @@ from deltalake import DeltaTable + class ReadCase(NamedTuple): root: Path - metadata: Dict[str, Any] + version: Optional[int] + case_info: Dict[str, Any] + version_metadata: Dict[str, Any] + cases = [] -project_root = Path("../dat") -for path in (project_root / "out" / "tables" / "generated").iterdir(): +dat_version = "0.0.1" +reader_case_path = Path("dat-data") / f"v{dat_version}" / "reader_tests" / "generated" + +if not reader_case_path.exists(): + pytest.skip( + "DAT test data not present. Run make setup-dat to download them.", + allow_module_level=True, + ) + +for path in reader_case_path.iterdir(): if path.is_dir(): - with open(path / "table-metadata.json") as f: + with open(path / "test_case_info.json") as f: metadata = json.load(f) - cases.append(ReadCase(path, metadata)) -# TODO: external-tables should be added to cases as well + for version_path in (path / "expected").iterdir(): + if path.name.startswith("v"): + version = int(path.name[1:]) + else: + version = None + with open(version_path / "table_version_metadata.json") as f: + version_metadata = json.load(f) + + cases.append(ReadCase(path, version, metadata, version_metadata)) + -@pytest.mark.parametrize("case", cases) +@pytest.mark.parametrize( + "case", cases, ids=lambda case: f"{case.case_info['name']} (version={case.version})" +) def test_dat(case: ReadCase): - root, metadata = case + root, version, case_info, version_metadata = case # Get Delta Table path delta_root = root / "delta" # Load table - dt = DeltaTable(str(delta_root)) + dt = DeltaTable(str(delta_root), version=version) # Compare protocol versions - # TODO: this is incorrect in dat - # assert dt.protocol().min_reader_version == metadata["reader_protocol_version"] - assert dt.protocol().min_writer_version == metadata["writer_protocol_version"] - - # Perhaps? - # assert dt.version == metadata["current_version"] + assert dt.protocol().min_reader_version == version_metadata["min_reader_version"] + assert dt.protocol().min_writer_version == version_metadata["min_writer_version"] # If supported protocol version, try to read, load parquet, and compare if dt.protocol().min_reader_version <= 1: - parquet_root = root / "parquet" + version_path = "latest" if version is None else f"v{version}" + # TODO: fix the directory name here + parquet_root = root / "expected" / version_path / "table_content.parquet" expected = pq.read_table(parquet_root) actual = dt.to_pyarrow_table() assert_tables_equal(expected, actual) @@ -51,7 +71,7 @@ def test_dat(case: ReadCase): # We should raise an error when attempting to read too advanced protocol with pytest.raises(Exception): dt.to_pyarrow_table() - + def assert_tables_equal(first: pa.Table, second: pa.Table) -> None: assert first.schema == second.schema From 86519bb04317317d6b5b6b454a5fc73466338bb9 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 11 Jan 2023 20:37:09 -0800 Subject: [PATCH 3/7] test: add DAT files to CI --- .github/workflows/python_build.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python_build.yml b/.github/workflows/python_build.yml index 1da748b3a3..eb5c562da2 100644 --- a/.github/workflows/python_build.yml +++ b/.github/workflows/python_build.yml @@ -102,7 +102,10 @@ jobs: pip install virtualenv virtualenv venv source venv/bin/activate - make develop + make develop + + - name: Download Data Acceptance Tests (DAT) files + run: make setup-dat - name: Run tests run: | From a80d48ffdeeb8effc3092dafc3b0398c8d278bbc Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 18 Jan 2023 19:45:03 -0800 Subject: [PATCH 4/7] test: get all DAT tests either passing or skipped with linked issue --- python/tests/data_acceptance/test_reader.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/python/tests/data_acceptance/test_reader.py b/python/tests/data_acceptance/test_reader.py index 8ab1e8bf50..57bb016d25 100644 --- a/python/tests/data_acceptance/test_reader.py +++ b/python/tests/data_acceptance/test_reader.py @@ -42,6 +42,13 @@ class ReadCase(NamedTuple): cases.append(ReadCase(path, version, metadata, version_metadata)) +failing_cases = { + "multi_partitioned_2": "Waiting for PyArrow 11.0.0 for decimal cast support (#1078)", + "nested_types": "Waiting for PyArrow 11.0.0 so we can ignore internal field names in equality", + "multi_partitioned": "Escaped characters in data file paths aren't yet handled (#1079)", + "no_stats": "We don't yet support files without stats (#582)", +} + @pytest.mark.parametrize( "case", cases, ids=lambda case: f"{case.case_info['name']} (version={case.version})" @@ -49,6 +56,10 @@ class ReadCase(NamedTuple): def test_dat(case: ReadCase): root, version, case_info, version_metadata = case + if case_info["name"] in failing_cases: + msg = failing_cases[case_info["name"]] + pytest.skip(msg) + # Get Delta Table path delta_root = root / "delta" @@ -64,7 +75,7 @@ def test_dat(case: ReadCase): version_path = "latest" if version is None else f"v{version}" # TODO: fix the directory name here parquet_root = root / "expected" / version_path / "table_content.parquet" - expected = pq.read_table(parquet_root) + expected = pq.read_table(parquet_root, coerce_int96_timestamp_unit="us") actual = dt.to_pyarrow_table() assert_tables_equal(expected, actual) else: From 703730c4a0a58b5da9c964529bf2eb2d9f235840 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 19 Jan 2023 10:14:36 -0800 Subject: [PATCH 5/7] fix tar command --- python/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/Makefile b/python/Makefile index 59d89051a6..0967e34356 100644 --- a/python/Makefile +++ b/python/Makefile @@ -21,7 +21,7 @@ setup-dat: ## Download DAT test files rm -rf dat-data/v$(DAT_VERSION) curl -L --silent --output dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz \ https://github.com/delta-incubator/dat/releases/download/v$(DAT_VERSION)/deltalake-dat-v$(DAT_VERSION).tar.gz - -tar -xzf dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz + tar --no-same-permissions -xzf dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz mv out dat-data/v$(DAT_VERSION) rm dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz From 3b5464aeef562569bbaf45e445972118e1bb4ad6 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 24 Jan 2023 18:10:23 -0800 Subject: [PATCH 6/7] Upgrade to latest version --- python/Makefile | 2 +- python/tests/data_acceptance/test_reader.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/Makefile b/python/Makefile index 0967e34356..e8416f67da 100644 --- a/python/Makefile +++ b/python/Makefile @@ -3,7 +3,7 @@ VENV := venv MATURIN_VERSION := $(shell grep 'requires =' pyproject.toml | cut -d= -f2- | tr -d '[ "]') PACKAGE_VERSION := $(shell grep version Cargo.toml | head -n 1 | awk '{print $$3}' | tr -d '"' ) -DAT_VERSION := 0.0.1 +DAT_VERSION := 0.0.2 .PHONY: setup-venv setup-venv: ## Setup the virtualenv diff --git a/python/tests/data_acceptance/test_reader.py b/python/tests/data_acceptance/test_reader.py index 57bb016d25..60c37386df 100644 --- a/python/tests/data_acceptance/test_reader.py +++ b/python/tests/data_acceptance/test_reader.py @@ -18,7 +18,7 @@ class ReadCase(NamedTuple): cases = [] -dat_version = "0.0.1" +dat_version = "0.0.2" reader_case_path = Path("dat-data") / f"v{dat_version}" / "reader_tests" / "generated" if not reader_case_path.exists(): @@ -73,8 +73,7 @@ def test_dat(case: ReadCase): # If supported protocol version, try to read, load parquet, and compare if dt.protocol().min_reader_version <= 1: version_path = "latest" if version is None else f"v{version}" - # TODO: fix the directory name here - parquet_root = root / "expected" / version_path / "table_content.parquet" + parquet_root = root / "expected" / version_path / "table_content" expected = pq.read_table(parquet_root, coerce_int96_timestamp_unit="us") actual = dt.to_pyarrow_table() assert_tables_equal(expected, actual) From c33c635d627176f9b780be5a8a6ac518d5688758 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 24 Jan 2023 20:47:49 -0800 Subject: [PATCH 7/7] remove gitmodules --- .gitmodules | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .gitmodules diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index e69de29bb2..0000000000