From 7d934485c9dd3808c67b733b69e514cc780ca9f1 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 4 May 2022 20:25:45 -0700 Subject: [PATCH 1/4] Make sure pandas is optional --- .github/workflows/python_build.yml | 5 +++++ python/deltalake/writer.py | 27 +++++++++++++++++++++++---- python/pyproject.toml | 2 +- python/tests/test_fs.py | 3 +-- python/tests/test_table_read.py | 12 ++++++++++-- python/tests/test_writer.py | 9 ++++++++- 6 files changed, 48 insertions(+), 10 deletions(-) diff --git a/.github/workflows/python_build.yml b/.github/workflows/python_build.yml index 759bb65722..cfc08a1ef3 100644 --- a/.github/workflows/python_build.yml +++ b/.github/workflows/python_build.yml @@ -120,6 +120,11 @@ jobs: run: | source venv/bin/activate make unit-test + + - name: Test without pandas + run: | + pip uninstall pandas + python -m pytest -m "not pandas and not integration" - name: Build Sphinx documentation run: | diff --git a/python/deltalake/writer.py b/python/deltalake/writer.py index 9304e129cb..8310cc6c2c 100644 --- a/python/deltalake/writer.py +++ b/python/deltalake/writer.py @@ -3,9 +3,21 @@ from dataclasses import dataclass from datetime import date, datetime from decimal import Decimal -from typing import Any, Dict, Iterable, Iterator, List, Mapping, Optional, Union +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Iterable, + Iterator, + List, + Mapping, + Optional, + Union, +) + +if TYPE_CHECKING: + import pandas as pd -import pandas as pd import pyarrow as pa import pyarrow.dataset as ds import pyarrow.fs as pa_fs @@ -16,6 +28,13 @@ from .deltalake import write_new_deltalake as _write_new_deltalake from .table import DeltaTable +try: + import pandas as pd +except ModuleNotFoundError: + _has_pandas = False +else: + _has_pandas = True + class DeltaTableProtocolError(PyDeltaTableError): pass @@ -34,7 +53,7 @@ class AddAction: def write_deltalake( table_or_uri: Union[str, DeltaTable], data: Union[ - pd.DataFrame, + "pd.DataFrame", pa.Table, pa.RecordBatch, Iterable[pa.RecordBatch], @@ -97,7 +116,7 @@ def write_deltalake( :param description: User-provided description for this table. :param configuration: A map containing configuration options for the metadata action. """ - if isinstance(data, pd.DataFrame): + if _has_pandas and isinstance(data, pd.DataFrame): data = pa.Table.from_pandas(data) if schema is None: diff --git a/python/pyproject.toml b/python/pyproject.toml index f3eb4356d7..7d3b783bf8 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -36,7 +36,6 @@ devel = [ "sphinx", "sphinx-rtd-theme", "toml", - "pandas", "typing-extensions" ] @@ -78,4 +77,5 @@ testpaths = [ markers = [ "integration: marks tests as integration tests (deselect with '-m \"not integration\"')", "s3: marks tests as integration tests with S3 (deselect with '-m \"not s3\"')", + "pandas: marks tests that require pandas", ] \ No newline at end of file diff --git a/python/tests/test_fs.py b/python/tests/test_fs.py index 6bb565e6e8..0991aa096f 100644 --- a/python/tests/test_fs.py +++ b/python/tests/test_fs.py @@ -1,4 +1,3 @@ -import pandas as pd import pyarrow as pa import pyarrow.parquet as pq import pytest @@ -37,4 +36,4 @@ def test_read_files(s3_localstack): def test_read_simple_table_from_remote(s3_localstack): table_path = "s3://deltars/simple" dt = DeltaTable(table_path) - assert dt.to_pandas().equals(pd.DataFrame({"id": [5, 7, 9]})) + assert dt.to_pyarrow_table().equals(pa.table({"id": [5, 7, 9]})) diff --git a/python/tests/test_table_read.py b/python/tests/test_table_read.py index dc71b2bc60..40cbd2cafb 100644 --- a/python/tests/test_table_read.py +++ b/python/tests/test_table_read.py @@ -1,8 +1,14 @@ import os -from datetime import date, datetime +from datetime import datetime from threading import Barrier, Thread -import pandas as pd +try: + import pandas as pd +except ModuleNotFoundError: + _has_pandas = False +else: + _has_pandas = True + import pyarrow as pa import pyarrow.dataset as ds import pytest @@ -322,12 +328,14 @@ def test_get_files_partitioned_table(): ) +@pytest.mark.pandas def test_delta_table_to_pandas(): table_path = "../rust/tests/data/simple_table" dt = DeltaTable(table_path) assert dt.to_pandas().equals(pd.DataFrame({"id": [5, 7, 9]})) +@pytest.mark.pandas def test_delta_table_with_filesystem(): table_path = "../rust/tests/data/simple_table" dt = DeltaTable(table_path) diff --git a/python/tests/test_writer.py b/python/tests/test_writer.py index adb2e5f6a0..c2261fdc05 100644 --- a/python/tests/test_writer.py +++ b/python/tests/test_writer.py @@ -12,7 +12,6 @@ import pyarrow as pa import pyarrow.compute as pc import pytest -from pandas.testing import assert_frame_equal from pyarrow._dataset_parquet import ParquetReadOptions from pyarrow.dataset import ParquetFileFormat from pyarrow.lib import RecordBatchReader @@ -21,6 +20,13 @@ from deltalake.table import ProtocolVersions from deltalake.writer import DeltaTableProtocolError +try: + from pandas.testing import assert_frame_equal +except ModuleNotFoundError: + _has_pandas = False +else: + _has_pandas = True + def _is_old_glibc_version(): if "CS_GNU_LIBC_VERSION" in os.confstr_names: @@ -217,6 +223,7 @@ def test_fails_wrong_partitioning(existing_table: DeltaTable, sample_data: pa.Ta ) +@pytest.mark.pandas def test_write_pandas(tmp_path: pathlib.Path, sample_data: pa.Table): # When timestamp is converted to Pandas, it gets casted to ns resolution, # but Delta Lake schemas only support us resolution. From c36cf660f1f98aba77d6e5ec623bf420158ce616 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 4 May 2022 20:32:40 -0700 Subject: [PATCH 2/4] Still need to work inside of venv --- .github/workflows/python_build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python_build.yml b/.github/workflows/python_build.yml index cfc08a1ef3..d89bf6f1f0 100644 --- a/.github/workflows/python_build.yml +++ b/.github/workflows/python_build.yml @@ -123,6 +123,7 @@ jobs: - name: Test without pandas run: | + source venv/bin/activate pip uninstall pandas python -m pytest -m "not pandas and not integration" From 5a80190da7a0372f2276e6af5996213a38bbfc40 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 4 May 2022 20:35:45 -0700 Subject: [PATCH 3/4] Don't ask about uninstalling --- .github/workflows/python_build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python_build.yml b/.github/workflows/python_build.yml index d89bf6f1f0..7f8a2c302e 100644 --- a/.github/workflows/python_build.yml +++ b/.github/workflows/python_build.yml @@ -124,7 +124,7 @@ jobs: - name: Test without pandas run: | source venv/bin/activate - pip uninstall pandas + pip uninstall --yes pandas python -m pytest -m "not pandas and not integration" - name: Build Sphinx documentation From dbc444de8a4619748daf368e1b640e174959a40e Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 4 May 2022 20:38:38 -0700 Subject: [PATCH 4/4] Add pandas back for docs --- .github/workflows/python_build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python_build.yml b/.github/workflows/python_build.yml index 7f8a2c302e..610076bb5b 100644 --- a/.github/workflows/python_build.yml +++ b/.github/workflows/python_build.yml @@ -126,6 +126,7 @@ jobs: source venv/bin/activate pip uninstall --yes pandas python -m pytest -m "not pandas and not integration" + pip install pandas - name: Build Sphinx documentation run: |