Skip to content

Commit

Permalink
Make sure pandas is optional (#597)
Browse files Browse the repository at this point in the history
* Make sure pandas is optional

* Still need to work inside of venv

* Don't ask about uninstalling

* Add pandas back for docs
  • Loading branch information
wjones127 authored May 5, 2022
1 parent 812d827 commit fae9278
Show file tree
Hide file tree
Showing 6 changed files with 50 additions and 10 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/python_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,13 @@ jobs:
run: |
source venv/bin/activate
make unit-test
- name: Test without pandas
run: |
source venv/bin/activate
pip uninstall --yes pandas
python -m pytest -m "not pandas and not integration"
pip install pandas
- name: Build Sphinx documentation
run: |
Expand Down
27 changes: 23 additions & 4 deletions python/deltalake/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,21 @@
from dataclasses import dataclass
from datetime import date, datetime
from decimal import Decimal
from typing import Any, Dict, Iterable, Iterator, List, Mapping, Optional, Union
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterable,
Iterator,
List,
Mapping,
Optional,
Union,
)

if TYPE_CHECKING:
import pandas as pd

import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.fs as pa_fs
Expand All @@ -16,6 +28,13 @@
from .deltalake import write_new_deltalake as _write_new_deltalake
from .table import DeltaTable

try:
import pandas as pd
except ModuleNotFoundError:
_has_pandas = False
else:
_has_pandas = True


class DeltaTableProtocolError(PyDeltaTableError):
    """Specialised ``PyDeltaTableError`` with no extra behaviour of its own.

    NOTE(review): judging by the name, this presumably signals a Delta
    protocol-version problem during writes; the raise sites are not visible
    in this hunk, so confirm against the rest of the module.
    """
Expand All @@ -34,7 +53,7 @@ class AddAction:
def write_deltalake(
table_or_uri: Union[str, DeltaTable],
data: Union[
pd.DataFrame,
"pd.DataFrame",
pa.Table,
pa.RecordBatch,
Iterable[pa.RecordBatch],
Expand Down Expand Up @@ -97,7 +116,7 @@ def write_deltalake(
:param description: User-provided description for this table.
:param configuration: A map containing configuration options for the metadata action.
"""
if isinstance(data, pd.DataFrame):
if _has_pandas and isinstance(data, pd.DataFrame):
data = pa.Table.from_pandas(data)

if schema is None:
Expand Down
2 changes: 1 addition & 1 deletion python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ devel = [
"sphinx",
"sphinx-rtd-theme",
"toml",
"pandas",
"typing-extensions"
]

Expand Down Expand Up @@ -78,4 +77,5 @@ testpaths = [
markers = [
"integration: marks tests as integration tests (deselect with '-m \"not integration\"')",
"s3: marks tests as integration tests with S3 (deselect with '-m \"not s3\"')",
"pandas: marks tests that require pandas",
]
3 changes: 1 addition & 2 deletions python/tests/test_fs.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pytest
Expand Down Expand Up @@ -37,4 +36,4 @@ def test_read_files(s3_localstack):
def test_read_simple_table_from_remote(s3_localstack):
table_path = "s3://deltars/simple"
dt = DeltaTable(table_path)
assert dt.to_pandas().equals(pd.DataFrame({"id": [5, 7, 9]}))
assert dt.to_pyarrow_table().equals(pa.table({"id": [5, 7, 9]}))
12 changes: 10 additions & 2 deletions python/tests/test_table_read.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
import os
from datetime import date, datetime
from datetime import datetime
from threading import Barrier, Thread

import pandas as pd
try:
import pandas as pd
except ModuleNotFoundError:
_has_pandas = False
else:
_has_pandas = True

import pyarrow as pa
import pyarrow.dataset as ds
import pytest
Expand Down Expand Up @@ -322,12 +328,14 @@ def test_get_files_partitioned_table():
)


@pytest.mark.pandas
def test_delta_table_to_pandas():
table_path = "../rust/tests/data/simple_table"
dt = DeltaTable(table_path)
assert dt.to_pandas().equals(pd.DataFrame({"id": [5, 7, 9]}))


@pytest.mark.pandas
def test_delta_table_with_filesystem():
table_path = "../rust/tests/data/simple_table"
dt = DeltaTable(table_path)
Expand Down
9 changes: 8 additions & 1 deletion python/tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import pyarrow as pa
import pyarrow.compute as pc
import pytest
from pandas.testing import assert_frame_equal
from pyarrow._dataset_parquet import ParquetReadOptions
from pyarrow.dataset import ParquetFileFormat
from pyarrow.lib import RecordBatchReader
Expand All @@ -21,6 +20,13 @@
from deltalake.table import ProtocolVersions
from deltalake.writer import DeltaTableProtocolError

try:
from pandas.testing import assert_frame_equal
except ModuleNotFoundError:
_has_pandas = False
else:
_has_pandas = True


def _is_old_glibc_version():
if "CS_GNU_LIBC_VERSION" in os.confstr_names:
Expand Down Expand Up @@ -217,6 +223,7 @@ def test_fails_wrong_partitioning(existing_table: DeltaTable, sample_data: pa.Ta
)


@pytest.mark.pandas
def test_write_pandas(tmp_path: pathlib.Path, sample_data: pa.Table):
# When timestamp is converted to Pandas, it gets cast to ns resolution,
# but Delta Lake schemas only support us resolution.
Expand Down

0 comments on commit fae9278

Please sign in to comment.