Skip to content

Commit

Permalink
refactor(deps): make pyarrow optional for non-backend installs (#9552)
Browse files Browse the repository at this point in the history
  • Loading branch information
cpcloud authored Jul 14, 2024
1 parent 2835b9f commit 9047b26
Show file tree
Hide file tree
Showing 20 changed files with 211 additions and 135 deletions.
17 changes: 14 additions & 3 deletions .github/workflows/ibis-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ env:

jobs:
test_core:
name: Test ${{ matrix.os }} python-${{ matrix.python-version }}
name: Test ${{ matrix.os }} python-${{ matrix.python-version }} pyarrow-${{ matrix.pyarrow }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
Expand All @@ -49,6 +49,9 @@ jobs:
- "3.10"
- "3.11"
- "3.12"
pyarrow:
- true
- false
steps:
- name: checkout
uses: actions/checkout@v4
Expand Down Expand Up @@ -77,17 +80,25 @@ jobs:
- name: install ibis
run: poetry install --without dev --without docs --extras "visualization decompiler"

- name: install pyarrow
if: matrix.pyarrow
run: poetry run pip install pyarrow pyarrow-hotfix

- name: check pyarrow import
if: matrix.pyarrow
run: poetry run python -c 'import pyarrow, pyarrow_hotfix'

- uses: extractions/setup-just@v2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: run all core tests and run benchmarks once parallel
if: matrix.os != 'windows-latest'
run: just ci-check -m "'core or benchmarks'" --numprocesses auto
run: just ci-check -m "'core or benchmarks'" --numprocesses auto -rs

- name: run all core tests and run benchmarks once serial
if: matrix.os == 'windows-latest'
run: just ci-check -m "'core or benchmarks'"
run: just ci-check -m "'core or benchmarks'" -rs

- name: upload code coverage
if: success()
Expand Down
4 changes: 3 additions & 1 deletion ibis/backends/tests/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

import numpy as np
import pandas as pd
import pyarrow as pa
import pytest

pa = pytest.importorskip("pyarrow")

array_types = pd.DataFrame(
[
Expand Down
7 changes: 4 additions & 3 deletions ibis/backends/tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.dataset
import pytest
import rich.console
import toolz
Expand All @@ -43,6 +41,9 @@
if TYPE_CHECKING:
from ibis.backends import BaseBackend


pa = pytest.importorskip("pyarrow")
ds = pytest.importorskip("pyarrow.dataset")
pl = pytest.importorskip("polars", reason="Polars is not installed")


Expand Down Expand Up @@ -948,7 +949,7 @@ def test_self_join_memory_table(backend, con, monkeypatch):
id="pyarrow_single_batch",
),
param(
lambda: pa.dataset.dataset(pa.table({"a": ["a"], "b": [1]})),
lambda: ds.dataset(pa.table({"a": ["a"], "b": [1]})),
"df_arrow_dataset",
marks=[
pytest.mark.notimpl(
Expand Down
3 changes: 2 additions & 1 deletion ibis/backends/tests/test_dataframe_interchange.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from __future__ import annotations

import pyarrow as pa
import pytest
from packaging.version import parse as vparse

pa = pytest.importorskip("pyarrow")

pytestmark = pytest.mark.skipif(
vparse(pa.__version__) < vparse("12"), reason="pyarrow >= 12 required"
)
Expand Down
9 changes: 6 additions & 3 deletions ibis/backends/tests/test_export.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from __future__ import annotations

import pandas as pd
import pyarrow as pa
import pyarrow.csv as pcsv
import pytest
from packaging.version import parse as vparse
from pytest import param
Expand All @@ -24,7 +22,8 @@
SnowflakeProgrammingError,
TrinoUserError,
)
from ibis.formats.pyarrow import PyArrowType

pa = pytest.importorskip("pyarrow")

limit = [
param(
Expand Down Expand Up @@ -330,6 +329,8 @@ def test_table_to_csv(tmp_path, backend, awards_players):
)
@pytest.mark.parametrize("delimiter", [";", "\t"], ids=["semicolon", "tab"])
def test_table_to_csv_writer_kwargs(delimiter, tmp_path, awards_players):
import pyarrow.csv as pcsv

outcsv = tmp_path / "out.csv"
# avoid pandas NaNonense
awards_players = awards_players.select("playerID", "awardID", "yearID", "lgID")
Expand Down Expand Up @@ -426,6 +427,8 @@ def test_roundtrip_delta(backend, con, alltypes, tmp_path):
["druid"], raises=AttributeError, reason="string type is used for timestamp_col"
)
def test_arrow_timestamp_with_time_zone(alltypes):
from ibis.formats.pyarrow import PyArrowType

t = alltypes.select(
tz=alltypes.timestamp_col.cast(
alltypes.timestamp_col.type().copy(timezone="UTC")
Expand Down
3 changes: 2 additions & 1 deletion ibis/backends/tests/test_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import numpy as np
import pandas as pd
import pandas.testing as tm
import pyarrow as pa
import pytest
from pytest import param

Expand All @@ -12,6 +11,8 @@
import ibis.expr.datatypes as dt
from ibis.backends.tests.errors import PsycoPg2InternalError, Py4JJavaError

pa = pytest.importorskip("pyarrow")

pytestmark = [
pytest.mark.never(
["sqlite", "mysql", "mssql"], reason="Unlikely to ever add map support"
Expand Down
4 changes: 3 additions & 1 deletion ibis/common/tests/test_dispatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import decimal
from typing import TYPE_CHECKING, Union

import pytest

from ibis.common.dispatch import Dispatched, lazy_singledispatch

# ruff: noqa: F811
Expand Down Expand Up @@ -222,7 +224,7 @@ def test_dispatched():


def test_dispatched_lazy():
import pyarrow as pa
pa = pytest.importorskip("pyarrow")

empty_pyarrow_table = pa.Table.from_arrays([])
empty_pandas_table = empty_pyarrow_table.to_pandas()
Expand Down
1 change: 1 addition & 0 deletions ibis/expr/datatypes/tests/test_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,7 @@ def test_from_numpy_timedelta():
],
)
def test_infer_numpy_array(numpy_array, expected_dtypes):
pytest.importorskip("pyarrow")
pandas_series = pd.Series(numpy_array)
assert dt.infer(numpy_array) in expected_dtypes
assert dt.infer(pandas_series) in expected_dtypes
Expand Down
4 changes: 2 additions & 2 deletions ibis/expr/tests/test_newrels.py
Original file line number Diff line number Diff line change
Expand Up @@ -1130,7 +1130,7 @@ def test_self_join():


def test_self_join_view():
t = ibis.memtable({"x": [1, 2], "y": [2, 1], "z": ["a", "b"]})
t = ibis.table(schema={"x": "int", "y": "int", "z": "str"})
t_view = t.view()
expr = t.join(t_view, t.x == t_view.y).select("x", "y", "z", "z_right")

Expand All @@ -1146,7 +1146,7 @@ def test_self_join_view():


def test_self_join_with_view_projection():
t1 = ibis.memtable({"x": [1, 2], "y": [2, 1], "z": ["a", "b"]})
t1 = ibis.table(schema={"x": "int", "y": "int", "z": "str"})
t2 = t1.view()
expr = t1.inner_join(t2, ["x"])[[t1]]

Expand Down
3 changes: 2 additions & 1 deletion ibis/expr/tests/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from typing import NamedTuple

import numpy as np
import pyarrow as pa
import pytest

import ibis.expr.datatypes as dt
Expand Down Expand Up @@ -365,6 +364,7 @@ def test_schema_set_operations():


def test_schema_infer_pyarrow_table():
pa = pytest.importorskip("pyarrow")
table = pa.Table.from_arrays(
[
pa.array([1, 2, 3]),
Expand All @@ -378,6 +378,7 @@ def test_schema_infer_pyarrow_table():


def test_schema_from_to_pyarrow_schema():
pa = pytest.importorskip("pyarrow")
pyarrow_schema = pa.schema(
[
pa.field("a", pa.int64()),
Expand Down
33 changes: 23 additions & 10 deletions ibis/expr/types/joins.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,28 @@
from ibis.expr.operations.relations import JoinKind


def coerce_to_table(data):
    """Coerce *data* to an ibis ``Table``.

    In-memory frames (``pandas.DataFrame`` and ``pyarrow.Table``) are
    wrapped via ``ibis.memtable``.  Both libraries are imported lazily and
    probed individually so that neither is a hard dependency.

    Raises
    ------
    TypeError
        If *data* is neither a recognized in-memory frame nor a ``Table``.
    """
    # Collect the in-memory frame types whose optional backing libraries
    # are actually installed; each probe is independent of the other.
    memtable_types = []

    try:
        import pandas as pd
    except ImportError:
        pass
    else:
        memtable_types.append(pd.DataFrame)

    try:
        import pyarrow as pa
    except ImportError:
        pass
    else:
        memtable_types.append(pa.Table)

    if any(isinstance(data, typ) for typ in memtable_types):
        return ibis.memtable(data)

    if isinstance(data, Table):
        return data

    raise TypeError(f"right operand must be a Table, got {type(data).__name__}")


def disambiguate_fields(
how,
predicates,
Expand Down Expand Up @@ -228,16 +250,7 @@ def join(
lname: str = "",
rname: str = "{name}_right",
):
import pandas as pd
import pyarrow as pa

# TODO(kszucs): factor out to a helper function
if isinstance(right, (pd.DataFrame, pa.Table)):
right = ibis.memtable(right)
elif not isinstance(right, Table):
raise TypeError(
f"right operand must be a Table, got {type(right).__name__}"
)
right = coerce_to_table(right)

if how == "left_semi":
how = "semi"
Expand Down
5 changes: 4 additions & 1 deletion ibis/formats/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import numpy as np
import pandas as pd
import pandas.api.types as pdt
import pyarrow as pa

import ibis.expr.datatypes as dt
import ibis.expr.schema as sch
Expand All @@ -23,6 +22,7 @@

if TYPE_CHECKING:
import polars as pl
import pyarrow as pa

_has_arrow_dtype = hasattr(pd, "ArrowDtype")

Expand Down Expand Up @@ -408,6 +408,9 @@ def to_frame(self) -> pd.DataFrame:
return self.obj

def to_pyarrow(self, schema: sch.Schema) -> pa.Table:
    """Convert the wrapped data (``self.obj``) to a ``pyarrow.Table``.

    Parameters
    ----------
    schema
        Ibis schema describing the desired column names and types; it is
        translated to a pyarrow schema before conversion.

    Returns
    -------
    pa.Table
        The converted table.

    Notes
    -----
    pyarrow is imported lazily here (rather than at module top) so that
    installs without the pyarrow extra can still import this module.
    ``pyarrow_hotfix`` is imported purely for its side effects.
    """
    import pyarrow as pa
    import pyarrow_hotfix  # noqa: F401

    # Translate the ibis schema into pyarrow's schema representation so
    # from_pandas produces exactly the requested column types.
    pyarrow_schema = PyArrowSchema.from_ibis(schema)
    # assumes self.obj is a pandas DataFrame (to_frame() returns it as
    # pd.DataFrame) — TODO confirm against the enclosing class.
    return pa.Table.from_pandas(self.obj, schema=pyarrow_schema)

Expand Down
Loading

0 comments on commit 9047b26

Please sign in to comment.