Skip to content

Commit

Permalink
refactor(deps): make pyarrow optional for non-backend installs (#9552)
Browse files Browse the repository at this point in the history
  • Loading branch information
cpcloud authored Jul 14, 2024
1 parent 2835b9f commit 9047b26
Show file tree
Hide file tree
Showing 20 changed files with 211 additions and 135 deletions.
17 changes: 14 additions & 3 deletions .github/workflows/ibis-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ env:

jobs:
test_core:
name: Test ${{ matrix.os }} python-${{ matrix.python-version }}
name: Test ${{ matrix.os }} python-${{ matrix.python-version }} pyarrow-${{ matrix.pyarrow }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
Expand All @@ -49,6 +49,9 @@ jobs:
- "3.10"
- "3.11"
- "3.12"
pyarrow:
- true
- false
steps:
- name: checkout
uses: actions/checkout@v4
Expand Down Expand Up @@ -77,17 +80,25 @@ jobs:
- name: install ibis
run: poetry install --without dev --without docs --extras "visualization decompiler"

- name: install pyarrow
if: matrix.pyarrow
run: poetry run pip install pyarrow pyarrow-hotfix

- name: check pyarrow import
if: matrix.pyarrow
run: poetry run python -c 'import pyarrow, pyarrow_hotfix'

- uses: extractions/setup-just@v2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: run all core tests and run benchmarks once parallel
if: matrix.os != 'windows-latest'
run: just ci-check -m "'core or benchmarks'" --numprocesses auto
run: just ci-check -m "'core or benchmarks'" --numprocesses auto -rs

- name: run all core tests and run benchmarks once serial
if: matrix.os == 'windows-latest'
run: just ci-check -m "'core or benchmarks'"
run: just ci-check -m "'core or benchmarks'" -rs

- name: upload code coverage
if: success()
Expand Down
4 changes: 3 additions & 1 deletion ibis/backends/tests/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

import numpy as np
import pandas as pd
import pyarrow as pa
import pytest

pa = pytest.importorskip("pyarrow")

array_types = pd.DataFrame(
[
Expand Down
7 changes: 4 additions & 3 deletions ibis/backends/tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.dataset
import pytest
import rich.console
import toolz
Expand All @@ -43,6 +41,9 @@
if TYPE_CHECKING:
from ibis.backends import BaseBackend


pa = pytest.importorskip("pyarrow")
ds = pytest.importorskip("pyarrow.dataset")
pl = pytest.importorskip("polars", reason="Polars is not installed")


Expand Down Expand Up @@ -948,7 +949,7 @@ def test_self_join_memory_table(backend, con, monkeypatch):
id="pyarrow_single_batch",
),
param(
lambda: pa.dataset.dataset(pa.table({"a": ["a"], "b": [1]})),
lambda: ds.dataset(pa.table({"a": ["a"], "b": [1]})),
"df_arrow_dataset",
marks=[
pytest.mark.notimpl(
Expand Down
3 changes: 2 additions & 1 deletion ibis/backends/tests/test_dataframe_interchange.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from __future__ import annotations

import pyarrow as pa
import pytest
from packaging.version import parse as vparse

pa = pytest.importorskip("pyarrow")

pytestmark = pytest.mark.skipif(
vparse(pa.__version__) < vparse("12"), reason="pyarrow >= 12 required"
)
Expand Down
9 changes: 6 additions & 3 deletions ibis/backends/tests/test_export.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from __future__ import annotations

import pandas as pd
import pyarrow as pa
import pyarrow.csv as pcsv
import pytest
from packaging.version import parse as vparse
from pytest import param
Expand All @@ -24,7 +22,8 @@
SnowflakeProgrammingError,
TrinoUserError,
)
from ibis.formats.pyarrow import PyArrowType

pa = pytest.importorskip("pyarrow")

limit = [
param(
Expand Down Expand Up @@ -330,6 +329,8 @@ def test_table_to_csv(tmp_path, backend, awards_players):
)
@pytest.mark.parametrize("delimiter", [";", "\t"], ids=["semicolon", "tab"])
def test_table_to_csv_writer_kwargs(delimiter, tmp_path, awards_players):
import pyarrow.csv as pcsv

outcsv = tmp_path / "out.csv"
# avoid pandas NaNonense
awards_players = awards_players.select("playerID", "awardID", "yearID", "lgID")
Expand Down Expand Up @@ -426,6 +427,8 @@ def test_roundtrip_delta(backend, con, alltypes, tmp_path):
["druid"], raises=AttributeError, reason="string type is used for timestamp_col"
)
def test_arrow_timestamp_with_time_zone(alltypes):
from ibis.formats.pyarrow import PyArrowType

t = alltypes.select(
tz=alltypes.timestamp_col.cast(
alltypes.timestamp_col.type().copy(timezone="UTC")
Expand Down
3 changes: 2 additions & 1 deletion ibis/backends/tests/test_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import numpy as np
import pandas as pd
import pandas.testing as tm
import pyarrow as pa
import pytest
from pytest import param

Expand All @@ -12,6 +11,8 @@
import ibis.expr.datatypes as dt
from ibis.backends.tests.errors import PsycoPg2InternalError, Py4JJavaError

pa = pytest.importorskip("pyarrow")

pytestmark = [
pytest.mark.never(
["sqlite", "mysql", "mssql"], reason="Unlikely to ever add map support"
Expand Down
4 changes: 3 additions & 1 deletion ibis/common/tests/test_dispatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import decimal
from typing import TYPE_CHECKING, Union

import pytest

from ibis.common.dispatch import Dispatched, lazy_singledispatch

# ruff: noqa: F811
Expand Down Expand Up @@ -222,7 +224,7 @@ def test_dispatched():


def test_dispatched_lazy():
import pyarrow as pa
pa = pytest.importorskip("pyarrow")

empty_pyarrow_table = pa.Table.from_arrays([])
empty_pandas_table = empty_pyarrow_table.to_pandas()
Expand Down
1 change: 1 addition & 0 deletions ibis/expr/datatypes/tests/test_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,7 @@ def test_from_numpy_timedelta():
],
)
def test_infer_numpy_array(numpy_array, expected_dtypes):
pytest.importorskip("pyarrow")
pandas_series = pd.Series(numpy_array)
assert dt.infer(numpy_array) in expected_dtypes
assert dt.infer(pandas_series) in expected_dtypes
Expand Down
4 changes: 2 additions & 2 deletions ibis/expr/tests/test_newrels.py
Original file line number Diff line number Diff line change
Expand Up @@ -1130,7 +1130,7 @@ def test_self_join():


def test_self_join_view():
t = ibis.memtable({"x": [1, 2], "y": [2, 1], "z": ["a", "b"]})
t = ibis.table(schema={"x": "int", "y": "int", "z": "str"})
t_view = t.view()
expr = t.join(t_view, t.x == t_view.y).select("x", "y", "z", "z_right")

Expand All @@ -1146,7 +1146,7 @@ def test_self_join_view():


def test_self_join_with_view_projection():
t1 = ibis.memtable({"x": [1, 2], "y": [2, 1], "z": ["a", "b"]})
t1 = ibis.table(schema={"x": "int", "y": "int", "z": "str"})
t2 = t1.view()
expr = t1.inner_join(t2, ["x"])[[t1]]

Expand Down
3 changes: 2 additions & 1 deletion ibis/expr/tests/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from typing import NamedTuple

import numpy as np
import pyarrow as pa
import pytest

import ibis.expr.datatypes as dt
Expand Down Expand Up @@ -365,6 +364,7 @@ def test_schema_set_operations():


def test_schema_infer_pyarrow_table():
pa = pytest.importorskip("pyarrow")
table = pa.Table.from_arrays(
[
pa.array([1, 2, 3]),
Expand All @@ -378,6 +378,7 @@ def test_schema_infer_pyarrow_table():


def test_schema_from_to_pyarrow_schema():
pa = pytest.importorskip("pyarrow")
pyarrow_schema = pa.schema(
[
pa.field("a", pa.int64()),
Expand Down
33 changes: 23 additions & 10 deletions ibis/expr/types/joins.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,28 @@
from ibis.expr.operations.relations import JoinKind


def coerce_to_table(data):
    """Coerce *data* to an ibis ``Table``.

    In-memory frames (``pandas.DataFrame`` and ``pyarrow.Table``) are
    wrapped via ``ibis.memtable``.  Both libraries are imported lazily and
    probed individually so that neither is a hard dependency.

    Raises
    ------
    TypeError
        If *data* is neither a recognized in-memory frame nor a ``Table``.
    """
    # Collect the in-memory frame types whose optional backing libraries
    # are actually installed; each probe is independent of the other.
    memtable_types = []

    try:
        import pandas as pd
    except ImportError:
        pass
    else:
        memtable_types.append(pd.DataFrame)

    try:
        import pyarrow as pa
    except ImportError:
        pass
    else:
        memtable_types.append(pa.Table)

    if any(isinstance(data, typ) for typ in memtable_types):
        return ibis.memtable(data)

    if isinstance(data, Table):
        return data

    raise TypeError(f"right operand must be a Table, got {type(data).__name__}")


def disambiguate_fields(
how,
predicates,
Expand Down Expand Up @@ -228,16 +250,7 @@ def join(
lname: str = "",
rname: str = "{name}_right",
):
import pandas as pd
import pyarrow as pa

# TODO(kszucs): factor out to a helper function
if isinstance(right, (pd.DataFrame, pa.Table)):
right = ibis.memtable(right)
elif not isinstance(right, Table):
raise TypeError(
f"right operand must be a Table, got {type(right).__name__}"
)
right = coerce_to_table(right)

if how == "left_semi":
how = "semi"
Expand Down
5 changes: 4 additions & 1 deletion ibis/formats/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import numpy as np
import pandas as pd
import pandas.api.types as pdt
import pyarrow as pa

import ibis.expr.datatypes as dt
import ibis.expr.schema as sch
Expand All @@ -23,6 +22,7 @@

if TYPE_CHECKING:
import polars as pl
import pyarrow as pa

_has_arrow_dtype = hasattr(pd, "ArrowDtype")

Expand Down Expand Up @@ -408,6 +408,9 @@ def to_frame(self) -> pd.DataFrame:
return self.obj

def to_pyarrow(self, schema: sch.Schema) -> pa.Table:
    """Convert the wrapped data (``self.obj``) to a ``pyarrow.Table``.

    Parameters
    ----------
    schema
        Ibis schema describing the desired column names and types; it is
        translated to a pyarrow schema before conversion.

    Returns
    -------
    pa.Table
        The converted table.

    Notes
    -----
    pyarrow is imported lazily here (rather than at module top) so that
    installs without the pyarrow extra can still import this module.
    ``pyarrow_hotfix`` is imported purely for its side effects.
    """
    import pyarrow as pa
    import pyarrow_hotfix  # noqa: F401

    # Translate the ibis schema into pyarrow's schema representation so
    # from_pandas produces exactly the requested column types.
    pyarrow_schema = PyArrowSchema.from_ibis(schema)
    # assumes self.obj is a pandas DataFrame (to_frame() returns it as
    # pd.DataFrame) — TODO confirm against the enclosing class.
    return pa.Table.from_pandas(self.obj, schema=pyarrow_schema)

Expand Down
Loading

0 comments on commit 9047b26

Please sign in to comment.