fix(deps): support pandas 2.0
cpcloud committed Apr 5, 2023
1 parent 523e198 commit 4f1d9fe
Showing 37 changed files with 388 additions and 237 deletions.
112 changes: 82 additions & 30 deletions .github/workflows/ibis-backends.yml
@@ -144,30 +144,6 @@ jobs:
- druid
services:
- druid
include:
- os: ubuntu-latest
python-version: "3.8"
backend:
name: pyspark
title: PySpark
serial: true
extras:
- pyspark
- os: ubuntu-latest
python-version: "3.10"
backend:
name: pyspark
title: PySpark
serial: true
extras:
- pyspark
- os: ubuntu-latest
python-version: "3.11"
backend:
name: pyspark
title: PySpark
extras:
- pyspark
exclude:
- os: windows-latest
backend:
@@ -267,12 +243,6 @@ jobs:
if: matrix.os == 'windows-latest' && matrix.backend.name == 'sqlite'
run: choco install sqlite

- uses: actions/setup-java@v3
if: matrix.backend.name == 'pyspark'
with:
distribution: microsoft
java-version: 17

- name: checkout
uses: actions/checkout@v3

@@ -362,6 +332,7 @@ jobs:
title: Dask
deps:
- "dask[array,dataframe]@2022.9.1"
- "pandas@1.5.3"
extras:
- dask
- name: postgres
@@ -467,6 +438,86 @@ jobs:
if: ${{ failure() }}
run: docker compose logs

test_pyspark:
name: PySpark ${{ matrix.os }} python-${{ matrix.python-version }} pandas<${{ matrix.pandas-upper-bound }} numpy<${{ matrix.numpy-upper-bound }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os:
- ubuntu-latest
python-version:
- "3.10"
pandas-upper-bound:
- "2"
- "3"
numpy-upper-bound:
- "1.24"
- "2"
include:
- os: ubuntu-latest
python-version: "3.8"
pandas-upper-bound: "2"
numpy-upper-bound: "1.24"
- os: ubuntu-latest
python-version: "3.11"
pandas-upper-bound: "2"
numpy-upper-bound: "1.24"
steps:
- name: checkout
uses: actions/checkout@v3

- uses: actions/setup-java@v3
with:
distribution: microsoft
java-version: 17

- uses: extractions/setup-just@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: download backend data
run: just download-data

- name: install python
uses: actions/setup-python@v4
id: install_python
with:
python-version: ${{ matrix.python-version }}

- run: python -m pip install --upgrade pip 'poetry<1.4'

- name: install minimum versions
run: poetry add --lock 'pandas>=1.2.5,<${{ matrix.pandas-upper-bound }}' 'numpy>=1,<${{ matrix.numpy-upper-bound }}'

- name: checkout the lock file
run: git checkout poetry.lock

- name: lock with no updates
# poetry add is aggressive and will update other dependencies like
# numpy and pandas, so we keep the pyproject.toml edits and then relock
# without updating anything except the requested versions
run: poetry lock --no-update

- name: install ibis
run: poetry install --without dev --without docs --extras pyspark

- name: run tests
run: just ci-check -m pyspark --numprocesses auto --dist=loadgroup

- name: upload code coverage
if: success()
uses: codecov/codecov-action@v3
with:
flags: backend,pyspark,${{ runner.os }},python-${{ steps.install_python.outputs.python-version }},pandas-upper-bound-${{ matrix.pandas-upper-bound }},numpy-upper-bound-${{ matrix.numpy-upper-bound }}

- name: publish test report
uses: actions/upload-artifact@v3
if: success() || failure()
with:
name: pyspark-${{ matrix.os }}-${{ matrix.python-version }}
path: junit.xml

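
The matrix crosses two pandas upper bounds against two numpy upper bounds on Python 3.10, and the include entries pin one extra combination each for Python 3.8 and 3.11, six jobs in total. A hedged sketch of that expansion (illustrative only, not part of the workflow):

from itertools import product

# Base matrix: python 3.10 against every pandas/numpy bound combination.
base = [("3.10", pd_b, np_b) for pd_b, np_b in product(["2", "3"], ["1.24", "2"])]
# include: adds two pinned combinations rather than a full cross product.
extra = [("3.8", "2", "1.24"), ("3.11", "2", "1.24")]
for py, pd_b, np_b in base + extra:
    print(f"PySpark ubuntu-latest python-{py} pandas<{pd_b} numpy<{np_b}")

The add-then-relock sequence in the steps exists because poetry add rewrites pyproject.toml but would also bump unrelated pins; restoring poetry.lock and running poetry lock --no-update keeps every other dependency where it was.
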
gen_lockfile_sqlalchemy2:
name: Generate Poetry Lockfile for SQLAlchemy 2
runs-on: ubuntu-latest
@@ -634,5 +685,6 @@ jobs:
- test_backends_min_version
- test_backends
- test_backends_sqlalchemy2
- test_pyspark
steps:
- run: exit 0
2 changes: 1 addition & 1 deletion ci/schema/sqlite.sql
@@ -13,7 +13,7 @@ CREATE TABLE functional_alltypes (
double_col REAL,
date_string_col TEXT,
string_col TEXT,
timestamp_col TEXT,
timestamp_col TIMESTAMP,
year BIGINT,
month BIGINT,
CHECK (bool_col IN (0, 1))
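
The TEXT-to-TIMESTAMP change matters even though SQLite stores the value as text either way: the declared type is what drivers and schema inference key off when mapping the column to a timestamp dtype. A minimal sketch using only the standard library:

import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE t (ts TIMESTAMP)")
con.execute("INSERT INTO t VALUES ('2023-04-05 12:00:00')")
# The declared type is TIMESTAMP, but SQLite's type affinity stores text.
print(con.execute("SELECT ts FROM t").fetchall())  # [('2023-04-05 12:00:00',)]
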
2 changes: 1 addition & 1 deletion ibis/backends/base/sql/alchemy/registry.py
@@ -175,7 +175,7 @@ def _cast(t, op):

# specialize going from an integer type to a timestamp
if arg_dtype.is_integer() and typ.is_timestamp():
return t.integer_to_timestamp(sa_arg)
return t.integer_to_timestamp(sa_arg, tz=typ.timezone)

if arg_dtype.is_binary() and typ.is_string():
return sa.func.encode(sa_arg, 'escape')
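
With typ.timezone threaded through, casting an integer column to a timezone-aware timestamp type preserves the zone instead of producing a naive timestamp. A hedged expression-level sketch (the table is hypothetical):

import ibis
import ibis.expr.datatypes as dt

t = ibis.table([("epoch_s", "int64")], name="events")  # hypothetical table
# The timezone on the target dtype now reaches integer_to_timestamp.
expr = t.epoch_s.cast(dt.Timestamp(timezone="UTC"))
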
9 changes: 8 additions & 1 deletion ibis/backends/base/sql/alchemy/translator.py
@@ -44,8 +44,15 @@ class AlchemyExprTranslator(ExprTranslator):

_bool_aggs_need_cast_to_int32 = True
_has_reduction_filter_syntax = False
_integer_to_timestamp = staticmethod(sa.func.to_timestamp)
_timestamp_type = sa.TIMESTAMP

def integer_to_timestamp(self, arg, tz: str | None = None):
return sa.cast(
self._integer_to_timestamp(arg),
self._timestamp_type(timezone=tz is not None),
)

integer_to_timestamp = sa.func.to_timestamp
native_json_type = True
_always_quote_columns = None # let the dialect decide how to quote

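
A sketch of the SQLAlchemy expression the new integer_to_timestamp helper builds, assuming a PostgreSQL-style to_timestamp function:

import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

arg = sa.column("epoch_s")
# Wrap the conversion in a CAST whose target type is tz-aware exactly when
# the ibis type carries a timezone.
expr = sa.cast(sa.func.to_timestamp(arg), sa.TIMESTAMP(timezone=True))
print(expr.compile(dialect=postgresql.dialect()))
# CAST(to_timestamp(epoch_s) AS TIMESTAMP WITH TIME ZONE)
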
10 changes: 4 additions & 6 deletions ibis/backends/bigquery/tests/system/test_client.py
@@ -7,6 +7,7 @@
import pandas.testing as tm
import pytest
import pytz
import toolz

import ibis
import ibis.expr.datatypes as dt
@@ -73,15 +74,12 @@ def test_count_distinct_with_filter(alltypes):
assert result == expected


@pytest.mark.parametrize("type", ["date", dt.date])
def test_cast_string_to_date(alltypes, df, type):
import toolz

def test_cast_string_to_date(alltypes, df):
string_col = alltypes.date_string_col
month, day, year = toolz.take(3, string_col.split("/"))

expr = "20" + ibis.literal("-").join([year, month, day])
expr = expr.cast(type)
expr = expr.cast("date")

result = (
expr.execute()
@@ -91,7 +89,7 @@ def test_cast_string_to_date(alltypes, df, type):
.rename("date_string_col")
)
expected = (
pd.to_datetime(df.date_string_col)
pd.to_datetime(df.date_string_col, format="%m/%d/%y")
.dt.normalize()
.sort_values()
.reset_index(drop=True)
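
Spelling out format= is the pandas 2.0-friendly idiom: 2.0 deprecates infer_datetime_format and parses much more strictly by default, so an explicit format avoids ambiguous month/day guessing. A standalone example with the same %m/%d/%y layout:

import pandas as pd

s = pd.Series(["11/01/10", "02/14/11"], name="date_string_col")
parsed = pd.to_datetime(s, format="%m/%d/%y")  # month/day/two-digit-year
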
21 changes: 12 additions & 9 deletions ibis/backends/clickhouse/compiler/values.py
@@ -77,7 +77,10 @@ def _cast(op, **kw):
return f"toInterval{suffix}({arg})"

to = translate_val(op.to, **kw)
return f"CAST({arg} AS {to})"
result = f"CAST({arg} AS {to})"
if (timezone := getattr(op.to, "timezone", None)) is not None:
return f"toTimeZone({result}, {timezone!r})"
return result


@translate_val.register(ops.Between)
@@ -575,11 +578,11 @@ def _date_from_ymd(op, **kw):
m = translate_val(op.month, **kw)
d = translate_val(op.day, **kw)
return (
f"toDate(concat("
"toDate(concat("
f"toString({y}), '-', "
f"leftPad(toString({m}), 2, '0'), '-', "
f"leftPad(toString({d}), 2, '0')"
f"))"
"))"
)


@@ -591,20 +594,20 @@ def _timestamp_from_ymdhms(op, **kw):
h = translate_val(op.hours, **kw)
min = translate_val(op.minutes, **kw)
s = translate_val(op.seconds, **kw)
timezone_arg = ''
if timezone := op.output_dtype.timezone:
timezone_arg = f', {timezone}'

return (
f"toDateTime("
to_datetime = (
"toDateTime("
f"concat(toString({y}), '-', "
f"leftPad(toString({m}), 2, '0'), '-', "
f"leftPad(toString({d}), 2, '0'), ' ', "
f"leftPad(toString({h}), 2, '0'), ':', "
f"leftPad(toString({min}), 2, '0'), ':', "
f"leftPad(toString({s}), 2, '0')"
f"), {timezone_arg})"
"))"
)
if timezone := op.output_dtype.timezone:
return f"toTimeZone({to_datetime}, {timezone})"
return to_datetime


@translate_val.register(ops.ExistsSubquery)
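
Both hunks share one pattern: build the naive ClickHouse expression first, then wrap it in toTimeZone(...) only when the target dtype carries a zone. A self-contained sketch of that string-building logic (the function name is illustrative):

from __future__ import annotations

def cast_sql(arg: str, to: str, timezone: str | None) -> str:
    # Emit the plain cast, then attach the zone as an outer wrapper so the
    # two concerns stay independent.
    result = f"CAST({arg} AS {to})"
    if timezone is not None:
        return f"toTimeZone({result}, {timezone!r})"
    return result

print(cast_sql("ts_col", "DateTime", "UTC"))
# toTimeZone(CAST(ts_col AS DateTime), 'UTC')
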
9 changes: 7 additions & 2 deletions ibis/backends/conftest.py
@@ -12,6 +12,7 @@
from typing import Any, TextIO

import _pytest
import numpy as np
import pandas as pd
import pytest
import sqlalchemy as sa
@@ -307,8 +308,12 @@ def pytest_collection_modifyitems(session, config, items):
item,
pytest.mark.xfail(
(
sys.version_info >= (3, 11)
and not isinstance(item, pytest.DoctestItem)
not isinstance(item, pytest.DoctestItem)
and (
sys.version_info >= (3, 11)
or vparse(pd.__version__) >= vparse("2")
or vparse(np.__version__) >= vparse("1.24")
)
),
reason="PySpark doesn't support Python 3.11",
),
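
The widened gate xfails every non-doctest PySpark test once any of the three incompatibilities holds. Roughly, assuming vparse is packaging.version.parse as in the rest of the file:

import sys

import numpy as np
import pandas as pd
from packaging.version import parse as vparse

# PySpark, as of this commit, breaks on Python 3.11, pandas 2, and numpy 1.24.
incompatible = (
    sys.version_info >= (3, 11)
    or vparse(pd.__version__) >= vparse("2")
    or vparse(np.__version__) >= vparse("1.24")
)
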
33 changes: 23 additions & 10 deletions ibis/backends/dask/client.py
@@ -4,18 +4,15 @@

import dask.dataframe as dd
import numpy as np
import pandas as pd
from dateutil.parser import parse as date_parse
from pandas.api.types import DatetimeTZDtype

import ibis.expr.datatypes as dt
import ibis.expr.operations as ops
import ibis.expr.schema as sch
from ibis.backends.base import Database
from ibis.backends.pandas.client import (
PANDAS_DATE_TYPES,
PANDAS_STRING_TYPES,
ibis_dtype_to_pandas,
ibis_schema_to_pandas,
)
from ibis.backends.pandas.client import ibis_dtype_to_pandas, ibis_schema_to_pandas


@sch.schema.register(dd.Series)
@@ -54,15 +51,31 @@ def infer_dask_schema(df, schema=None):


@sch.convert.register(DatetimeTZDtype, dt.Timestamp, dd.Series)
def convert_datetimetz_to_timestamp(in_dtype, out_dtype, column):
def convert_datetimetz_to_timestamp(_, out_dtype, column):
output_timezone = out_dtype.timezone
if output_timezone is not None:
return column.dt.tz_convert(output_timezone)
return column.astype(out_dtype.to_dask())
else:
return column.dt.tz_localize(None)


DASK_STRING_TYPES = PANDAS_STRING_TYPES
DASK_DATE_TYPES = PANDAS_DATE_TYPES
@sch.convert.register(np.dtype, dt.Timestamp, dd.Series)
def convert_any_to_timestamp(_, out_dtype, column):
if isinstance(dtype := out_dtype.to_dask(), DatetimeTZDtype):
column = dd.to_datetime(column)
timezone = out_dtype.timezone
if getattr(column.dtype, "tz", None) is not None:
return column.dt.tz_convert(timezone)
else:
return column.dt.tz_localize(timezone)
else:
try:
return column.astype(dtype)
except pd.errors.OutOfBoundsDatetime:
try:
return column.map(date_parse)
except TypeError:
return column


@sch.convert.register(np.dtype, dt.Interval, dd.Series)
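
The converters above rely on pandas' split between converting and localizing: tz-aware data moves between zones with tz_convert, naive data gets a zone attached with tz_localize, and tz_localize(None) drops the zone while keeping wall-clock time. A quick illustration:

import pandas as pd

naive = pd.Series(pd.to_datetime(["2023-04-05 12:00:00"]))
aware = naive.dt.tz_localize("UTC")          # attach a zone to naive stamps
eastern = aware.dt.tz_convert("US/Eastern")  # move aware stamps between zones
stripped = aware.dt.tz_localize(None)        # drop the zone, keep wall time
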
23 changes: 13 additions & 10 deletions ibis/backends/dask/execution/generic.py
@@ -271,7 +271,7 @@ def execute_cast_scalar_timestamp(op, data, type, **kwargs):

def cast_series_to_timestamp(data, tz):
if pd.api.types.is_string_dtype(data):
timestamps = to_datetime(data, infer_datetime_format=True)
timestamps = to_datetime(data)
else:
timestamps = to_datetime(data, unit="s")
if getattr(timestamps.dtype, "tz", None) is not None:
@@ -290,10 +290,17 @@ def execute_cast_series_timestamp(op, data, type, **kwargs):
tz = type.timezone
dtype = 'M8[ns]' if tz is None else DatetimeTZDtype('ns', tz)

if from_type.is_timestamp() or from_type.is_date():
return data.astype(dtype)

if from_type.is_string() or from_type.is_integer():
if from_type.is_timestamp():
from_tz = from_type.timezone
if tz is None and from_tz is None:
return data
elif tz is None or from_tz is None:
return data.dt.tz_localize(tz)
elif tz is not None and from_tz is not None:
return data.dt.tz_convert(tz)
elif from_type.is_date():
return data if tz is None else data.dt.tz_localize(tz)
elif from_type.is_string() or from_type.is_integer():
return data.map_partitions(
cast_series_to_timestamp,
tz,
@@ -319,11 +326,7 @@ def execute_cast_series_date(op, data, type, **kwargs):

if from_type.equals(dt.string):
# TODO - this is broken
datetimes = data.map_partitions(
to_datetime,
infer_datetime_format=True,
meta=(data.name, 'datetime64[ns]'),
)
datetimes = data.map_partitions(to_datetime, meta=(data.name, 'datetime64[ns]'))

# TODO - we are getting rid of the index here
return datetimes.dt.normalize()
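
The map_partitions(to_datetime, meta=...) call parses each pandas partition independently; the meta tuple declares the output name and dtype up front because dask cannot infer them lazily. A runnable sketch:

import dask.dataframe as dd
import pandas as pd

s = dd.from_pandas(pd.Series(["2023-04-05", "2023-04-06"], name="d"), npartitions=2)
# meta=("d", "datetime64[ns]") tells dask the result is a datetime series.
parsed = s.map_partitions(pd.to_datetime, meta=("d", "datetime64[ns]"))
print(parsed.compute())
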
(diff truncated; the remaining 28 changed files are not shown)
