fix(deps): support pandas 2.0
cpcloud committed Apr 5, 2023
1 parent 523e198 commit 4f1d9fe
Showing 37 changed files with 388 additions and 237 deletions.
112 changes: 82 additions & 30 deletions .github/workflows/ibis-backends.yml
@@ -144,30 +144,6 @@ jobs:
- druid
services:
- druid
include:
- os: ubuntu-latest
python-version: "3.8"
backend:
name: pyspark
title: PySpark
serial: true
extras:
- pyspark
- os: ubuntu-latest
python-version: "3.10"
backend:
name: pyspark
title: PySpark
serial: true
extras:
- pyspark
- os: ubuntu-latest
python-version: "3.11"
backend:
name: pyspark
title: PySpark
extras:
- pyspark
exclude:
- os: windows-latest
backend:
@@ -267,12 +243,6 @@ jobs:
if: matrix.os == 'windows-latest' && matrix.backend.name == 'sqlite'
run: choco install sqlite

- uses: actions/setup-java@v3
if: matrix.backend.name == 'pyspark'
with:
distribution: microsoft
java-version: 17

- name: checkout
uses: actions/checkout@v3

@@ -362,6 +332,7 @@ jobs:
title: Dask
deps:
- "dask[array,dataframe]@2022.9.1"
- "pandas@1.5.3"
extras:
- dask
- name: postgres
@@ -467,6 +438,86 @@ jobs:
if: ${{ failure() }}
run: docker compose logs

test_pyspark:
name: PySpark ${{ matrix.os }} python-${{ matrix.python-version }} pandas<${{ matrix.pandas-upper-bound }} numpy<${{ matrix.numpy-upper-bound }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os:
- ubuntu-latest
python-version:
- "3.10"
pandas-upper-bound:
- "2"
- "3"
numpy-upper-bound:
- "1.24"
- "2"
include:
- os: ubuntu-latest
python-version: "3.8"
pandas-upper-bound: "2"
numpy-upper-bound: "1.24"
- os: ubuntu-latest
python-version: "3.11"
pandas-upper-bound: "2"
numpy-upper-bound: "1.24"
steps:
- name: checkout
uses: actions/checkout@v3

- uses: actions/setup-java@v3
with:
distribution: microsoft
java-version: 17

- uses: extractions/setup-just@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: download backend data
run: just download-data

- name: install python
uses: actions/setup-python@v4
id: install_python
with:
python-version: ${{ matrix.python-version }}

- run: python -m pip install --upgrade pip 'poetry<1.4'

- name: install minimum versions
run: poetry add --lock 'pandas>=1.2.5,<${{ matrix.pandas-upper-bound }}' 'numpy>=1,<${{ matrix.numpy-upper-bound }}'

- name: checkout the lock file
run: git checkout poetry.lock

- name: lock with no updates
# poetry add is aggressive and will update other dependencies like
# numpy and pandas, so we keep the pyproject.toml edits and then relock
# without updating anything except the requested versions
run: poetry lock --no-update

- name: install ibis
run: poetry install --without dev --without docs --extras pyspark

- name: run tests
run: just ci-check -m pyspark --numprocesses auto --dist=loadgroup

- name: upload code coverage
if: success()
uses: codecov/codecov-action@v3
with:
flags: backend,pyspark,${{ runner.os }},python-${{ steps.install_python.outputs.python-version }},pandas-upper-bound-${{ matrix.pandas-upper-bound }},numpy-upper-bound-${{ matrix.numpy-upper-bound }}

- name: publish test report
uses: actions/upload-artifact@v3
if: success() || failure()
with:
name: pyspark-${{ matrix.os }}-${{ matrix.python-version }}
path: junit.xml

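
The matrix crosses two pandas upper bounds against two numpy upper bounds on Python 3.10, and the include entries pin one extra combination each for Python 3.8 and 3.11, six jobs in total. A hedged sketch of that expansion (illustrative only, not part of the workflow):

from itertools import product

# Base matrix: python 3.10 against every pandas/numpy bound combination.
base = [("3.10", pd_b, np_b) for pd_b, np_b in product(["2", "3"], ["1.24", "2"])]
# include: adds two pinned combinations rather than a full cross product.
extra = [("3.8", "2", "1.24"), ("3.11", "2", "1.24")]
for py, pd_b, np_b in base + extra:
    print(f"PySpark ubuntu-latest python-{py} pandas<{pd_b} numpy<{np_b}")

The add-then-relock sequence in the steps exists because poetry add rewrites pyproject.toml but would also bump unrelated pins; restoring poetry.lock and running poetry lock --no-update keeps every other dependency where it was.
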
gen_lockfile_sqlalchemy2:
name: Generate Poetry Lockfile for SQLAlchemy 2
runs-on: ubuntu-latest
@@ -634,5 +685,6 @@ jobs:
- test_backends_min_version
- test_backends
- test_backends_sqlalchemy2
- test_pyspark
steps:
- run: exit 0
2 changes: 1 addition & 1 deletion ci/schema/sqlite.sql
@@ -13,7 +13,7 @@ CREATE TABLE functional_alltypes (
double_col REAL,
date_string_col TEXT,
string_col TEXT,
timestamp_col TEXT,
timestamp_col TIMESTAMP,
year BIGINT,
month BIGINT,
CHECK (bool_col IN (0, 1))
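
The TEXT-to-TIMESTAMP change matters even though SQLite stores the value as text either way: the declared type is what drivers and schema inference key off when mapping the column to a timestamp dtype. A minimal sketch using only the standard library:

import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE t (ts TIMESTAMP)")
con.execute("INSERT INTO t VALUES ('2023-04-05 12:00:00')")
# The declared type is TIMESTAMP, but SQLite's type affinity stores text.
print(con.execute("SELECT ts FROM t").fetchall())  # [('2023-04-05 12:00:00',)]
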
2 changes: 1 addition & 1 deletion ibis/backends/base/sql/alchemy/registry.py
@@ -175,7 +175,7 @@ def _cast(t, op):

# specialize going from an integer type to a timestamp
if arg_dtype.is_integer() and typ.is_timestamp():
return t.integer_to_timestamp(sa_arg)
return t.integer_to_timestamp(sa_arg, tz=typ.timezone)

if arg_dtype.is_binary() and typ.is_string():
return sa.func.encode(sa_arg, 'escape')
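
With typ.timezone threaded through, casting an integer column to a timezone-aware timestamp type preserves the zone instead of producing a naive timestamp. A hedged expression-level sketch (the table is hypothetical):

import ibis
import ibis.expr.datatypes as dt

t = ibis.table([("epoch_s", "int64")], name="events")  # hypothetical table
# The timezone on the target dtype now reaches integer_to_timestamp.
expr = t.epoch_s.cast(dt.Timestamp(timezone="UTC"))
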
9 changes: 8 additions & 1 deletion ibis/backends/base/sql/alchemy/translator.py
@@ -44,8 +44,15 @@ class AlchemyExprTranslator(ExprTranslator):

_bool_aggs_need_cast_to_int32 = True
_has_reduction_filter_syntax = False
_integer_to_timestamp = staticmethod(sa.func.to_timestamp)
_timestamp_type = sa.TIMESTAMP

def integer_to_timestamp(self, arg, tz: str | None = None):
return sa.cast(
self._integer_to_timestamp(arg),
self._timestamp_type(timezone=tz is not None),
)

integer_to_timestamp = sa.func.to_timestamp
native_json_type = True
_always_quote_columns = None # let the dialect decide how to quote

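
A sketch of the SQLAlchemy expression the new integer_to_timestamp helper builds, assuming a PostgreSQL-style to_timestamp function:

import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

arg = sa.column("epoch_s")
# Wrap the conversion in a CAST whose target type is tz-aware exactly when
# the ibis type carries a timezone.
expr = sa.cast(sa.func.to_timestamp(arg), sa.TIMESTAMP(timezone=True))
print(expr.compile(dialect=postgresql.dialect()))
# CAST(to_timestamp(epoch_s) AS TIMESTAMP WITH TIME ZONE)
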
10 changes: 4 additions & 6 deletions ibis/backends/bigquery/tests/system/test_client.py
@@ -7,6 +7,7 @@
import pandas.testing as tm
import pytest
import pytz
import toolz

import ibis
import ibis.expr.datatypes as dt
@@ -73,15 +74,12 @@ def test_count_distinct_with_filter(alltypes):
assert result == expected


@pytest.mark.parametrize("type", ["date", dt.date])
def test_cast_string_to_date(alltypes, df, type):
import toolz

def test_cast_string_to_date(alltypes, df):
string_col = alltypes.date_string_col
month, day, year = toolz.take(3, string_col.split("/"))

expr = "20" + ibis.literal("-").join([year, month, day])
expr = expr.cast(type)
expr = expr.cast("date")

result = (
expr.execute()
@@ -91,7 +89,7 @@ def test_cast_string_to_date(alltypes, df, type):
.rename("date_string_col")
)
expected = (
pd.to_datetime(df.date_string_col)
pd.to_datetime(df.date_string_col, format="%m/%d/%y")
.dt.normalize()
.sort_values()
.reset_index(drop=True)
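
Spelling out format= is the pandas 2.0-friendly idiom: 2.0 deprecates infer_datetime_format and parses much more strictly by default, so an explicit format avoids ambiguous month/day guessing. A standalone example with the same %m/%d/%y layout:

import pandas as pd

s = pd.Series(["11/01/10", "02/14/11"], name="date_string_col")
parsed = pd.to_datetime(s, format="%m/%d/%y")  # month/day/two-digit-year
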
21 changes: 12 additions & 9 deletions ibis/backends/clickhouse/compiler/values.py
@@ -77,7 +77,10 @@ def _cast(op, **kw):
return f"toInterval{suffix}({arg})"

to = translate_val(op.to, **kw)
return f"CAST({arg} AS {to})"
result = f"CAST({arg} AS {to})"
if (timezone := getattr(op.to, "timezone", None)) is not None:
return f"toTimeZone({result}, {timezone!r})"
return result


@translate_val.register(ops.Between)
@@ -575,11 +578,11 @@ def _date_from_ymd(op, **kw):
m = translate_val(op.month, **kw)
d = translate_val(op.day, **kw)
return (
f"toDate(concat("
"toDate(concat("
f"toString({y}), '-', "
f"leftPad(toString({m}), 2, '0'), '-', "
f"leftPad(toString({d}), 2, '0')"
f"))"
"))"
)


@@ -591,20 +594,20 @@ def _timestamp_from_ymdhms(op, **kw):
h = translate_val(op.hours, **kw)
min = translate_val(op.minutes, **kw)
s = translate_val(op.seconds, **kw)
timezone_arg = ''
if timezone := op.output_dtype.timezone:
timezone_arg = f', {timezone}'

return (
f"toDateTime("
to_datetime = (
"toDateTime("
f"concat(toString({y}), '-', "
f"leftPad(toString({m}), 2, '0'), '-', "
f"leftPad(toString({d}), 2, '0'), ' ', "
f"leftPad(toString({h}), 2, '0'), ':', "
f"leftPad(toString({min}), 2, '0'), ':', "
f"leftPad(toString({s}), 2, '0')"
f"), {timezone_arg})"
"))"
)
if timezone := op.output_dtype.timezone:
return f"toTimeZone({to_datetime}, {timezone})"
return to_datetime


@translate_val.register(ops.ExistsSubquery)
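
Both hunks share one pattern: build the naive ClickHouse expression first, then wrap it in toTimeZone(...) only when the target dtype carries a zone. A self-contained sketch of that string-building logic (the function name is illustrative):

from __future__ import annotations

def cast_sql(arg: str, to: str, timezone: str | None) -> str:
    # Emit the plain cast, then attach the zone as an outer wrapper so the
    # two concerns stay independent.
    result = f"CAST({arg} AS {to})"
    if timezone is not None:
        return f"toTimeZone({result}, {timezone!r})"
    return result

print(cast_sql("ts_col", "DateTime", "UTC"))
# toTimeZone(CAST(ts_col AS DateTime), 'UTC')
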
9 changes: 7 additions & 2 deletions ibis/backends/conftest.py
@@ -12,6 +12,7 @@
from typing import Any, TextIO

import _pytest
import numpy as np
import pandas as pd
import pytest
import sqlalchemy as sa
@@ -307,8 +308,12 @@ def pytest_collection_modifyitems(session, config, items):
item,
pytest.mark.xfail(
(
sys.version_info >= (3, 11)
and not isinstance(item, pytest.DoctestItem)
not isinstance(item, pytest.DoctestItem)
and (
sys.version_info >= (3, 11)
or vparse(pd.__version__) >= vparse("2")
or vparse(np.__version__) >= vparse("1.24")
)
),
reason="PySpark doesn't support Python 3.11",
),
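
The widened gate xfails every non-doctest PySpark test once any of the three incompatibilities holds. Roughly, assuming vparse is packaging.version.parse as in the rest of the file:

import sys

import numpy as np
import pandas as pd
from packaging.version import parse as vparse

# PySpark, as of this commit, breaks on Python 3.11, pandas 2, and numpy 1.24.
incompatible = (
    sys.version_info >= (3, 11)
    or vparse(pd.__version__) >= vparse("2")
    or vparse(np.__version__) >= vparse("1.24")
)
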
33 changes: 23 additions & 10 deletions ibis/backends/dask/client.py
@@ -4,18 +4,15 @@

import dask.dataframe as dd
import numpy as np
import pandas as pd
from dateutil.parser import parse as date_parse
from pandas.api.types import DatetimeTZDtype

import ibis.expr.datatypes as dt
import ibis.expr.operations as ops
import ibis.expr.schema as sch
from ibis.backends.base import Database
from ibis.backends.pandas.client import (
PANDAS_DATE_TYPES,
PANDAS_STRING_TYPES,
ibis_dtype_to_pandas,
ibis_schema_to_pandas,
)
from ibis.backends.pandas.client import ibis_dtype_to_pandas, ibis_schema_to_pandas


@sch.schema.register(dd.Series)
@@ -54,15 +51,31 @@ def infer_dask_schema(df, schema=None):


@sch.convert.register(DatetimeTZDtype, dt.Timestamp, dd.Series)
def convert_datetimetz_to_timestamp(in_dtype, out_dtype, column):
def convert_datetimetz_to_timestamp(_, out_dtype, column):
output_timezone = out_dtype.timezone
if output_timezone is not None:
return column.dt.tz_convert(output_timezone)
return column.astype(out_dtype.to_dask())
else:
return column.dt.tz_localize(None)


DASK_STRING_TYPES = PANDAS_STRING_TYPES
DASK_DATE_TYPES = PANDAS_DATE_TYPES
@sch.convert.register(np.dtype, dt.Timestamp, dd.Series)
def convert_any_to_timestamp(_, out_dtype, column):
if isinstance(dtype := out_dtype.to_dask(), DatetimeTZDtype):
column = dd.to_datetime(column)
timezone = out_dtype.timezone
if getattr(column.dtype, "tz", None) is not None:
return column.dt.tz_convert(timezone)
else:
return column.dt.tz_localize(timezone)
else:
try:
return column.astype(dtype)
except pd.errors.OutOfBoundsDatetime:
try:
return column.map(date_parse)
except TypeError:
return column


@sch.convert.register(np.dtype, dt.Interval, dd.Series)
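
The converters above rely on pandas' split between converting and localizing: tz-aware data moves between zones with tz_convert, naive data gets a zone attached with tz_localize, and tz_localize(None) drops the zone while keeping wall-clock time. A quick illustration:

import pandas as pd

naive = pd.Series(pd.to_datetime(["2023-04-05 12:00:00"]))
aware = naive.dt.tz_localize("UTC")          # attach a zone to naive stamps
eastern = aware.dt.tz_convert("US/Eastern")  # move aware stamps between zones
stripped = aware.dt.tz_localize(None)        # drop the zone, keep wall time
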
23 changes: 13 additions & 10 deletions ibis/backends/dask/execution/generic.py
@@ -271,7 +271,7 @@ def execute_cast_scalar_timestamp(op, data, type, **kwargs):

def cast_series_to_timestamp(data, tz):
if pd.api.types.is_string_dtype(data):
timestamps = to_datetime(data, infer_datetime_format=True)
timestamps = to_datetime(data)
else:
timestamps = to_datetime(data, unit="s")
if getattr(timestamps.dtype, "tz", None) is not None:
@@ -290,10 +290,17 @@ def execute_cast_series_timestamp(op, data, type, **kwargs):
tz = type.timezone
dtype = 'M8[ns]' if tz is None else DatetimeTZDtype('ns', tz)

if from_type.is_timestamp() or from_type.is_date():
return data.astype(dtype)

if from_type.is_string() or from_type.is_integer():
if from_type.is_timestamp():
from_tz = from_type.timezone
if tz is None and from_tz is None:
return data
elif tz is None or from_tz is None:
return data.dt.tz_localize(tz)
elif tz is not None and from_tz is not None:
return data.dt.tz_convert(tz)
elif from_type.is_date():
return data if tz is None else data.dt.tz_localize(tz)
elif from_type.is_string() or from_type.is_integer():
return data.map_partitions(
cast_series_to_timestamp,
tz,
@@ -319,11 +326,7 @@ def execute_cast_series_date(op, data, type, **kwargs):

if from_type.equals(dt.string):
# TODO - this is broken
datetimes = data.map_partitions(
to_datetime,
infer_datetime_format=True,
meta=(data.name, 'datetime64[ns]'),
)
datetimes = data.map_partitions(to_datetime, meta=(data.name, 'datetime64[ns]'))

# TODO - we are getting rid of the index here
return datetimes.dt.normalize()
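
The map_partitions(to_datetime, meta=...) call parses each pandas partition independently; the meta tuple declares the output name and dtype up front because dask cannot infer them lazily. A runnable sketch:

import dask.dataframe as dd
import pandas as pd

s = dd.from_pandas(pd.Series(["2023-04-05", "2023-04-06"], name="d"), npartitions=2)
# meta=("d", "datetime64[ns]") tells dask the result is a datetime series.
parsed = s.map_partitions(pd.to_datetime, meta=("d", "datetime64[ns]"))
print(parsed.compute())
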
(diff truncated; the remaining 28 changed files are not shown)
