feat: support arrays and tuples for clickhouse
add support for array index op

add method for converting a dtype to a typename

add support for array concat op

update array index op

it now translates exprs via the translator

fix formatting

add support for array repeat op

add support for array slice op

add DateTime64 to clickhouse dtypes

update test_array_index to check negative indices

add support for negative slice indices

use f-string in array repeat op

refactor to_ibis method

style: one-line expr for array repeat

Co-authored-by: Phillip Cloud <417981+cpcloud@users.noreply.github.com>

test: remove xfail on array slice and index ops

removed for:
* dask
* pandas
* postgres
* pyspark

fix: add negative index support for postgres

style: remove unused module
hikitani authored and cpcloud committed Jan 13, 2022
1 parent 2ba540d commit db512a8
Showing 10 changed files with 234 additions and 179 deletions.
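
For orientation, a minimal sketch of the expressions this commit enables on the ClickHouse backend (the literal values are illustrative, mirroring the new tests rather than the diff itself):

from ibis import literal as L

arr = L([1, 2, 3])

arr[0]        # array index; 0-based in ibis, shifted to ClickHouse's 1-based arrayElement
arr[-1]       # negative indexes pass through unchanged (ClickHouse counts them from the end)
arr + L([4])  # array concat, translated to arrayConcat
arr * 2       # array repeat, translated via a subquery over system.numbers
arr[1:3]      # array slice, translated to arraySlice
arr[-3:-1]    # negative slice bounds are supported as well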
44 changes: 38 additions & 6 deletions ibis/backends/clickhouse/client.py
@@ -28,23 +28,29 @@
'FixedString': dt.String,
'Date': dt.Date,
'DateTime': dt.Timestamp,
'DateTime64': dt.Timestamp,
'Array': dt.Array,
}
_ibis_dtypes = {v: k for k, v in _clickhouse_dtypes.items()}
_ibis_dtypes[dt.String] = 'String'
_ibis_dtypes[dt.Timestamp] = 'DateTime'


class ClickhouseDataType:

__slots__ = 'typename', 'nullable'
__slots__ = 'typename', 'base_typename', 'nullable'

def __init__(self, typename, nullable=False):
m = base_typename_re.match(typename)
base_typename = m.groups()[0]
if base_typename not in _clickhouse_dtypes:
self.base_typename = m.groups()[0]
if self.base_typename not in _clickhouse_dtypes:
raise com.UnsupportedBackendType(typename)
self.typename = base_typename
self.typename = self.base_typename
self.nullable = nullable

if self.base_typename == 'Array':
self.typename = typename

def __str__(self):
if self.nullable:
return f'Nullable({self.typename})'
@@ -63,11 +69,37 @@ def parse(cls, spec):
return cls(spec)

def to_ibis(self):
return _clickhouse_dtypes[self.typename](nullable=self.nullable)
if self.base_typename != 'Array':
return _clickhouse_dtypes[self.typename](nullable=self.nullable)

sub_type = ClickhouseDataType(
self.get_subname(self.typename)
).to_ibis()
return dt.Array(value_type=sub_type)

@staticmethod
def get_subname(name: str) -> str:
lbracket_pos = name.find('(')
rbracket_pos = name.rfind(')')

if lbracket_pos == -1 or rbracket_pos == -1:
return ''

subname = name[lbracket_pos + 1 : rbracket_pos]
return subname

@staticmethod
def get_typename_from_ibis_dtype(dtype):
if not isinstance(dtype, dt.Array):
return _ibis_dtypes[type(dtype)]

return 'Array({})'.format(
ClickhouseDataType.get_typename_from_ibis_dtype(dtype.value_type)
)

@classmethod
def from_ibis(cls, dtype, nullable=None):
typename = _ibis_dtypes[type(dtype)]
typename = ClickhouseDataType.get_typename_from_ibis_dtype(dtype)
if nullable is None:
nullable = dtype.nullable
return cls(typename, nullable=nullable)
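As a rough illustration of the conversion path added above (a sketch, not part of the diff; the expected values follow the new test_types.py cases):

import ibis.expr.datatypes as dt
from ibis.backends.clickhouse.client import ClickhouseDataType

# ClickHouse -> ibis: nested Array(...) type names are unwrapped recursively
ClickhouseDataType('Array(Array(Int8))').to_ibis()
# == dt.Array(dt.Array(dt.Int8(nullable=False)))

# ibis -> ClickHouse: the type name is rebuilt from the ibis value type
ClickhouseDataType.from_ibis(dt.Array(dt.Int8())).typename
# == 'Array(Int8)'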
56 changes: 56 additions & 0 deletions ibis/backends/clickhouse/registry.py
@@ -74,6 +74,58 @@ def formatter(translator, expr):
return formatter


def _array_index_op(translator, expr):
op = expr.op()

arr = op.args[0]
idx = op.args[1]

arr_ = translator.translate(arr)
idx_ = _parenthesize(translator, idx)

correct_idx = f'if({idx_} >= 0, {idx_} + 1, {idx_})'

return f'arrayElement({arr_}, {correct_idx})'


def _array_repeat_op(translator, expr):
op = expr.op()
arr, times = op.args

arr_ = _parenthesize(translator, arr)
times_ = _parenthesize(translator, times)

select = 'arrayFlatten(groupArray(arr))'
from_ = f'(select {arr_} as arr from system.numbers limit {times_})'
return f'(select {select} from {from_})'


def _array_slice_op(translator, expr):
op = expr.op()
arg, start, stop = op.args

start_ = _parenthesize(translator, start)
arg_ = translator.translate(arg)

start_correct_ = f'if({start_} < 0, {start_}, {start_} + 1)'

if stop is not None:
stop_ = _parenthesize(translator, stop)

cast_arg_ = f'if({arg_} = [], CAST({arg_} AS Array(UInt8)), {arg_})'
neg_start_ = f'(arrayCount({cast_arg_}) + {start_})'
diff_fmt = f'greatest(-0, {stop_} - {{}})'.format

length_ = (
f'if({stop_} < 0, {stop_}, '
+ f'if({start_} < 0, {diff_fmt(neg_start_)}, {diff_fmt(start_)}))'
)

return f'arraySlice({arg_}, {start_correct_}, {length_})'

return f'arraySlice({arg_}, {start_correct_})'


def _agg(func):
def formatter(translator, expr):
return _aggregate(translator, func, *expr.op().args)
@@ -644,6 +696,10 @@ def _group_concat(translator, expr):
ops.ExistsSubquery: _exists_subquery,
ops.NotExistsSubquery: _exists_subquery,
ops.ArrayLength: _unary('length'),
ops.ArrayIndex: _array_index_op,
ops.ArrayConcat: _fixed_arity('arrayConcat', 2),
ops.ArrayRepeat: _array_repeat_op,
ops.ArraySlice: _array_slice_op,
}


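To make the new translation rules concrete, here is a small Python restatement of the index correction they emit, together with the rough shape of the generated SQL (the <...> placeholders stand for translated sub-expressions and are not part of the commit):

# ClickHouse's arrayElement is 1-based for non-negative offsets and counts
# negative offsets from the end, so only indexes >= 0 are shifted:
def corrected_index(idx: int) -> int:
    return idx + 1 if idx >= 0 else idx

assert corrected_index(0) == 1    # first element
assert corrected_index(2) == 3
assert corrected_index(-1) == -1  # last element, unchanged

# _array_index_op therefore renders roughly:
#   arrayElement(<arr>, if(<idx> >= 0, <idx> + 1, <idx>))
# and _array_repeat_op builds a subquery over system.numbers:
#   (select arrayFlatten(groupArray(arr))
#    from (select <arr> as arr from system.numbers limit <times>))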
62 changes: 62 additions & 0 deletions ibis/backends/clickhouse/tests/test_operators.py
@@ -273,3 +273,65 @@ def test_search_case(con, alltypes, translate):
END"""
assert translate(expr) == expected
assert len(con.execute(expr))


@pytest.mark.parametrize(
'arr',
[
[1, 2, 3],
['qw', 'wq', '1'],
[1.2, 0.3, 0.4],
[[1], [1, 2], [1, 2, 3]],
],
)
@pytest.mark.parametrize(
'ids',
[
lambda arr: range(len(arr)),
lambda arr: range(-len(arr), 0),
],
)
def test_array_index(con, arr, ids):
expr = L(arr)
for i in ids(arr):
el_expr = expr[i]
el = con.execute(el_expr)
assert el == arr[i]


@pytest.mark.parametrize(
'arrays',
[
([1], [2]),
([1], [1, 2]),
([1, 2], [1]),
([1, 2], [3, 4]),
([1, 2], [3, 4], [5, 6]),
],
)
def test_array_concat(con, arrays):
expr = L([]).cast(dt.Array(dt.int8))
expected = sum(arrays, [])
for arr in arrays:
expr += L(arr)

assert con.execute(expr) == expected


@pytest.mark.parametrize(
('arr', 'times'),
[([1], 1), ([1], 2), ([1], 3), ([1, 2], 1), ([1, 2], 2), ([1, 2], 3)],
)
def test_array_repeat(con, arr, times):
expected = arr * times
expr = L(arr)

assert con.execute(expr * times) == expected


@pytest.mark.parametrize('arr', [[], [1], [1, 2, 3, 4, 5, 6]])
@pytest.mark.parametrize('start', [None, 0, 1, 2, -1, -3])
@pytest.mark.parametrize('stop', [None, 0, 1, 3, -2, -4])
def test_array_slice(con, arr, start, stop):
expr = L(arr)
assert con.execute(expr[start:stop]) == arr[start:stop]
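
The slice tests use plain Python list slicing as the reference semantics; for instance, with values taken from the parametrization above:

arr = [1, 2, 3, 4, 5, 6]
assert arr[-3:None] == [4, 5, 6]  # what con.execute(L(arr)[-3:]) is expected to return
assert arr[1:-2] == [2, 3, 4]
assert arr[None:0] == []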
39 changes: 39 additions & 0 deletions ibis/backends/clickhouse/tests/test_types.py
@@ -1,5 +1,9 @@
import pytest
from pkg_resources import parse_version

import ibis.expr.datatypes as dt
from ibis.backends.clickhouse.client import ClickhouseDataType


def test_column_types(alltypes):
df = alltypes.execute()
@@ -23,3 +27,38 @@ def test_columns_types_with_additional_argument(con):
assert df.fixedstring_col.dtype.name == 'object'
if parse_version(con.version).base_version >= '1.1.54337':
assert df.datetime_col.dtype.name == 'datetime64[ns]'


@pytest.mark.parametrize(
('ch_type', 'ibis_type'),
[
('Array(Int8)', dt.Array(dt.Int8(nullable=False))),
('Array(Int16)', dt.Array(dt.Int16(nullable=False))),
('Array(Int32)', dt.Array(dt.Int32(nullable=False))),
('Array(Int64)', dt.Array(dt.Int64(nullable=False))),
('Array(UInt8)', dt.Array(dt.UInt8(nullable=False))),
('Array(UInt16)', dt.Array(dt.UInt16(nullable=False))),
('Array(UInt32)', dt.Array(dt.UInt32(nullable=False))),
('Array(UInt64)', dt.Array(dt.UInt64(nullable=False))),
('Array(Float32)', dt.Array(dt.Float32(nullable=False))),
('Array(Float64)', dt.Array(dt.Float64(nullable=False))),
('Array(String)', dt.Array(dt.String(nullable=False))),
('Array(FixedString(32))', dt.Array(dt.String(nullable=False))),
('Array(Date)', dt.Array(dt.Date(nullable=False))),
('Array(DateTime)', dt.Array(dt.Timestamp(nullable=False))),
('Array(DateTime64)', dt.Array(dt.Timestamp(nullable=False))),
('Array(Nothing)', dt.Array(dt.Null(nullable=False))),
('Array(Null)', dt.Array(dt.Null(nullable=False))),
('Array(Array(Int8))', dt.Array(dt.Array(dt.Int8(nullable=False)))),
(
'Array(Array(Array(Int8)))',
dt.Array(dt.Array(dt.Array(dt.Int8(nullable=False)))),
),
(
'Array(Array(Array(Array(Int8))))',
dt.Array(dt.Array(dt.Array(dt.Array(dt.Int8(nullable=False))))),
),
],
)
def test_array_type(ch_type, ibis_type):
assert ClickhouseDataType(ch_type).to_ibis() == ibis_type
52 changes: 6 additions & 46 deletions ibis/backends/dask/tests/execution/test_arrays.py
@@ -5,7 +5,6 @@
import pandas as pd
import pytest
from dask.dataframe.utils import tm
from pytest import param

import ibis
from ibis.common.exceptions import IbisTypeError
@@ -103,29 +102,9 @@ def test_array_collect_scalar(client):
(None, 3),
(None, None),
(3, None),
# negative slices are not supported
# TODO: uncomment once test as a whole is not xfailed
# param(
# -3,
# None,
# marks=pytest.mark.xfail(
# raises=ValueError, reason='Negative slicing not supported'
# ),
# ),
# param(
# None,
# -3,
# marks=pytest.mark.xfail(
# raises=ValueError, reason='Negative slicing not supported'
# ),
# ),
# param(
# -3,
# -1,
# marks=pytest.mark.xfail(
# raises=ValueError, reason='Negative slicing not supported'
# ),
# ),
(-3, None),
(None, -3),
(-3, -1),
],
)
def test_array_slice(t, df, start, stop):
@@ -146,28 +125,9 @@ def test_array_slice(t, df, start, stop):
(None, 3),
(None, None),
(3, None),
# negative slices are not supported
param(
-3,
None,
marks=pytest.mark.xfail(
raises=ValueError, reason='Negative slicing not supported'
),
),
param(
None,
-3,
marks=pytest.mark.xfail(
raises=ValueError, reason='Negative slicing not supported'
),
),
param(
-3,
-1,
marks=pytest.mark.xfail(
raises=ValueError, reason='Negative slicing not supported'
),
),
(-3, None),
(None, -3),
(-3, -1),
],
)
def test_array_slice_scalar(client, start, stop):
