From 80f4ab9908bba36b6ddca10809ef498c967c7401 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 23 Apr 2022 11:24:42 -0400 Subject: [PATCH] feat(clickhouse): implement proper type serialization --- docker-compose.yml | 2 +- ibis/backends/clickhouse/__init__.py | 26 +-- ibis/backends/clickhouse/client.py | 140 +---------- ibis/backends/clickhouse/datatypes.py | 220 ++++++++++++++++++ ibis/backends/clickhouse/registry.py | 8 +- .../clickhouse/tests/test_functions.py | 53 +++-- .../clickhouse/tests/test_operators.py | 4 +- ibis/backends/clickhouse/tests/test_select.py | 2 +- ibis/backends/clickhouse/tests/test_types.py | 145 +++++++++--- ibis/backends/duckdb/__init__.py | 9 +- ibis/backends/duckdb/datatypes.py | 83 ++++--- ibis/backends/duckdb/tests/test_datatypes.py | 9 +- ibis/backends/postgres/datatypes.py | 2 +- ibis/backends/tests/test_array.py | 9 +- ibis/expr/datatypes/core.py | 148 +++++++----- ibis/tests/benchmarks/test_benchmarks.py | 4 +- ibis/tests/expr/test_datatypes.py | 5 + poetry.lock | 4 +- pyproject.toml | 4 +- setup.py | 4 +- 20 files changed, 557 insertions(+), 324 deletions(-) create mode 100644 ibis/backends/clickhouse/datatypes.py diff --git a/docker-compose.yml b/docker-compose.yml index 8a0041e2413d..670ec3c4e1e6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,7 +1,7 @@ version: "3.4" services: clickhouse: - image: yandex/clickhouse-server:22-alpine + image: clickhouse/clickhouse-server:22-alpine ports: - 8123:8123 - 9000:9000 diff --git a/ibis/backends/clickhouse/__init__.py b/ibis/backends/clickhouse/__init__.py index 30480388c31a..5e3fa644c7e6 100644 --- a/ibis/backends/clickhouse/__init__.py +++ b/ibis/backends/clickhouse/__init__.py @@ -11,12 +11,9 @@ import ibis.config import ibis.expr.schema as sch from ibis.backends.base.sql import BaseSQLBackend -from ibis.backends.clickhouse.client import ( - ClickhouseDataType, - ClickhouseTable, - fully_qualified_re, -) +from ibis.backends.clickhouse.client import ClickhouseTable, fully_qualified_re from ibis.backends.clickhouse.compiler import ClickhouseCompiler +from ibis.backends.clickhouse.datatypes import parse, serialize from ibis.config import options _default_compression: str | bool @@ -109,12 +106,12 @@ def current_database(self): return self.con.connection.database def list_databases(self, like=None): - data, schema = self.raw_sql('SELECT name FROM system.databases') + data, _ = self.raw_sql('SELECT name FROM system.databases') databases = list(data[0]) return self._filter_with_like(databases, like) def list_tables(self, like=None, database=None): - data, schema = self.raw_sql('SHOW TABLES') + data, _ = self.raw_sql('SHOW TABLES') databases = list(data[0]) return self._filter_with_like(databases, like) @@ -152,13 +149,7 @@ def raw_sql( 'name': name, 'data': df.to_dict('records'), 'structure': list( - zip( - schema.names, - [ - str(ClickhouseDataType.from_ibis(t)) - for t in schema.types - ], - ) + zip(schema.names, map(serialize, schema.types)) ), } ) @@ -216,9 +207,8 @@ def get_schema( (column_names, types, *_), *_ = self.raw_sql( f"DESCRIBE {qualified_name}" ) - return sch.Schema.from_tuples( - zip(column_names, map(ClickhouseDataType.parse, types)) - ) + + return sch.Schema.from_tuples(zip(column_names, map(parse, types))) def set_options(self, options): self.con.set_options(options) @@ -238,7 +228,7 @@ def _get_schema_using_query(self, query: str) -> sch.Schema: ) [plan] = json.loads(raw_plans) fields = [ - (field["Name"], 
ClickhouseDataType.parse(field["Type"]))
+        (field["Name"], parse(field["Type"]))
         for field in plan["Plan"]["Header"]
     ]
     return sch.Schema.from_tuples(fields)
diff --git a/ibis/backends/clickhouse/client.py b/ibis/backends/clickhouse/client.py
index c30de3284074..0bd6a2878742 100644
--- a/ibis/backends/clickhouse/client.py
+++ b/ibis/backends/clickhouse/client.py
@@ -4,111 +4,10 @@
 import numpy as np
 import pandas as pd
 
-import ibis.common.exceptions as com
 import ibis.expr.datatypes as dt
 import ibis.expr.types as ir
 
 fully_qualified_re = re.compile(r"(.*)\.(?:`(.*)`|(.*))")
-base_typename_re = re.compile(r"(\w+)")
-
-
-_clickhouse_dtypes = {
-    'Null': dt.Null,
-    'Nothing': dt.Null,
-    'UInt8': dt.UInt8,
-    'UInt16': dt.UInt16,
-    'UInt32': dt.UInt32,
-    'UInt64': dt.UInt64,
-    'Int8': dt.Int8,
-    'Int16': dt.Int16,
-    'Int32': dt.Int32,
-    'Int64': dt.Int64,
-    'Float32': dt.Float32,
-    'Float64': dt.Float64,
-    'String': dt.String,
-    'FixedString': dt.String,
-    'Date': dt.Date,
-    'DateTime': dt.Timestamp,
-    'DateTime64': dt.Timestamp,
-    'Array': dt.Array,
-}
-_ibis_dtypes = {v: k for k, v in _clickhouse_dtypes.items()}
-_ibis_dtypes[dt.String] = 'String'
-_ibis_dtypes[dt.Timestamp] = 'DateTime'
-
-
-class ClickhouseDataType:
-
-    __slots__ = 'typename', 'base_typename', 'nullable'
-
-    def __init__(self, typename, nullable=False):
-        m = base_typename_re.match(typename)
-        self.base_typename = m.groups()[0]
-        if self.base_typename not in _clickhouse_dtypes:
-            raise com.UnsupportedBackendType(typename)
-        self.typename = self.base_typename
-        self.nullable = nullable
-
-        if self.base_typename == 'Array':
-            self.typename = typename
-
-    def __str__(self):
-        if self.nullable:
-            return f'Nullable({self.typename})'
-        else:
-            return self.typename
-
-    def __repr__(self):
-        return f'<Clickhouse {str(self)}>'
-
-    @classmethod
-    def parse(cls, spec):
-        # TODO(kszucs): spare parsing, depends on clickhouse-driver#22
-        if spec.startswith('Nullable'):
-            return cls(spec[9:-1], nullable=True)
-        else:
-            return cls(spec)
-
-    def to_ibis(self):
-        if self.base_typename != 'Array':
-            return _clickhouse_dtypes[self.typename](nullable=self.nullable)
-
-        sub_type = ClickhouseDataType(
-            self.get_subname(self.typename)
-        ).to_ibis()
-        return dt.Array(value_type=sub_type)
-
-    @staticmethod
-    def get_subname(name: str) -> str:
-        lbracket_pos = name.find('(')
-        rbracket_pos = name.rfind(')')
-
-        if lbracket_pos == -1 or rbracket_pos == -1:
-            return ''
-
-        subname = name[lbracket_pos + 1 : rbracket_pos]
-        return subname
-
-    @staticmethod
-    def get_typename_from_ibis_dtype(dtype):
-        if not isinstance(dtype, dt.Array):
-            return _ibis_dtypes[type(dtype)]
-
-        return 'Array({})'.format(
-            ClickhouseDataType.get_typename_from_ibis_dtype(dtype.value_type)
-        )
-
-    @classmethod
-    def from_ibis(cls, dtype, nullable=None):
-        typename = ClickhouseDataType.get_typename_from_ibis_dtype(dtype)
-        if nullable is None:
-            nullable = dtype.nullable
-        return cls(typename, nullable=nullable)
-
-
-@dt.dtype.register(ClickhouseDataType)
-def clickhouse_to_ibis_dtype(clickhouse_dtype):
-    return clickhouse_dtype.to_ibis()
 
 
 class ClickhouseTable(ir.TableExpr):
 
     @property
     def _qualified_name(self):
-        return self.op().args[0]
-
-    @property
-    def _unqualified_name(self):
-        return self._match_name()[1]
+        return self.op().name
 
     @property
     def _client(self):
-        return self.op().args[2]
-
-    def _match_name(self):
-        m = fully_qualified_re.match(self._qualified_name)
-        if not m:
-            raise com.IbisError(
-                'Cannot determine database name from 
{}'.format( - self._qualified_name - ) - ) - db, quoted, unquoted = m.groups() - return db, quoted or unquoted - - @property - def _database(self): - return self._match_name()[0] + return self.op().source def invalidate_metadata(self): self._client.invalidate_metadata(self._qualified_name) @@ -168,10 +48,8 @@ def insert(self, obj, **kwargs): assert isinstance(obj, pd.DataFrame) assert set(schema.names) >= set(obj.columns) - columns = ', '.join(map(quote_identifier, obj.columns)) - query = 'INSERT INTO {table} ({columns}) VALUES'.format( - table=self._qualified_name, columns=columns - ) + columns = ", ".join(map(quote_identifier, obj.columns)) + query = f"INSERT INTO {self._qualified_name} ({columns}) VALUES" # convert data columns with datetime64 pandas dtype to native date # because clickhouse-driver 0.0.10 does arithmetic operations on it @@ -180,5 +58,11 @@ def insert(self, obj, **kwargs): if isinstance(schema[col], dt.Date): obj[col] = obj[col].dt.date - data = obj.to_dict('records') - return self._client.con.execute(query, data, **kwargs) + settings = kwargs.pop("settings", {}) + settings["use_numpy"] = True + return self._client.con.insert_dataframe( + query, + obj, + settings=settings, + **kwargs, + ) diff --git a/ibis/backends/clickhouse/datatypes.py b/ibis/backends/clickhouse/datatypes.py new file mode 100644 index 000000000000..7f20d98adce4 --- /dev/null +++ b/ibis/backends/clickhouse/datatypes.py @@ -0,0 +1,220 @@ +from __future__ import annotations + +import functools +from typing import TYPE_CHECKING + +import parsy as p + +if TYPE_CHECKING: + from ibis.expr.datatypes import DataType + +import ibis.expr.datatypes as dt + + +def parse(text: str) -> DataType: + @p.generate + def datetime(): + yield dt.spaceless_string("datetime64", "datetime") + timezone = yield parened_string.optional() + return dt.Timestamp(timezone=timezone, nullable=False) + + primitive = ( + datetime + | dt.spaceless_string("null", "nothing").result(dt.null) + | dt.spaceless_string("bigint", "int64").result( + dt.Int64(nullable=False) + ) + | dt.spaceless_string("double", "float64").result( + dt.Float64(nullable=False) + ) + | dt.spaceless_string("float32", "float").result( + dt.Float32(nullable=False) + ) + | dt.spaceless_string("smallint", "int16", "int2").result( + dt.Int16(nullable=False) + ) + | dt.spaceless_string("date32", "date").result(dt.Date(nullable=False)) + | dt.spaceless_string("time").result(dt.Time(nullable=False)) + | dt.spaceless_string( + "tinyint", "int8", "int1", "boolean", "bool" + ).result(dt.Int8(nullable=False)) + | dt.spaceless_string("integer", "int32", "int4", "int").result( + dt.Int32(nullable=False) + ) + | dt.spaceless_string("uint64").result(dt.UInt64(nullable=False)) + | dt.spaceless_string("uint32").result(dt.UInt32(nullable=False)) + | dt.spaceless_string("uint16").result(dt.UInt16(nullable=False)) + | dt.spaceless_string("uint8").result(dt.UInt8(nullable=False)) + | dt.spaceless_string("uuid").result(dt.UUID(nullable=False)) + | dt.spaceless_string( + "longtext", + "mediumtext", + "tinytext", + "text", + "longblob", + "mediumblob", + "tinyblob", + "blob", + "varchar", + "char", + "string", + ).result(dt.String(nullable=False)) + ) + + @p.generate + def parened_string(): + yield dt.LPAREN + s = yield dt.RAW_STRING + yield dt.RPAREN + return s + + @p.generate + def nullable(): + yield dt.spaceless_string("nullable") + yield dt.LPAREN + parsed_ty = yield ty + yield dt.RPAREN + return parsed_ty(nullable=True) + + @p.generate + def fixed_string(): + yield 
dt.spaceless_string("fixedstring") + yield dt.LPAREN + yield dt.NUMBER + yield dt.RPAREN + return dt.String(nullable=False) + + @p.generate + def decimal(): + yield dt.spaceless_string("decimal", "numeric") + precision, scale = yield dt.LPAREN.then( + p.seq(dt.PRECISION.skip(dt.COMMA), dt.SCALE) + ).skip(dt.RPAREN) + return dt.Decimal(precision, scale, nullable=False) + + @p.generate + def paren_type(): + yield dt.LPAREN + value_type = yield ty + yield dt.RPAREN + return value_type + + @p.generate + def array(): + yield dt.spaceless_string("array") + value_type = yield paren_type + return dt.Array(value_type, nullable=False) + + @p.generate + def map(): + yield dt.spaceless_string("map") + yield dt.LPAREN + key_type = yield ty + yield dt.COMMA + value_type = yield ty + yield dt.RPAREN + return dt.Map(key_type, value_type, nullable=False) + + at_least_one_space = p.regex(r"\s+") + + @p.generate + def nested(): + yield dt.spaceless_string("nested") + yield dt.LPAREN + + field_names_types = yield ( + p.seq(dt.SPACES.then(dt.FIELD.skip(at_least_one_space)), ty) + .combine(lambda field, ty: (field, dt.Array(ty, nullable=False))) + .sep_by(dt.COMMA) + ) + yield dt.RPAREN + return dt.Struct.from_tuples(field_names_types, nullable=False) + + @p.generate + def struct(): + yield dt.spaceless_string("tuple") + yield dt.LPAREN + field_names_types = yield ( + p.seq( + dt.SPACES.then(dt.FIELD.skip(at_least_one_space).optional()), + ty, + ) + .combine(lambda field, ty: (field, ty)) + .sep_by(dt.COMMA) + ) + yield dt.RPAREN + return dt.Struct.from_tuples( + [ + (field_name if field_name is not None else f"f{i:d}", typ) + for i, (field_name, typ) in enumerate(field_names_types) + ], + nullable=False, + ) + + ty = ( + nullable + | nested + | primitive + | fixed_string + | decimal + | array + | map + | struct + ) + return ty.parse(text) + + +@functools.singledispatch +def serialize(ty) -> str: + raise NotImplementedError( + f"{ty} not serializable to clickhouse type string" + ) + + +@serialize.register(dt.DataType) +def _(ty: dt.DataType) -> str: + ser_ty = serialize_raw(ty) + if ty.nullable: + return f"Nullable({ser_ty})" + return ser_ty + + +@functools.singledispatch +def serialize_raw(ty: dt.DataType) -> str: + raise NotImplementedError( + f"{ty} not serializable to clickhouse type string" + ) + + +@serialize_raw.register(dt.DataType) +def _(ty: dt.DataType) -> str: + return type(ty).__name__.capitalize() + + +@serialize_raw.register(dt.Array) +def _(ty: dt.Array) -> str: + return f"Array({serialize(ty.value_type)})" + + +@serialize_raw.register(dt.Map) +def _(ty: dt.Map) -> str: + key_type = serialize(ty.key_type) + value_type = serialize(ty.value_type) + return f"Map({key_type}, {value_type})" + + +@serialize_raw.register(dt.Struct) +def _(ty: dt.Struct) -> str: + fields = ", ".join( + f"{name} {serialize(field_ty)}" for name, field_ty in ty.pairs.items() + ) + return f"Tuple({fields})" + + +@serialize_raw.register(dt.Timestamp) +def _(ty: dt.Timestamp) -> str: + return ( + "DateTime64(6)" + if ty.timezone is None + else f"DateTime64(6, {ty.timezone!r})" + ) diff --git a/ibis/backends/clickhouse/registry.py b/ibis/backends/clickhouse/registry.py index 7f3f785b7ec3..7cb1db2abfa6 100644 --- a/ibis/backends/clickhouse/registry.py +++ b/ibis/backends/clickhouse/registry.py @@ -6,6 +6,7 @@ import ibis.expr.operations as ops import ibis.expr.types as ir import ibis.util as util +from ibis.backends.clickhouse.datatypes import serialize from ibis.backends.clickhouse.identifiers import quote_identifier # 
TODO(kszucs): should inherit operation registry from the base compiler
@@ -19,12 +20,11 @@ def _alias(translator, expr):
 
 
 def _cast(translator, expr):
-    from ibis.backends.clickhouse.client import ClickhouseDataType
-
     op = expr.op()
-    arg, target = op.args
+    arg = op.arg
+    target = op.to
     arg_ = translator.translate(arg)
-    type_ = str(ClickhouseDataType.from_ibis(target, nullable=False))
+    type_ = serialize(target)
 
     return f'CAST({arg_!s} AS {type_!s})'
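
As a quick sanity check on the serializer (a sketch, not part of the patch: traced by hand through the `serialize`/`serialize_raw` registrations added in ibis/backends/clickhouse/datatypes.py above, where a nullable type is wrapped in `Nullable(...)` and timestamps always carry microsecond precision):

    >>> import ibis.expr.datatypes as dt
    >>> from ibis.backends.clickhouse.datatypes import serialize
    >>> serialize(dt.int8)  # ibis types are nullable by default
    'Nullable(Int8)'
    >>> serialize(dt.Array(dt.int8, nullable=False))
    'Array(Nullable(Int8))'
    >>> serialize(dt.Timestamp(nullable=False))
    'DateTime64(6)'

This is exactly what the `_cast` translator above now embeds in generated SQL, and what the cast expectations in the tests below encode.
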
diff --git a/ibis/backends/clickhouse/tests/test_functions.py b/ibis/backends/clickhouse/tests/test_functions.py
index 6b91ea318efc..36e9e13bed1c 100644
--- a/ibis/backends/clickhouse/tests/test_functions.py
+++ b/ibis/backends/clickhouse/tests/test_functions.py
@@ -18,9 +18,11 @@
 @pytest.mark.parametrize(
     ('to_type', 'expected'),
     [
-        param('int8', 'CAST(`double_col` AS Int8)', id="int8"),
-        param('int16', 'CAST(`double_col` AS Int16)', id="int16"),
-        param('float32', 'CAST(`double_col` AS Float32)', id="float32"),
+        param('int8', 'CAST(`double_col` AS Nullable(Int8))', id="int8"),
+        param('int16', 'CAST(`double_col` AS Nullable(Int16))', id="int16"),
+        param(
+            'float32', 'CAST(`double_col` AS Nullable(Float32))', id="float32"
+        ),
         param('float', '`double_col`', id="float"),
         # alltypes.double_col is non-nullable
         param(
@@ -38,11 +40,22 @@ def test_cast_double_col(alltypes, translate, to_type, expected):
 @pytest.mark.parametrize(
     ('to_type', 'expected'),
     [
-        ('int8', 'CAST(`string_col` AS Int8)'),
-        ('int16', 'CAST(`string_col` AS Int16)'),
+        ('int8', 'CAST(`string_col` AS Nullable(Int8))'),
+        ('int16', 'CAST(`string_col` AS Nullable(Int16))'),
         (dt.String(nullable=False), 'CAST(`string_col` AS String)'),
-        ('timestamp', 'CAST(`string_col` AS DateTime)'),
-        ('date', 'CAST(`string_col` AS Date)'),
+        ('timestamp', 'CAST(`string_col` AS Nullable(DateTime64(6)))'),
+        ('date', 'CAST(`string_col` AS Nullable(Date))'),
+        (
+            '!map<string, int64>',
+            'CAST(`string_col` AS Map(Nullable(String), Nullable(Int64)))',
+        ),
+        (
+            '!struct<a: string, b: int64>',
+            (
+                'CAST(`string_col` AS '
+                'Tuple(a Nullable(String), b Nullable(Int64)))'
+            ),
+        ),
     ],
 )
 def test_cast_string_col(alltypes, translate, to_type, expected):
@@ -85,15 +98,13 @@ def test_timestamp_cast(alltypes, translate):
     assert isinstance(result1, ir.TimestampColumn)
     assert isinstance(result2, ir.TimestampColumn)
 
-    assert translate(result1) == 'CAST(`timestamp_col` AS DateTime)'
-    assert translate(result2) == 'CAST(`int_col` AS DateTime)'
+    assert translate(result1) == 'CAST(`timestamp_col` AS DateTime64(6))'
+    assert translate(result2) == 'CAST(`int_col` AS DateTime64(6))'
 
 
-def test_timestamp_now(con, translate):
+def test_timestamp_now(translate):
     expr = ibis.now()
-    # now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
     assert translate(expr) == 'now()'
-    # assert con.execute(expr) == now
 
 
 @pytest.mark.parametrize(
@@ -107,7 +118,7 @@
         ('minute', '2009-05-17 12:34:00'),
     ],
 )
-def test_timestamp_truncate(con, translate, unit, expected):
+def test_timestamp_truncate(con, unit, expected):
     stamp = ibis.timestamp('2009-05-17 12:34:56')
     expr = stamp.truncate(unit)
     assert con.execute(expr) == pd.Timestamp(expected)
@@ -268,7 +279,7 @@
 
 
 # TODO: clickhouse-driver escaping bug
-def test_re_replace(con, translate):
+def test_re_replace(con):
     expr1 = L('Hello, World!').re_replace('.', '\\\\0\\\\0')
     expr2 = L('Hello, World!').re_replace('^', 'here: ')
 
@@ -280,7 +291,7 @@
     ('value', 'expected'),
     [(L('a'), 0), (L('b'), 1), (L('d'), -1)], # TODO: what's the expected?
 )
-def test_find_in_set(con, value, expected, translate):
+def test_find_in_set(con, value, expected):
     vals = list('abc')
     expr = value.find_in_set(vals)
     assert con.execute(expr) == expected
@@ -312,12 +323,12 @@ def test_string_column_find_in_set(con, alltypes, translate):
         ),
     ],
 )
-def test_parse_url(con, translate, url, extract, expected):
+def test_parse_url(con, url, extract, expected):
     expr = url.parse_url(extract)
     assert con.execute(expr) == expected
 
 
-def test_parse_url_query_parameter(con, translate):
+def test_parse_url_query_parameter(con):
     url = L('https://www.youtube.com/watch?v=kEuEcWfewf8&t=10')
     expr = url.parse_url('QUERY', 't')
     assert con.execute(expr) == '10'
@@ -427,7 +438,7 @@ def test_translate_math_functions(con, alltypes, translate, call, expected):
         ),
     ],
 )
-def test_math_functions(con, expr, expected, translate):
+def test_math_functions(con, expr, expected):
     assert con.execute(expr) == expected
 
 
@@ -476,7 +487,7 @@ def test_regexp(con, expr, expected):
         # (L('abcd').re_extract('abcd', 3), None),
     ],
 )
-def test_regexp_extract(con, expr, expected, translate):
+def test_regexp_extract(con, expr, expected):
     assert con.execute(expr) == expected
 
 
@@ -496,14 +507,14 @@ def test_column_regexp_replace(con, alltypes, translate):
     assert len(con.execute(expr))
 
 
-def test_numeric_builtins_work(con, alltypes, df, translate):
+def test_numeric_builtins_work(alltypes, df):
     expr = alltypes.double_col
     result = expr.execute()
     expected = df.double_col.fillna(0)
     tm.assert_series_equal(result, expected)
 
 
-def test_null_column(alltypes, translate):
+def test_null_column(alltypes):
     t = alltypes
     nrows = t.count().execute()
     expr = t.mutate(na_column=ibis.NA).na_column
diff --git a/ibis/backends/clickhouse/tests/test_operators.py b/ibis/backends/clickhouse/tests/test_operators.py
index 213452082eae..aa40666fa26e 100644
--- a/ibis/backends/clickhouse/tests/test_operators.py
+++ b/ibis/backends/clickhouse/tests/test_operators.py
@@ -312,10 +312,10 @@ def test_array_index(con, arr, ids):
     ],
 )
 def test_array_concat(con, arrays):
-    expr = L([]).cast(dt.Array(dt.int8))
+    expr = L([]).cast("!array<int8>")
     expected = sum(arrays, [])
     for arr in arrays:
-        expr += L(arr)
+        expr += L(arr, type="!array<int8>")
 
     assert con.execute(expr) == expected
 
diff --git a/ibis/backends/clickhouse/tests/test_select.py b/ibis/backends/clickhouse/tests/test_select.py
index cfa69c0c6953..9e13d44a337f 100644
--- a/ibis/backends/clickhouse/tests/test_select.py
+++ b/ibis/backends/clickhouse/tests/test_select.py
@@ -193,7 +193,7 @@ def test_complex_array_expr_projection(db, alltypes):
     query = ibis.clickhouse.compile(expr2)
     name = expr2.get_name()
 
-    expected = f"""SELECT CAST(`string_col` AS Float64) AS `{name}`
+    expected = f"""SELECT CAST(`string_col` AS Nullable(Float64)) AS `{name}`
 FROM (
   SELECT `string_col`, count(*) AS `count`
   FROM {db.name}.`functional_alltypes`
diff --git a/ibis/backends/clickhouse/tests/test_types.py b/ibis/backends/clickhouse/tests/test_types.py
index d2f63fee600e..7b1785f551b3 100644
--- a/ibis/backends/clickhouse/tests/test_types.py
+++ b/ibis/backends/clickhouse/tests/test_types.py
@@ -1,8 +1,7 @@
 import pytest
-from pkg_resources import parse_version
 
 import ibis.expr.datatypes as dt
-from ibis.backends.clickhouse.client import ClickhouseDataType
+from ibis.backends.clickhouse.datatypes import parse
 
 pytest.importorskip("clickhouse_driver")
 
@@ -19,48 +18,128 @@ def test_column_types(alltypes):
 
 
 def test_columns_types_with_additional_argument(con):
-    sql_types = 
["toFixedString('foo', 8) AS fixedstring_col"] - if parse_version(con.version).base_version >= '1.1.54337': - sql_types.append( - "toDateTime('2018-07-02 00:00:00', 'UTC') AS datetime_col" - ) - sql = 'SELECT {}'.format(', '.join(sql_types)) - df = con.sql(sql).execute() + sql_types = [ + "toFixedString('foo', 8) AS fixedstring_col", + "toDateTime('2018-07-02 00:00:00', 'UTC') AS datetime_col", + ] + df = con.sql(f"SELECT {', '.join(sql_types)}").execute() assert df.fixedstring_col.dtype.name == 'object' - if parse_version(con.version).base_version >= '1.1.54337': - assert df.datetime_col.dtype.name == 'datetime64[ns]' + assert df.datetime_col.dtype.name == 'datetime64[ns, UTC]' @pytest.mark.parametrize( ('ch_type', 'ibis_type'), [ - ('Array(Int8)', dt.Array(dt.Int8(nullable=False))), - ('Array(Int16)', dt.Array(dt.Int16(nullable=False))), - ('Array(Int32)', dt.Array(dt.Int32(nullable=False))), - ('Array(Int64)', dt.Array(dt.Int64(nullable=False))), - ('Array(UInt8)', dt.Array(dt.UInt8(nullable=False))), - ('Array(UInt16)', dt.Array(dt.UInt16(nullable=False))), - ('Array(UInt32)', dt.Array(dt.UInt32(nullable=False))), - ('Array(UInt64)', dt.Array(dt.UInt64(nullable=False))), - ('Array(Float32)', dt.Array(dt.Float32(nullable=False))), - ('Array(Float64)', dt.Array(dt.Float64(nullable=False))), - ('Array(String)', dt.Array(dt.String(nullable=False))), - ('Array(FixedString(32))', dt.Array(dt.String(nullable=False))), - ('Array(Date)', dt.Array(dt.Date(nullable=False))), - ('Array(DateTime)', dt.Array(dt.Timestamp(nullable=False))), - ('Array(DateTime64)', dt.Array(dt.Timestamp(nullable=False))), - ('Array(Nothing)', dt.Array(dt.Null(nullable=False))), - ('Array(Null)', dt.Array(dt.Null(nullable=False))), - ('Array(Array(Int8))', dt.Array(dt.Array(dt.Int8(nullable=False)))), + ('Array(Int8)', dt.Array(dt.Int8(nullable=False), nullable=False)), + ('Array(Int16)', dt.Array(dt.Int16(nullable=False), nullable=False)), + ('Array(Int32)', dt.Array(dt.Int32(nullable=False), nullable=False)), + ('Array(Int64)', dt.Array(dt.Int64(nullable=False), nullable=False)), + ('Array(UInt8)', dt.Array(dt.UInt8(nullable=False), nullable=False)), + ('Array(UInt16)', dt.Array(dt.UInt16(nullable=False), nullable=False)), + ('Array(UInt32)', dt.Array(dt.UInt32(nullable=False), nullable=False)), + ('Array(UInt64)', dt.Array(dt.UInt64(nullable=False), nullable=False)), + ( + 'Array(Float32)', + dt.Array(dt.Float32(nullable=False), nullable=False), + ), + ( + 'Array(Float64)', + dt.Array(dt.Float64(nullable=False), nullable=False), + ), + ('Array(String)', dt.Array(dt.String(nullable=False), nullable=False)), + ( + 'Array(FixedString(32))', + dt.Array(dt.String(nullable=False), nullable=False), + ), + ('Array(Date)', dt.Array(dt.Date(nullable=False), nullable=False)), + ( + 'Array(DateTime)', + dt.Array(dt.Timestamp(nullable=False), nullable=False), + ), + ( + 'Array(DateTime64)', + dt.Array(dt.Timestamp(nullable=False), nullable=False), + ), + ('Array(Nothing)', dt.Array(dt.null, nullable=False)), + ('Array(Null)', dt.Array(dt.null, nullable=False)), + ( + 'Array(Array(Int8))', + dt.Array( + dt.Array(dt.Int8(nullable=False), nullable=False), + nullable=False, + ), + ), ( 'Array(Array(Array(Int8)))', - dt.Array(dt.Array(dt.Array(dt.Int8(nullable=False)))), + dt.Array( + dt.Array( + dt.Array(dt.Int8(nullable=False), nullable=False), + nullable=False, + ), + nullable=False, + ), ), ( 'Array(Array(Array(Array(Int8))))', - dt.Array(dt.Array(dt.Array(dt.Array(dt.Int8(nullable=False))))), + dt.Array( + dt.Array( + dt.Array( 
+ dt.Array(dt.Int8(nullable=False), nullable=False), + nullable=False, + ), + nullable=False, + ), + nullable=False, + ), + ), + ( + "Map(Nullable(String), Nullable(UInt64))", + dt.Map(dt.string, dt.uint64, nullable=False), + ), + ("Decimal(10, 3)", dt.Decimal(10, 3, nullable=False)), + ( + "Tuple(a String, b Array(Nullable(Float64)))", + dt.Struct.from_dict( + dict( + a=dt.String(nullable=False), + b=dt.Array(dt.float64, nullable=False), + ), + nullable=False, + ), + ), + ( + "Tuple(String, Array(Nullable(Float64)))", + dt.Struct.from_dict( + dict( + f0=dt.String(nullable=False), + f1=dt.Array(dt.float64, nullable=False), + ), + nullable=False, + ), + ), + ( + "Tuple(a String, Array(Nullable(Float64)))", + dt.Struct.from_dict( + dict( + a=dt.String(nullable=False), + f1=dt.Array(dt.float64, nullable=False), + ), + nullable=False, + ), + ), + ( + "Nested(a String, b Array(Nullable(Float64)))", + dt.Struct.from_dict( + dict( + a=dt.Array(dt.String(nullable=False), nullable=False), + b=dt.Array( + dt.Array(dt.float64, nullable=False), nullable=False + ), + ), + nullable=False, + ), ), ], ) -def test_array_type(ch_type, ibis_type): - assert ClickhouseDataType(ch_type).to_ibis() == ibis_type +def test_parse_type(ch_type, ibis_type): + assert parse(ch_type) == ibis_type diff --git a/ibis/backends/duckdb/__init__.py b/ibis/backends/duckdb/__init__.py index 14155a6f5b74..fe48bf6d2ba9 100644 --- a/ibis/backends/duckdb/__init__.py +++ b/ibis/backends/duckdb/__init__.py @@ -16,7 +16,7 @@ import ibis.expr.schema as sch from ibis.backends.base.sql.alchemy import BaseAlchemyBackend from ibis.backends.duckdb.compiler import DuckDBSQLCompiler -from ibis.backends.duckdb.datatypes import parse_type +from ibis.backends.duckdb.datatypes import parse class Backend(BaseAlchemyBackend): @@ -73,10 +73,7 @@ def _get_schema_using_query(self, query: str) -> sch.Schema: with self.con.connect() as con: rel = con.connection.c.query(query) return sch.Schema.from_dict( - { - name: parse_type(type) - for name, type in zip(rel.columns, rel.types) - } + {name: parse(type) for name, type in zip(rel.columns, rel.types)} ) def _get_sqla_table( @@ -113,7 +110,7 @@ def _get_sqla_table( if colname in nulltype_cols: column = sa.Column( colname, - to_sqla_type(parse_type(type)), + to_sqla_type(parse(type)), nullable=null == "YES", ) # replace null types discovered by sqlite with non null diff --git a/ibis/backends/duckdb/datatypes.py b/ibis/backends/duckdb/datatypes.py index d3ae0c8d0a88..1f04a35801ec 100644 --- a/ibis/backends/duckdb/datatypes.py +++ b/ibis/backends/duckdb/datatypes.py @@ -1,14 +1,26 @@ from __future__ import annotations -import re from typing import TYPE_CHECKING import parsy as p +from ibis import util + if TYPE_CHECKING: from ibis.expr.datatypes import DataType from ibis.expr.datatypes import ( + COLON, + COMMA, + FIELD, + LANGLE, + LBRACKET, + LPAREN, + PRECISION, + RANGLE, + RBRACKET, + RPAREN, + SCALE, Array, Decimal, Interval, @@ -24,6 +36,8 @@ int16, int32, int64, + spaceless, + spaceless_string, string, time, uint8, @@ -33,34 +47,9 @@ uuid, ) -_SPACES = p.regex(r'\s*', re.MULTILINE) - - -def spaceless(parser): - return _SPACES.then(parser).skip(_SPACES) - - -def spaceless_string(*strings: str): - return spaceless( - p.alt(*[p.string(s, transform=str.lower) for s in strings]) - ) - - -def parse_type(text: str, default_decimal_parameters=(18, 3)) -> DataType: - precision = scale = p.digit.at_least(1).concat().map(int) - - lparen = spaceless_string("(") - rparen = spaceless_string(")") - - lbracket = 
spaceless_string("[") - rbracket = spaceless_string("]") - - langle = spaceless_string("<") - rangle = spaceless_string(">") - - comma = spaceless_string(",") - colon = spaceless_string(":") +def parse(text: str, default_decimal_parameters=(18, 3)) -> DataType: + """Parse a DuckDB type into an ibis data type.""" primitive = ( spaceless_string("interval").result(Interval()) | spaceless_string("bigint", "int8", "long").result(int64) @@ -99,21 +88,21 @@ def parse_type(text: str, default_decimal_parameters=(18, 3)) -> DataType: def decimal(): yield spaceless_string("decimal", "numeric") prec_scale = ( - yield lparen.then( - p.seq(precision.skip(comma), scale).combine( + yield LPAREN.then( + p.seq(PRECISION.skip(COMMA), SCALE).combine( lambda prec, scale: (prec, scale) ) ) - .skip(rparen) + .skip(RPAREN) .optional() ) or default_decimal_parameters return Decimal(*prec_scale) @p.generate def angle_type(): - yield langle + yield LANGLE value_type = yield ty - yield rangle + yield RANGLE return value_type @p.generate @@ -125,34 +114,42 @@ def list_array(): @p.generate def pg_array(): value_type = yield non_pg_array_type - yield lbracket - yield rbracket + yield LBRACKET + yield RBRACKET return Array(value_type) @p.generate def map(): yield spaceless_string("map") - yield langle + yield LANGLE key_type = yield primitive - yield comma + yield COMMA value_type = yield ty - yield rangle + yield RANGLE return Map(key_type, value_type) - field = spaceless(p.regex("[a-zA-Z_][a-zA-Z_0-9]*")) + field = spaceless(FIELD) @p.generate def struct(): yield spaceless_string("struct") - yield langle + yield LANGLE field_names_types = yield ( - p.seq(field.skip(colon), ty) + p.seq(field.skip(COLON), ty) .combine(lambda field, ty: (field, ty)) - .sep_by(comma) + .sep_by(COMMA) ) - yield rangle + yield RANGLE return Struct.from_tuples(field_names_types) non_pg_array_type = primitive | decimal | list_array | map | struct ty = pg_array | non_pg_array_type return ty.parse(text) + + +@util.deprecated( + instead=f"use {parse.__module__}.{parse.__name__}", + version="4.0", +) +def parse_type(*args, **kwargs): + return parse(*args, **kwargs) diff --git a/ibis/backends/duckdb/tests/test_datatypes.py b/ibis/backends/duckdb/tests/test_datatypes.py index 21a87fef609d..e08597ea0a46 100644 --- a/ibis/backends/duckdb/tests/test_datatypes.py +++ b/ibis/backends/duckdb/tests/test_datatypes.py @@ -2,7 +2,7 @@ from pytest import param import ibis.expr.datatypes as dt -from ibis.backends.duckdb.datatypes import parse_type +from ibis.backends.duckdb.datatypes import parse, parse_type EXPECTED_SCHEMA = dict( a=dt.int64, @@ -116,5 +116,10 @@ ], ) def test_parser(column, type): - ty = parse_type(type) + ty = parse(type) assert ty == EXPECTED_SCHEMA[column] + + +def test_parse_type_warns(): + with pytest.warns(FutureWarning): + parse_type("BIGINT") diff --git a/ibis/backends/postgres/datatypes.py b/ibis/backends/postgres/datatypes.py index dc02cf516e7c..abb42edb5d39 100644 --- a/ibis/backends/postgres/datatypes.py +++ b/ibis/backends/postgres/datatypes.py @@ -8,7 +8,7 @@ def _get_type(typestr: str) -> dt.DataType: try: return _type_mapping[typestr] except KeyError: - return ddb.parse_type(typestr) + return ddb.parse(typestr) _type_mapping = { diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py index a8b27e34ac9c..83be8b6ef7cd 100644 --- a/ibis/backends/tests/test_array.py +++ b/ibis/backends/tests/test_array.py @@ -80,11 +80,6 @@ def test_np_array_literal(con): builtin_array = toolz.compose( - # the type 
parser needs additional work for this to work - pytest.mark.broken( - "clickhouse", - reason="nullable types can't yet be parsed", - ), # these will almost certainly never be supported pytest.mark.never( ["mysql", "sqlite"], @@ -112,4 +107,6 @@ def test_array_discovery(con): scalar_column=dt.float64, ) ) - assert t.schema() == expected + assert t.columns == list(expected.names) + for ty, expected_ty in zip(t.schema().types, expected.types): + assert isinstance(ty, type(expected_ty)) diff --git a/ibis/expr/datatypes/core.py b/ibis/expr/datatypes/core.py index a586dafb5498..5165602f061d 100644 --- a/ibis/expr/datatypes/core.py +++ b/ibis/expr/datatypes/core.py @@ -26,6 +26,7 @@ from multipledispatch import Dispatcher from public import public +from ibis import util from ibis.common.exceptions import IbisTypeError, InputTypeError from ibis.common.grounds import Annotable, Comparable, Singleton from ibis.common.validators import ( @@ -61,7 +62,7 @@ def default(value, **kwargs) -> DataType: @dtype.register(str) def from_string(value: str) -> DataType: try: - return parse_type(value) + return parse(value) except SyntaxError: raise IbisTypeError(f'{value!r} cannot be parsed as a datatype') @@ -936,20 +937,62 @@ class INET(String): _STRING_REGEX = """('[^\n'\\\\]*(?:\\\\.[^\n'\\\\]*)*'|"[^\n"\\\\"]*(?:\\\\.[^\n"\\\\]*)*")""" # noqa: E501 -_SPACES = p.regex(r'\s*', re.MULTILINE) +SPACES = p.regex(r'\s*', re.MULTILINE) +@public def spaceless(parser): - return _SPACES.then(parser).skip(_SPACES) + return SPACES.then(parser).skip(SPACES) + + +@public +def spaceless_string(*strings: str): + return spaceless( + p.alt(*(p.string(string, transform=str.lower) for string in strings)) + ) + +RAW_NUMBER = p.digit.at_least(1).concat() +PRECISION = SCALE = NUMBER = RAW_NUMBER.map(int) -def spaceless_string(s: str): - return spaceless(p.string(s, transform=str.lower)) +LPAREN = spaceless_string("(") +RPAREN = spaceless_string(")") + +LBRACKET = spaceless_string("[") +RBRACKET = spaceless_string("]") + +LANGLE = spaceless_string("<") +RANGLE = spaceless_string(">") + +COMMA = spaceless_string(",") +COLON = spaceless_string(":") +SEMICOLON = spaceless_string(";") + +RAW_STRING = p.regex(_STRING_REGEX).map(ast.literal_eval) +FIELD = p.regex("[a-zA-Z_][a-zA-Z_0-9]*") + +public( + COLON=COLON, + COMMA=COMMA, + FIELD=FIELD, + LANGLE=LANGLE, + LBRACKET=LBRACKET, + RBRACKET=RBRACKET, + LPAREN=LPAREN, + NUMBER=NUMBER, + PRECISION=PRECISION, + RANGLE=RANGLE, + RAW_STRING=RAW_STRING, + RPAREN=RPAREN, + SCALE=SCALE, + SEMICOLON=SEMICOLON, + SPACES=SPACES, +) @public @functools.lru_cache(maxsize=100) -def parse_type(text: str) -> DataType: +def parse(text: str) -> DataType: """Parse a type from a [`str`][str] `text`. 
The default `maxsize` parameter for caching is chosen to cache the most
@@ -967,50 +1010,38 @@
     >>> import ibis
     >>> import ibis.expr.datatypes as dt
-    >>> dt.parse_type("array<int64>")
+    >>> dt.parse("array<int64>")
     Array(value_type=Int64(nullable=True), nullable=True)
 
     You can avoid parsing altogether by constructing objects directly
 
     >>> import ibis
     >>> import ibis.expr.datatypes as dt
-    >>> ty = dt.parse_type("array<int64>")
+    >>> ty = dt.parse("array<int64>")
     >>> ty == dt.Array(dt.int64)
     True
     """
-    precision = scale = srid = p.digit.at_least(1).concat().map(int)
-
-    lparen = spaceless_string("(")
-    rparen = spaceless_string(")")
-
-    langle = spaceless_string("<")
-    rangle = spaceless_string(">")
-
-    comma = spaceless_string(",")
-    colon = spaceless_string(":")
-    semicolon = spaceless_string(";")
-
-    raw_string = p.regex(_STRING_REGEX).map(ast.literal_eval)
+    srid = NUMBER
 
     geotype = spaceless_string("geography") | spaceless_string("geometry")
 
     @p.generate
     def srid_geotype():
-        yield semicolon
+        yield SEMICOLON
         sr = yield srid
-        yield colon
+        yield COLON
         gt = yield geotype
         return (gt, sr)
 
     @p.generate
     def geotype_part():
-        yield colon
+        yield COLON
         gt = yield geotype
         return (gt, None)
 
     @p.generate
     def srid_part():
-        yield semicolon
+        yield SEMICOLON
         sr = yield srid
         return (None, sr)
 
@@ -1062,32 +1093,28 @@ def parser():
 
     @p.generate
     def varchar_or_char():
-        yield p.alt(
-            spaceless_string("varchar"), spaceless_string("char")
-        ).then(
-            lparen.then(p.digit.at_least(1).concat()).skip(rparen).optional()
+        yield spaceless_string("varchar", "char").then(
+            LPAREN.then(RAW_NUMBER).skip(RPAREN).optional()
         )
         return String()
 
     @p.generate
     def decimal():
         yield spaceless_string("decimal")
-        prec, sc = (
-            yield lparen.then(
-                p.seq(precision.skip(comma), scale).combine(
-                    lambda prec, scale: (prec, scale)
-                )
+        precision, scale = (
+            yield LPAREN.then(
+                p.seq(spaceless(PRECISION).skip(COMMA), spaceless(SCALE))
             )
-            .skip(rparen)
+            .skip(RPAREN)
             .optional()
         ) or (None, None)
-        return Decimal(precision=prec, scale=sc)
+        return Decimal(precision=precision, scale=scale)
 
     @p.generate
     def parened_string():
-        yield lparen
-        s = yield raw_string
-        yield rparen
+        yield LPAREN
+        s = yield RAW_STRING
+        yield RPAREN
         return s
 
     @p.generate
     def timestamp():
@@ -1098,18 +1125,19 @@
 
     @p.generate
     def angle_type():
-        yield langle
+        yield LANGLE
         value_type = yield ty
-        yield rangle
+        yield RANGLE
         return value_type
 
     @p.generate
     def interval():
         yield spaceless_string("interval")
         value_type = yield angle_type.optional()
-        un = yield parened_string.optional()
+        unit = yield parened_string.optional()
         return Interval(
-            value_type=value_type, unit=un if un is not None else 's'
+            value_type=value_type,
+            unit=unit if unit is not None else "s",
         )
 
     @p.generate
@@ -1127,29 +1155,36 @@ def set():
 
     @p.generate
     def map():
         yield spaceless_string("map")
-        yield langle
+        yield LANGLE
         key_type = yield primitive
-        yield comma
+        yield COMMA
         value_type = yield ty
-        yield rangle
+        yield RANGLE
         return Map(key_type, value_type)
 
-    field = spaceless(p.regex("[a-zA-Z_][a-zA-Z_0-9]*"))
+    spaceless_field = spaceless(FIELD)
 
     @p.generate
     def struct():
         yield spaceless_string("struct")
-        yield langle
+        yield LANGLE
         field_names_types = yield (
-            p.seq(field.skip(colon), ty)
+            p.seq(spaceless_field.skip(COLON), ty)
             .combine(lambda field, ty: (field, ty))
-            .sep_by(comma)
+            .sep_by(COMMA)
         )
-        yield rangle
+        yield RANGLE
         return Struct.from_tuples(field_names_types)
 
+    @p.generate
+    def nullable():
+        yield spaceless_string("!")
+        parsed_ty = yield ty
+        return 
parsed_ty(nullable=False) + ty = ( - timestamp + nullable + | timestamp | primitive | decimal | varchar_or_char @@ -1172,6 +1207,15 @@ def struct(): return ty.parse(text) +@util.deprecated( + instead=f"use {parse.__module__}.{parse.__name__}", + version="4.0", +) +@public +def parse_type(*args, **kwargs): + return parse(*args, **kwargs) + + def _get_timedelta_units( timedelta: datetime.timedelta | pd.Timedelta, ) -> list[str]: diff --git a/ibis/tests/benchmarks/test_benchmarks.py b/ibis/tests/benchmarks/test_benchmarks.py index 78c6523df1be..60a88ba4981b 100644 --- a/ibis/tests/benchmarks/test_benchmarks.py +++ b/ibis/tests/benchmarks/test_benchmarks.py @@ -544,8 +544,8 @@ def test_complex_datatype_parse(benchmark): ) ) ) - assert dt.parse_type(type_str) == expected - benchmark(dt.parse_type, type_str) + assert dt.parse(type_str) == expected + benchmark(dt.parse, type_str) @pytest.mark.benchmark(group="datatype") diff --git a/ibis/tests/expr/test_datatypes.py b/ibis/tests/expr/test_datatypes.py index e9d1d62934e5..6b5ea692cec3 100644 --- a/ibis/tests/expr/test_datatypes.py +++ b/ibis/tests/expr/test_datatypes.py @@ -502,3 +502,8 @@ class MyStruct(dt.Struct): dtype = MyStruct.from_tuples([('a', 'int64')]) assert isinstance(dtype, MyStruct) + + +def test_parse_type_warns(): + with pytest.warns(FutureWarning): + dt.parse_type("int64") diff --git a/poetry.lock b/poetry.lock index 5ce59352d5d3..46802d895252 100644 --- a/poetry.lock +++ b/poetry.lock @@ -251,6 +251,8 @@ optional = true python-versions = ">=3.4.*, <4" [package.dependencies] +numpy = {version = ">=1.12.0", optional = true, markers = "extra == \"numpy\""} +pandas = {version = ">=0.24.0", optional = true, markers = "extra == \"numpy\""} pytz = "*" tzlocal = "*" @@ -2323,7 +2325,7 @@ visualization = ["graphviz"] [metadata] lock-version = "1.1" python-versions = ">=3.8,<3.11" -content-hash = "141e837de174d1ddbbc1a7f8d519c0b0320323e4bb38f08de5f80740372ec4e3" +content-hash = "6cdfde1ce4daee23a5c5e195b89ba5a3ef5d436e700899573dbe184344fc17cb" [metadata.files] absolufy-imports = [ diff --git a/pyproject.toml b/pyproject.toml index 12782ca841fa..eb9d87b8f253 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,9 @@ regex = ">=2021.7.6" tabulate = ">=0.8.9,<1" toolz = ">=0.11,<0.12" clickhouse-cityhash = { version = ">=1.0.2,<2", optional = true } -clickhouse-driver = { version = ">=0.1,<0.3", optional = true } +clickhouse-driver = { version = ">=0.1,<0.3", optional = true, extras = [ + "numpy" +] } dask = { version = ">=2021.10.0", optional = true, extras = [ "array", "dataframe" diff --git a/setup.py b/setup.py index 9a2ebf828082..e37289964744 100644 --- a/setup.py +++ b/setup.py @@ -68,7 +68,7 @@ ':python_version < "3.10"': ['importlib-metadata>=4,<5'], 'all': [ 'clickhouse-cityhash>=1.0.2,<2', - 'clickhouse-driver>=0.1,<0.3', + 'clickhouse-driver[numpy]>=0.1,<0.3', 'dask[array,dataframe]>=2021.10.0', 'datafusion>=0.4,<0.6', 'duckdb>=0.3.2,<0.4.0', @@ -89,7 +89,7 @@ ], 'clickhouse': [ 'clickhouse-cityhash>=1.0.2,<2', - 'clickhouse-driver>=0.1,<0.3', + 'clickhouse-driver[numpy]>=0.1,<0.3', 'lz4>=3.1.10,<5', ], 'dask': ['dask[array,dataframe]>=2021.10.0', 'pyarrow>=1,<8'],
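
A sketch of the new `!` shorthand that the `nullable` combinator added to ibis/expr/datatypes/core.py enables (hand-traced against the parser above, assuming the standard `DataType` reprs shown in the docstring examples):

    >>> import ibis.expr.datatypes as dt
    >>> dt.parse("int64")    # types parse as nullable by default
    Int64(nullable=True)
    >>> dt.parse("!int64")   # a leading ! yields the non-nullable variant
    Int64(nullable=False)
    >>> dt.parse("!array<!string>")  # the prefix applies per type, so it nests
    Array(value_type=String(nullable=False), nullable=False)

This is the syntax the ClickHouse tests above rely on, e.g. "!array<int8>" and "!struct<a: string, b: int64>".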