From 910b8f540a659ffb24703869d7a3a65fe4985cdf Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Fri, 30 Aug 2024 08:02:37 -0400 Subject: [PATCH] fix(clickhouse): workaround `EXCEPT` and `INTERSECT` generation in sqlglot; add tpcds query 87 (#9959) Add query 87 and work around a bug in sqlglot (fixed upstream in https://github.com/tobymao/sqlglot/pull/4007) --- ibis/backends/sql/dialects.py | 25 +++++++++---- ibis/backends/tests/tpc/ds/test_queries.py | 35 ++++++++++++++++--- .../tests/tpc/queries/duckdb/ds/87.sql | 2 +- 3 files changed, 49 insertions(+), 13 deletions(-) diff --git a/ibis/backends/sql/dialects.py b/ibis/backends/sql/dialects.py index 04f0a53e7ca2..68711d9f81f6 100644 --- a/ibis/backends/sql/dialects.py +++ b/ibis/backends/sql/dialects.py @@ -9,7 +9,6 @@ from sqlglot import transforms from sqlglot.dialects import ( TSQL, - ClickHouse, Hive, MySQL, Oracle, @@ -19,15 +18,27 @@ SQLite, Trino, ) +from sqlglot.dialects import ClickHouse as _ClickHouse from sqlglot.dialects.dialect import rename_func from sqlglot.helper import find_new_name, seq_get -ClickHouse.Generator.TRANSFORMS |= { - sge.ArraySize: rename_func("length"), - sge.ArraySort: rename_func("arraySort"), - sge.LogicalAnd: rename_func("min"), - sge.LogicalOr: rename_func("max"), -} + +class ClickHouse(_ClickHouse): + class Generator(_ClickHouse.Generator): + _ClickHouse.Generator.TRANSFORMS |= { + sge.ArraySize: rename_func("length"), + sge.ArraySort: rename_func("arraySort"), + sge.LogicalAnd: rename_func("min"), + sge.LogicalOr: rename_func("max"), + } + + def except_op(self, expression: sge.Except) -> str: + return f"EXCEPT{' DISTINCT' if expression.args.get('distinct') else ' ALL'}" + + def intersect_op(self, expression: sge.Intersect) -> str: + return ( + f"INTERSECT{' DISTINCT' if expression.args.get('distinct') else ' ALL'}" + ) class DataFusion(Postgres): diff --git a/ibis/backends/tests/tpc/ds/test_queries.py
b/ibis/backends/tests/tpc/ds/test_queries.py index 47bc16afd19f..055bc8315253 100644 --- a/ibis/backends/tests/tpc/ds/test_queries.py +++ b/ibis/backends/tests/tpc/ds/test_queries.py @@ -1912,11 +1912,6 @@ def test_37(item, inventory, date_dim, catalog_sales): @tpc_test("ds") -@pytest.mark.notyet( - ["clickhouse"], - raises=AssertionError, - reason="clickhouse returns an incorrect result for this query", -) def test_38(store_sales, catalog_sales, web_sales, date_dim, customer): dates = date_dim.filter(_.d_month_seq.between(1200, 1200 + 11)) columns = "c_last_name", "c_first_name", "d_date" @@ -4358,6 +4353,36 @@ def test_86(web_sales, date_dim, item): raise NotImplementedError() +@tpc_test("ds") +def test_87(store_sales, date_dim, customer, catalog_sales, web_sales): + def cust(sales, sold_date_sk, customer_sk): + return ( + sales.join(date_dim, [(sold_date_sk, "d_date_sk")]) + .join(customer, [(customer_sk, "c_customer_sk")]) + .filter(_.d_month_seq.between(1200, 1200 + 11)) + .select(_.c_last_name, _.c_first_name, _.d_date) + .distinct() + ) + + return ibis.difference( + cust( + store_sales, + sold_date_sk="ss_sold_date_sk", + customer_sk="ss_customer_sk", + ), + cust( + catalog_sales, + sold_date_sk="cs_sold_date_sk", + customer_sk="cs_bill_customer_sk", + ), + cust( + web_sales, + sold_date_sk="ws_sold_date_sk", + customer_sk="ws_bill_customer_sk", + ), + ).agg(num_cool=_.count()) + + @tpc_test("ds") def test_89(item, store_sales, date_dim, store): return ( diff --git a/ibis/backends/tests/tpc/queries/duckdb/ds/87.sql b/ibis/backends/tests/tpc/queries/duckdb/ds/87.sql index d08740b8714e..345f59209ffb 100644 --- a/ibis/backends/tests/tpc/queries/duckdb/ds/87.sql +++ b/ibis/backends/tests/tpc/queries/duckdb/ds/87.sql @@ -1,4 +1,4 @@ -SELECT count(*) +SELECT count(*) num_cool FROM ((SELECT DISTINCT c_last_name, c_first_name, d_date