From 2330b0c86f9b0c18da7d26d4f6e9384f58d6c022 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Wed, 31 Jul 2024 06:01:37 -0400 Subject: [PATCH] feat(datafusion): `pivot_longer` --- ibis/backends/sql/compilers/datafusion.py | 7 ++ .../test_union_aliasing/datafusion/out.sql | 90 +++++++++++++++++++ ibis/backends/tests/test_array.py | 4 +- ibis/backends/tests/test_generic.py | 2 +- ibis/backends/tests/test_sql.py | 2 +- 5 files changed, 100 insertions(+), 5 deletions(-) create mode 100644 ibis/backends/tests/snapshots/test_sql/test_union_aliasing/datafusion/out.sql diff --git a/ibis/backends/sql/compilers/datafusion.py b/ibis/backends/sql/compilers/datafusion.py index cba38a038c13..0e7ba7ae2aff 100644 --- a/ibis/backends/sql/compilers/datafusion.py +++ b/ibis/backends/sql/compilers/datafusion.py @@ -478,3 +478,10 @@ def visit_Aggregate(self, op, *, parent, groups, metrics): sel = sel.group_by(*by_names_quoted) return sel + + def visit_StructColumn(self, op, *, names, values): + args = [] + for name, value in zip(names, values): + args.append(sge.convert(name)) + args.append(value) + return self.f.named_struct(*args) diff --git a/ibis/backends/tests/snapshots/test_sql/test_union_aliasing/datafusion/out.sql b/ibis/backends/tests/snapshots/test_sql/test_union_aliasing/datafusion/out.sql new file mode 100644 index 000000000000..ddd0af6c0c02 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_union_aliasing/datafusion/out.sql @@ -0,0 +1,90 @@ +WITH "t5" AS ( + SELECT + "t4"."field_of_study", + FIRST_VALUE("t4"."diff") FILTER(WHERE + NOT "t4"."diff" IS NULL) AS "diff" + FROM ( + SELECT + "t4"."years", + "t4"."degrees", + "t4"."earliest_degrees", + "t4"."latest_degrees", + "t4"."diff", + "t4"."field_of_study" + FROM ( + SELECT + "t3"."field_of_study", + "t3"."years", + "t3"."degrees", + "t3"."earliest_degrees", + "t3"."latest_degrees", + "t3"."latest_degrees" - "t3"."earliest_degrees" AS "diff" + FROM ( + SELECT + "t2"."field_of_study", + "t2"."years", + "t2"."degrees", + FIRST_VALUE("t2"."degrees") OVER (PARTITION BY "t2"."field_of_study" ORDER BY "t2"."years" ASC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS "earliest_degrees", + LAST_VALUE("t2"."degrees") OVER (PARTITION BY "t2"."field_of_study" ORDER BY "t2"."years" ASC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS "latest_degrees" + FROM ( + SELECT + "t1"."field_of_study", + "t1"."__pivoted__"."years" AS "years", + "t1"."__pivoted__"."degrees" AS "degrees" + FROM ( + SELECT + "t0"."field_of_study", + UNNEST( + MAKE_ARRAY( + NAMED_STRUCT('years', '1970-71', 'degrees', "t0"."1970-71"), + NAMED_STRUCT('years', '1975-76', 'degrees', "t0"."1975-76"), + NAMED_STRUCT('years', '1980-81', 'degrees', "t0"."1980-81"), + NAMED_STRUCT('years', '1985-86', 'degrees', "t0"."1985-86"), + NAMED_STRUCT('years', '1990-91', 'degrees', "t0"."1990-91"), + NAMED_STRUCT('years', '1995-96', 'degrees', "t0"."1995-96"), + NAMED_STRUCT('years', '2000-01', 'degrees', "t0"."2000-01"), + NAMED_STRUCT('years', '2005-06', 'degrees', "t0"."2005-06"), + NAMED_STRUCT('years', '2010-11', 'degrees', "t0"."2010-11"), + NAMED_STRUCT('years', '2011-12', 'degrees', "t0"."2011-12"), + NAMED_STRUCT('years', '2012-13', 'degrees', "t0"."2012-13"), + NAMED_STRUCT('years', '2013-14', 'degrees', "t0"."2013-14"), + NAMED_STRUCT('years', '2014-15', 'degrees', "t0"."2014-15"), + NAMED_STRUCT('years', '2015-16', 'degrees', "t0"."2015-16"), + NAMED_STRUCT('years', '2016-17', 'degrees', "t0"."2016-17"), + NAMED_STRUCT('years', '2017-18', 'degrees', "t0"."2017-18"), + NAMED_STRUCT('years', '2018-19', 'degrees', "t0"."2018-19"), + NAMED_STRUCT('years', '2019-20', 'degrees', "t0"."2019-20") + ) + ) AS "__pivoted__" + FROM "humanities" AS "t0" + ) AS "t1" + ) AS "t2" + ) AS "t3" + ) AS "t4" + ) AS t4 + GROUP BY + "t4"."field_of_study" +) +SELECT + * +FROM ( + SELECT + * + FROM "t5" AS "t6" + ORDER BY + "t6"."diff" DESC NULLS LAST + LIMIT 10 +) AS "t9" +UNION ALL +SELECT + * +FROM ( + SELECT + * + FROM "t5" AS "t6" + WHERE + "t6"."diff" < 0 + ORDER BY + "t6"."diff" ASC + LIMIT 10 +) AS "t10" \ No newline at end of file diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py index 4a01bbd932fa..b4480da68c94 100644 --- a/ibis/backends/tests/test_array.py +++ b/ibis/backends/tests/test_array.py @@ -901,9 +901,7 @@ def test_zip_null(con, fn): @builtin_array @pytest.mark.notimpl(["postgres"], raises=PsycoPg2SyntaxError) @pytest.mark.notimpl(["risingwave"], raises=PsycoPg2ProgrammingError) -@pytest.mark.notimpl( - ["datafusion"], raises=Exception, reason="probably generating invalid SQL" -) +@pytest.mark.notimpl(["datafusion"], raises=Exception, reason="not yet supported") @pytest.mark.notimpl( ["polars"], raises=com.OperationNotDefinedError, diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py index 4f9b8e9dd1be..6f760a1dcbbd 100644 --- a/ibis/backends/tests/test_generic.py +++ b/ibis/backends/tests/test_generic.py @@ -1403,7 +1403,7 @@ def query(t, group_cols): @pytest.mark.notimpl(["oracle", "exasol"], raises=com.OperationNotDefinedError) @pytest.mark.notimpl(["druid"], raises=AssertionError) @pytest.mark.notyet( - ["datafusion", "impala", "mssql", "mysql", "sqlite"], + ["impala", "mssql", "mysql", "sqlite"], reason="backend doesn't support arrays and we don't implement pivot_longer with unions yet", raises=com.OperationNotDefinedError, ) diff --git a/ibis/backends/tests/test_sql.py b/ibis/backends/tests/test_sql.py index 4d1a3d678ce7..fcc8e50d8f9c 100644 --- a/ibis/backends/tests/test_sql.py +++ b/ibis/backends/tests/test_sql.py @@ -102,7 +102,7 @@ def test_isin_bug(con, snapshot): raises=NotImplementedError, ) @pytest.mark.notyet( - ["datafusion", "exasol", "oracle", "flink"], + ["exasol", "oracle", "flink"], reason="no unnest support", raises=exc.OperationNotDefinedError, )