diff --git a/ibis/backends/bigquery/__init__.py b/ibis/backends/bigquery/__init__.py index 1cb7f8ffbcb3..0675fb3bf498 100644 --- a/ibis/backends/bigquery/__init__.py +++ b/ibis/backends/bigquery/__init__.py @@ -12,6 +12,7 @@ import google.cloud.bigquery_storage_v1 as bqstorage import pandas as pd import pydata_google_auth +import sqlglot as sg from pydata_google_auth import cache import ibis @@ -74,6 +75,17 @@ def _create_client_info_gapic(application_name): return ClientInfo(user_agent=_create_user_agent(application_name)) +def _anonymous_unnest_to_explode(node: sg.exp.Expression): + """Convert `ANONYMOUS` `unnest` function calls to `EXPLODE` calls. + + This allows us to generate DuckDB-like `UNNEST` calls and let sqlglot do + the work of transforming those into the correct BigQuery SQL. + """ + if isinstance(node, sg.exp.Anonymous) and node.this.lower() == "unnest": + return sg.exp.Explode(this=node.expressions[0]) + return node + + class Backend(BaseSQLBackend, CanCreateSchema, CanListDatabases): name = "bigquery" compiler = BigQueryCompiler @@ -313,6 +325,42 @@ def _execute(self, stmt, results=True, query_parameters=None): query.result() # blocks until finished return BigQueryCursor(query) + def compile( + self, + expr: ir.Expr, + limit: str | None = None, + params: Mapping[ir.Expr, Any] | None = None, + **_, + ) -> Any: + """Compile an Ibis expression. + + Parameters + ---------- + expr + Ibis expression + limit + For expressions yielding result sets; retrieve at most this number + of values/rows. Overrides any limit already set on the expression. + params + Named unbound parameters + + Returns + ------- + Any + The output of compilation. The type of this value depends on the + backend. + """ + + self._define_udf_translation_rules(expr) + sql = self.compiler.to_ast_ensure_limit(expr, limit, params=params).compile() + + return ";\n\n".join( + query.transform(_anonymous_unnest_to_explode).sql( + dialect="bigquery", pretty=True + ) + for query in sg.parse(sql, read="bigquery") + ) + def raw_sql(self, query: str, results=False, params=None): query_parameters = [ bigquery_param( @@ -378,8 +426,7 @@ def execute(self, expr, params=None, limit="default", **kwargs): # TODO: upstream needs to pass params to raw_sql, I think. kwargs.pop("timecontext", None) - query_ast = self.compiler.to_ast_ensure_limit(expr, limit, params=params) - sql = query_ast.compile() + sql = self.compile(expr, limit=limit, params=params, **kwargs) self._log(sql) cursor = self.raw_sql(sql, params=params, **kwargs) diff --git a/ibis/backends/bigquery/registry.py b/ibis/backends/bigquery/registry.py index efa398529b98..92cfb7fcccc2 100644 --- a/ibis/backends/bigquery/registry.py +++ b/ibis/backends/bigquery/registry.py @@ -903,6 +903,7 @@ def _count_distinct_star(t, op): ops.TableColumn: table_column, ops.CountDistinctStar: _count_distinct_star, ops.Argument: lambda _, op: op.name, + ops.Unnest: unary("UNNEST"), } _invalid_operations = { diff --git a/ibis/backends/bigquery/tests/unit/snapshots/test_compiler/test_unnest/out_one_unnest.sql b/ibis/backends/bigquery/tests/unit/snapshots/test_compiler/test_unnest/out_one_unnest.sql new file mode 100644 index 000000000000..1556f98ec2a7 --- /dev/null +++ b/ibis/backends/bigquery/tests/unit/snapshots/test_compiler/test_unnest/out_one_unnest.sql @@ -0,0 +1,15 @@ +SELECT + t0.`rowindex`, + IF(pos = pos_2, repeated_struct_col, NULL) AS repeated_struct_col +FROM array_test AS t0, UNNEST(GENERATE_ARRAY(0, GREATEST(ARRAY_LENGTH(t0.`repeated_struct_col`)) - 1)) AS pos +CROSS JOIN UNNEST(t0.`repeated_struct_col`) AS repeated_struct_col WITH OFFSET AS pos_2 +WHERE + pos = pos_2 + OR ( + pos > ( + ARRAY_LENGTH(t0.`repeated_struct_col`) - 1 + ) + AND pos_2 = ( + ARRAY_LENGTH(t0.`repeated_struct_col`) - 1 + ) + ) \ No newline at end of file diff --git a/ibis/backends/bigquery/tests/unit/snapshots/test_compiler/test_unnest/out_two_unnests.sql b/ibis/backends/bigquery/tests/unit/snapshots/test_compiler/test_unnest/out_two_unnests.sql new file mode 100644 index 000000000000..9a5ce2dcca03 --- /dev/null +++ b/ibis/backends/bigquery/tests/unit/snapshots/test_compiler/test_unnest/out_two_unnests.sql @@ -0,0 +1,30 @@ +SELECT + IF(pos = pos_2, level_two, NULL) AS level_two +FROM ( + SELECT + t1.`rowindex`, + IF(pos = pos_2, level_one, NULL).`nested_struct_col` AS level_one + FROM array_test AS t1, UNNEST(GENERATE_ARRAY(0, GREATEST(ARRAY_LENGTH(t1.`repeated_struct_col`)) - 1)) AS pos + CROSS JOIN UNNEST(t1.`repeated_struct_col`) AS level_one WITH OFFSET AS pos_2 + WHERE + pos = pos_2 + OR ( + pos > ( + ARRAY_LENGTH(t1.`repeated_struct_col`) - 1 + ) + AND pos_2 = ( + ARRAY_LENGTH(t1.`repeated_struct_col`) - 1 + ) + ) +) AS t0, UNNEST(GENERATE_ARRAY(0, GREATEST(ARRAY_LENGTH(t0.`level_one`)) - 1)) AS pos +CROSS JOIN UNNEST(t0.`level_one`) AS level_two WITH OFFSET AS pos_2 +WHERE + pos = pos_2 + OR ( + pos > ( + ARRAY_LENGTH(t0.`level_one`) - 1 + ) + AND pos_2 = ( + ARRAY_LENGTH(t0.`level_one`) - 1 + ) + ) \ No newline at end of file diff --git a/ibis/backends/bigquery/tests/unit/test_compiler.py b/ibis/backends/bigquery/tests/unit/test_compiler.py index 6e3644348490..fafeddbdaec2 100644 --- a/ibis/backends/bigquery/tests/unit/test_compiler.py +++ b/ibis/backends/bigquery/tests/unit/test_compiler.py @@ -399,10 +399,7 @@ def test_timestamp_accepts_date_literals(alltypes): expr = alltypes.mutate(param=p) params = {p: date_string} result = to_sql(expr, params=params) - expected = """\ -SELECT t\\d+\\.\\*, @param_\\d+ AS `param` -FROM functional_alltypes t\\d+""" - assert re.match(expected, result) is not None + assert re.search(r"@param_\d+ AS `param`", result) is not None @pytest.mark.parametrize("distinct", [True, False]) @@ -587,13 +584,47 @@ def test_scalar_param_scope(alltypes): t = alltypes param = ibis.param("timestamp") result = to_sql(t.mutate(param=param), params={param: "2017-01-01"}) - expected = """\ -SELECT t\\d+\\.\\*, @param_\\d+ AS `param` -FROM functional_alltypes t\\d+""" - assert re.match(expected, result) is not None + assert re.search(r"@param_\d+ AS `param`", result) is not None def test_cast_float_to_int(alltypes, snapshot): expr = alltypes.double_col.cast("int64") result = to_sql(expr) snapshot.assert_match(result, "out.sql") + + +def test_unnest(snapshot): + table = ibis.table( + dict( + rowindex="int", + repeated_struct_col=dt.Array( + dt.Struct( + dict( + nested_struct_col=dt.Array( + dt.Struct( + dict( + doubly_nested_array="array", + doubly_nested_field="string", + ) + ) + ) + ) + ) + ), + ), + name="array_test", + ) + repeated_struct_col = table.repeated_struct_col + + # Works as expected :-) + result = ibis.bigquery.compile( + table.select("rowindex", repeated_struct_col.unnest()) + ) + snapshot.assert_match(result, "out_one_unnest.sql") + + result = ibis.bigquery.compile( + table.select( + "rowindex", level_one=repeated_struct_col.unnest().nested_struct_col + ).select(level_two=lambda t: t.level_one.unnest()) + ) + snapshot.assert_match(result, "out_two_unnests.sql") diff --git a/ibis/backends/tests/test_aggregation.py b/ibis/backends/tests/test_aggregation.py index 78f08f9cd329..0f1685f56b07 100644 --- a/ibis/backends/tests/test_aggregation.py +++ b/ibis/backends/tests/test_aggregation.py @@ -1261,11 +1261,6 @@ def test_topk_op(alltypes, df): ) ], ) -@mark.broken( - ["bigquery"], - raises=GoogleBadRequest, - reason='400 Syntax error: Expected keyword JOIN but got identifier "SEMI"', -) @mark.broken( ["druid"], raises=sa.exc.ProgrammingError, diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py index 8e8ca5eb3548..fb08c122ea6c 100644 --- a/ibis/backends/tests/test_array.py +++ b/ibis/backends/tests/test_array.py @@ -2,7 +2,6 @@ import contextlib import functools -import os import numpy as np import pandas as pd @@ -11,7 +10,6 @@ import sqlalchemy as sa import sqlglot as sg import toolz -from packaging.version import parse as parse_version from pytest import param import ibis @@ -185,20 +183,6 @@ def test_array_index(con, idx): assert result == arr[idx] -duckdb_0_4_0 = pytest.mark.xfail( - ( - # nixpkgs is patched to include the fix, so we pass these tests - # inside the nix-shell or when they run under `nix build` - (not any(key.startswith("NIX_") for key in os.environ)) - and ( - parse_version(getattr(duckdb, "__version__", "0.0.0")) - == parse_version("0.4.0") - ) - ), - reason="DuckDB array support is broken in 0.4.0 without nix", -) - - builtin_array = toolz.compose( # these will almost certainly never be supported pytest.mark.never( @@ -211,16 +195,6 @@ def test_array_index(con, idx): ), # someone just needs to implement these pytest.mark.notimpl(["datafusion"], raises=Exception), - duckdb_0_4_0, -) - -unnest = toolz.compose( - builtin_array, - pytest.mark.notyet( - ["bigquery"], - reason="doesn't support unnest in SELECT position", - raises=com.OperationNotDefinedError, - ), ) @@ -354,7 +328,12 @@ def test_array_discovery_snowflake(backend): assert t.schema() == expected -@unnest +@builtin_array +@pytest.mark.notyet( + ["bigquery"], + reason="BigQuery doesn't support casting array to array", + raises=BadRequest, +) @pytest.mark.notimpl(["dask"], raises=ValueError) def test_unnest_simple(backend): array_types = backend.array_types @@ -370,7 +349,7 @@ def test_unnest_simple(backend): tm.assert_series_equal(result, expected) -@unnest +@builtin_array @pytest.mark.notimpl("dask", raises=com.OperationNotDefinedError) def test_unnest_complex(backend): array_types = backend.array_types @@ -396,7 +375,7 @@ def test_unnest_complex(backend): tm.assert_frame_equal(result, expected) -@unnest +@builtin_array @pytest.mark.never( "pyspark", reason="pyspark throws away nulls in collect_list", @@ -426,7 +405,7 @@ def test_unnest_idempotent(backend): tm.assert_frame_equal(result, expected) -@unnest +@builtin_array @pytest.mark.notimpl("dask", raises=ValueError) def test_unnest_no_nulls(backend): array_types = backend.array_types @@ -452,7 +431,7 @@ def test_unnest_no_nulls(backend): tm.assert_frame_equal(result, expected) -@unnest +@builtin_array @pytest.mark.notimpl("dask", raises=ValueError) def test_unnest_default_name(backend): array_types = backend.array_types @@ -720,7 +699,6 @@ def test_array_intersect(con): assert lhs == rhs, f"row {i:d} differs" -@unnest @builtin_array @pytest.mark.notimpl( ["clickhouse"], @@ -767,7 +745,7 @@ def test_zip(backend): assert len(x[0]) == len(s[0]) -@unnest +@builtin_array @pytest.mark.broken( ["clickhouse"], raises=sg.ParseError, diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py index 2a95cb2c34cc..35097895b639 100644 --- a/ibis/backends/tests/test_generic.py +++ b/ibis/backends/tests/test_generic.py @@ -972,11 +972,6 @@ def query(t, group_cols): @pytest.mark.notimpl(["dask", "pandas", "oracle"], raises=com.OperationNotDefinedError) @pytest.mark.notimpl(["druid"], raises=AssertionError) -@pytest.mark.notyet( - ["bigquery"], - reason="backend doesn't implement unnest", - raises=com.OperationNotDefinedError, -) @pytest.mark.notyet( ["datafusion", "impala", "mssql", "mysql", "sqlite"], reason="backend doesn't support arrays and we don't implement pivot_longer with unions yet", diff --git a/ibis/backends/tests/test_join.py b/ibis/backends/tests/test_join.py index 0df1e208ed06..18d13267b489 100644 --- a/ibis/backends/tests/test_join.py +++ b/ibis/backends/tests/test_join.py @@ -122,7 +122,7 @@ def test_mutating_join(backend, batting, awards_players, how): @pytest.mark.parametrize("how", ["semi", "anti"]) -@pytest.mark.notimpl(["bigquery", "dask", "druid"]) +@pytest.mark.notimpl(["dask", "druid"]) def test_filtering_join(backend, batting, awards_players, how): left = batting[batting.yearID == 2015] right = awards_players[awards_players.lgID == "NL"].drop("yearID", "lgID") @@ -181,7 +181,7 @@ def test_mutate_then_join_no_column_overlap(batting, awards_players): assert not expr.limit(5).execute().empty -@pytest.mark.notimpl(["bigquery", "druid"]) +@pytest.mark.notimpl(["druid"]) @pytest.mark.notyet(["dask"], reason="dask doesn't support descending order by") @pytest.mark.broken( ["polars"], diff --git a/ibis/backends/tests/test_sql.py b/ibis/backends/tests/test_sql.py index 837d46e7b414..b27419a819a3 100644 --- a/ibis/backends/tests/test_sql.py +++ b/ibis/backends/tests/test_sql.py @@ -149,14 +149,7 @@ def test_isin_bug(con, snapshot): ["sqlite", "mysql", "druid", "impala", "mssql"], reason="no unnest support upstream" ) @pytest.mark.notimpl( - ["bigquery", "oracle"], - reason="unnest not yet implemented", - raises=exc.OperationNotDefinedError, -) -@pytest.mark.xfail_version( - duckdb=["sqlglot<=11.4.5"], - raises=sg.ParseError, - reason="https://github.com/tobymao/sqlglot/pull/1379 not in the installed version of sqlglot", + ["oracle"], reason="unnest not yet implemented", raises=exc.OperationNotDefinedError ) @pytest.mark.parametrize("backend_name", _get_backends_to_test()) def test_union_aliasing(backend_name, snapshot): diff --git a/poetry.lock b/poetry.lock index c897430e82d4..448dd3ffa757 100644 --- a/poetry.lock +++ b/poetry.lock @@ -5236,17 +5236,17 @@ sqlalchemy = ">=1.0.0" [[package]] name = "sqlglot" -version = "18.0.0" +version = "18.7.0" description = "An easily customizable SQL parser and transpiler" optional = false -python-versions = "*" +python-versions = ">=3.7" files = [ - {file = "sqlglot-18.0.0-py3-none-any.whl", hash = "sha256:ed3edbde98bdfd5752ca765c788fcc9bdc7342a19cbd3058120175e93eec32f3"}, - {file = "sqlglot-18.0.0.tar.gz", hash = "sha256:7bd8eeabaf0474a20840df96657c7cf27f82a3e3784a8b3d9fb243105aae7142"}, + {file = "sqlglot-18.7.0-py3-none-any.whl", hash = "sha256:8dc40beb466324d9aa977ce5e994ffbc983998cb3e78381ea18bd418b29d9101"}, + {file = "sqlglot-18.7.0.tar.gz", hash = "sha256:8c6858d004c348758f6961058fec8f607f54324e1035fb00f8be989643851ecc"}, ] [package.extras] -dev = ["autoflake", "black", "duckdb (>=0.6)", "isort", "mypy (>=0.990)", "pandas", "pdoc", "pre-commit", "pyspark", "python-dateutil"] +dev = ["autoflake", "black", "duckdb (>=0.6)", "isort", "mypy (>=0.990)", "pandas", "pdoc", "pre-commit", "pyspark", "python-dateutil", "types-python-dateutil"] [[package]] name = "stack-data" @@ -5940,4 +5940,4 @@ visualization = ["graphviz"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "6394b3556597b3b270cb1e84c78d1c2a79c8bbcc667b9c825ed5ff2e12e83568" +content-hash = "59096eef883d4a14e45ada357c46efea88931925b75f5d04564581a6bc0b42f0" diff --git a/pyproject.toml b/pyproject.toml index 2a780f413d3c..b7247d164f83 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ pyarrow = ">=2,<13" python-dateutil = ">=2.8.2,<3" pytz = ">=2022.7" rich = ">=12.4.4,<14" -sqlglot = ">=17.15.0,<19" +sqlglot = ">=18.7.0,<19" toolz = ">=0.11,<1" typing-extensions = ">=4.3.0,<5" black = { version = ">=22.1.0,<24", optional = true } diff --git a/requirements-dev.txt b/requirements-dev.txt index bf388f3ec925..89e6eb39400b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -193,7 +193,7 @@ sortedcontainers==2.4.0 ; python_version >= "3.9" and python_version < "4.0" sphobjinv==2.3.1 ; python_version >= "3.10" and python_version < "4.0" sqlalchemy-views==0.3.2 ; python_version >= "3.9" and python_version < "4.0" sqlalchemy==1.4.49 ; python_version >= "3.9" and python_version < "4.0" -sqlglot==18.0.0 ; python_version >= "3.9" and python_version < "4.0" +sqlglot==18.7.0 ; python_version >= "3.9" and python_version < "4.0" stack-data==0.6.2 ; python_version >= "3.9" and python_version < "4.0" statsmodels==0.14.0 ; python_version >= "3.10" and python_version < "4.0" stdlib-list==0.9.0 ; python_version >= "3.9" and python_version < "4.0"