fix(bigquery): ensure that generated names are used when compiling columns and allow flexible column names
cpcloud committed Jun 29, 2023
1 parent 5e4d16b commit c7044fe
Showing 46 changed files with 73 additions and 61 deletions.
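For context, here is a minimal sketch (assuming the usual ibis expression API; it is not code from this commit) of the kind of expression whose auto-generated alias depends on the relaxed naming: the alias comes out as something like ApproxMedian_double_col_ Greater_month_ 0, as the snapshot diffs further down show.

import ibis

# Hypothetical table mirroring the snapshot tests below; the schema is illustrative.
t = ibis.table({"month": "int32", "double_col": "float64"}, name="functional_alltypes")

# The auto-generated alias for this aggregation contains spaces and digits,
# so it is only a valid BigQuery column name under the flexible naming rules.
expr = t.double_col.approx_median(where=t.month > 0)
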
1 change: 1 addition & 0 deletions .envrc
@@ -2,4 +2,5 @@ use flake
 watch_file poetry.lock

 export CLOUDSDK_ACTIVE_CONFIG_NAME=ibis-gbq
+export GOOGLE_CLOUD_PROJECT="$CLOUDSDK_ACTIVE_CONFIG_NAME"
 export SQLALCHEMY_WARN_20=1
2 changes: 1 addition & 1 deletion ibis/backends/bigquery/compiler.py
@@ -63,7 +63,7 @@ def find_bigquery_udf(op):
     return lin.proceed, result


-_NAME_REGEX = re.compile(r"[_A-Za-z][A-Za-z_0-9]*")
+_NAME_REGEX = re.compile(r'[^!"$()*,./;?@[\\\]^`{}~\n]+')
 _EXACT_NAME_REGEX = re.compile(f"^{_NAME_REGEX.pattern}$")

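A short sketch of what the relaxed pattern accepts, reusing the two regexes from the hunk above; the accepted sample names are taken from the snapshot changes below, while the rejected ones are purely illustrative.

import re

_NAME_REGEX = re.compile(r'[^!"$()*,./;?@[\\\]^`{}~\n]+')
_EXACT_NAME_REGEX = re.compile(f"^{_NAME_REGEX.pattern}$")

# Spaces and single quotes are now acceptable in generated names...
assert _EXACT_NAME_REGEX.match("Mean_bool_col_ Greater_month_ 6")
assert _EXACT_NAME_REGEX.match("Hash_'test of hash'")
# ...while characters BigQuery still rejects (here '.' and '`') are not.
assert not _EXACT_NAME_REGEX.match("col.with.dots")
assert not _EXACT_NAME_REGEX.match("weird`name")
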
25 changes: 25 additions & 0 deletions ibis/backends/bigquery/registry.py
@@ -15,11 +15,13 @@
 import ibis.expr.operations as ops
 from ibis.backends.base.sql.registry import (
     fixed_arity,
+    helpers,
     literal,
     operation_registry,
     reduction,
     unary,
 )
+from ibis.backends.base.sql.registry.main import table_array_view
 from ibis.backends.bigquery.datatypes import BigQueryType
 from ibis.common.temporal import DateUnit, IntervalUnit, TimeUnit

@@ -635,6 +637,28 @@ def _interval_multiply(t, op):
     return f"INTERVAL EXTRACT({unit} from {left}) * {right} {unit}"


+def table_column(translator, op):
+    """Override column references to adjust names for BigQuery."""
+    quoted_name = translator._gen_valid_name(
+        helpers.quote_identifier(op.name, force=True)
+    )
+
+    ctx = translator.context
+
+    # If the column does not originate from the table set in the current SELECT
+    # context, we should format as a subquery
+    if translator.permit_subquery and ctx.is_foreign_expr(op.table):
+        # TODO(kszucs): avoid the expression roundtrip
+        proj_expr = op.table.to_expr().select([op.name]).to_array().op()
+        return table_array_view(translator, proj_expr)
+
+    alias = ctx.get_ref(op.table, search_parents=True)
+    if alias is not None:
+        quoted_name = f"{alias}.{quoted_name}"
+
+    return quoted_name
+
+
 OPERATION_REGISTRY = {
     **operation_registry,
     # Literal
@@ -781,6 +805,7 @@ def _interval_multiply(t, op):
     ops.ArrayStringJoin: lambda t, op: f"ARRAY_TO_STRING({t.translate(op.arg)}, {t.translate(op.sep)})",
     ops.StartsWith: fixed_arity("STARTS_WITH", 2),
     ops.EndsWith: fixed_arity("ENDS_WITH", 2),
+    ops.TableColumn: table_column,
 }

 _invalid_operations = {
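For readers unfamiliar with the registry pattern used here: each entry maps an ibis operation class to a callable taking (translator, op) and returning a SQL fragment, so the new ops.TableColumn entry routes every column reference through table_column defined above. Below is a simplified sketch of the dispatch, under the assumption that the base SQL translator resolves formatters by operation type; the real lookup lives in the base backend, not in this file.

def _translate(translator, op):
    # Simplified dispatch: find the formatter registered for this operation
    # type and delegate to it, e.g. table_column for ops.TableColumn.
    formatter = OPERATION_REGISTRY[type(op)]
    return formatter(translator, op)
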
2 changes: 1 addition & 1 deletion ibis/backends/bigquery/tests/system/test_client.py
@@ -329,7 +329,7 @@ def test_boolean_casting(alltypes):
     t = alltypes
     expr = t.group_by(k=t.string_col.nullif("1") == "9").count()
     result = expr.execute().set_index("k")
-    count = result["count"]
+    count = result.iloc[:, 0]
     assert count.at[False] == 5840
     assert count.at[True] == 730

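The switch to positional access above is presumably needed because the aggregate column no longer comes back named literally "count"; taking the first column keeps the test independent of whatever name gets generated. A tiny pandas sketch of the idea, with an illustrative column name:

import pandas as pd

# The grouped result has the key as the index and a single aggregate column
# whose name may now be backend-generated; "CountStar()" is illustrative only.
result = pd.DataFrame({"k": [False, True], "CountStar()": [5840, 730]}).set_index("k")
count = result.iloc[:, 0]  # first (and only) column, whatever it is named
assert count.at[False] == 5840
assert count.at[True] == 730
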
@@ -1,2 +1,2 @@
-SELECT APPROX_QUANTILES(if(t0.`month` > 0, t0.`double_col`, NULL), 2)[OFFSET(1)] AS `ApproxMedian_double_col_Greater_month`
+SELECT APPROX_QUANTILES(if(t0.`month` > 0, t0.`double_col`, NULL), 2)[OFFSET(1)] AS `ApproxMedian_double_col_ Greater_month_ 0`
 FROM functional_alltypes t0

@@ -1,2 +1,2 @@
-SELECT APPROX_COUNT_DISTINCT(if(t0.`month` > 0, t0.`double_col`, NULL)) AS `ApproxCountDistinct_double_col_Greater_month`
+SELECT APPROX_COUNT_DISTINCT(if(t0.`month` > 0, t0.`double_col`, NULL)) AS `ApproxCountDistinct_double_col_ Greater_month_ 0`
 FROM functional_alltypes t0

@@ -1,2 +1,2 @@
-SELECT CAST(t0.`value` AS BYTES) AS `Cast_value_binary`
+SELECT CAST(t0.`value` AS BYTES) AS `Cast_value_ binary`
 FROM t t0

@@ -1,2 +1,2 @@
-SELECT BIT_AND(if(t0.`bigint_col` > 0, t0.`int_col`, NULL)) AS `BitAnd_int_col_Greater_bigint_col`
+SELECT BIT_AND(if(t0.`bigint_col` > 0, t0.`int_col`, NULL)) AS `BitAnd_int_col_ Greater_bigint_col_ 0`
 FROM functional_alltypes t0

@@ -1,2 +1,2 @@
-SELECT BIT_OR(if(t0.`bigint_col` > 0, t0.`int_col`, NULL)) AS `BitOr_int_col_Greater_bigint_col`
+SELECT BIT_OR(if(t0.`bigint_col` > 0, t0.`int_col`, NULL)) AS `BitOr_int_col_ Greater_bigint_col_ 0`
 FROM functional_alltypes t0

@@ -1,2 +1,2 @@
-SELECT BIT_XOR(if(t0.`bigint_col` > 0, t0.`int_col`, NULL)) AS `BitXor_int_col_Greater_bigint_col`
+SELECT BIT_XOR(if(t0.`bigint_col` > 0, t0.`int_col`, NULL)) AS `BitXor_int_col_ Greater_bigint_col_ 0`
 FROM functional_alltypes t0

@@ -1,2 +1,2 @@
-SELECT sum(if((t0.`month` > 6) AND (t0.`month` < 10), CAST(t0.`bool_col` AS INT64), NULL)) AS `Sum_bool_col_And_Greater_month_Less_month`
+SELECT sum(if((t0.`month` > 6) AND (t0.`month` < 10), CAST(t0.`bool_col` AS INT64), NULL)) AS `Sum_bool_col_ And_Greater_month_ 6_ Less_month_ 10`
 FROM functional_alltypes t0

@@ -1,2 +1,2 @@
-SELECT avg(if(t0.`month` > 6, CAST(t0.`bool_col` AS INT64), NULL)) AS `Mean_bool_col_Greater_month`
+SELECT avg(if(t0.`month` > 6, CAST(t0.`bool_col` AS INT64), NULL)) AS `Mean_bool_col_ Greater_month_ 6`
 FROM functional_alltypes t0

@@ -1,2 +1,2 @@
-SELECT CAST(TRUNC(t0.`double_col`) AS INT64) AS `Cast_double_col_int64`
+SELECT CAST(TRUNC(t0.`double_col`) AS INT64) AS `Cast_double_col_ int64`
 FROM functional_alltypes t0

@@ -1,2 +1,2 @@
-SELECT COVAR_POP(t0.`double_col`, t0.`double_col`) AS `Covariance_double_col_double_col`
+SELECT COVAR_POP(t0.`double_col`, t0.`double_col`) AS `Covariance_double_col_ double_col`
 FROM functional_alltypes t0

@@ -1,2 +1,2 @@
-SELECT COVAR_SAMP(t0.`double_col`, t0.`double_col`) AS `Covariance_double_col_double_col`
+SELECT COVAR_SAMP(t0.`double_col`, t0.`double_col`) AS `Covariance_double_col_ double_col`
 FROM functional_alltypes t0

@@ -1 +1 @@
-SELECT MOD(EXTRACT(DAYOFWEEK FROM DATE '2017-01-01') + 5, 7) AS `DayOfWeekIndex_datetime_date`
+SELECT MOD(EXTRACT(DAYOFWEEK FROM DATE '2017-01-01') + 5, 7) AS `DayOfWeekIndex_datetime_date_2017_ 1_ 1`

@@ -1 +1 @@
-SELECT INITCAP(CAST(DATE '2017-01-01' AS STRING FORMAT 'DAY')) AS `DayOfWeekName_datetime_date`
+SELECT INITCAP(CAST(DATE '2017-01-01' AS STRING FORMAT 'DAY')) AS `DayOfWeekName_datetime_date_2017_ 1_ 1`

@@ -1 +1 @@
-SELECT MOD(EXTRACT(DAYOFWEEK FROM TIMESTAMP '2017-01-01 04:55:59') + 5, 7) AS `DayOfWeekIndex_datetime_datetime`
+SELECT MOD(EXTRACT(DAYOFWEEK FROM TIMESTAMP '2017-01-01 04:55:59') + 5, 7) AS `DayOfWeekIndex_datetime_datetime_2017_ 1_ 1_ 4_ 55_ 59`

@@ -1 +1 @@
-SELECT INITCAP(CAST(TIMESTAMP '2017-01-01 04:55:59' AS STRING FORMAT 'DAY')) AS `DayOfWeekName_datetime_datetime`
+SELECT INITCAP(CAST(TIMESTAMP '2017-01-01 04:55:59' AS STRING FORMAT 'DAY')) AS `DayOfWeekName_datetime_datetime_2017_ 1_ 1_ 4_ 55_ 59`

@@ -1 +1 @@
-SELECT MOD(EXTRACT(DAYOFWEEK FROM DATE '2017-01-01') + 5, 7) AS `DayOfWeekIndex_datetime_date`
+SELECT MOD(EXTRACT(DAYOFWEEK FROM DATE '2017-01-01') + 5, 7) AS `DayOfWeekIndex_datetime_date_2017_ 1_ 1`

@@ -1 +1 @@
-SELECT INITCAP(CAST(DATE '2017-01-01' AS STRING FORMAT 'DAY')) AS `DayOfWeekName_datetime_date`
+SELECT INITCAP(CAST(DATE '2017-01-01' AS STRING FORMAT 'DAY')) AS `DayOfWeekName_datetime_date_2017_ 1_ 1`

@@ -1 +1 @@
-SELECT MOD(EXTRACT(DAYOFWEEK FROM TIMESTAMP '2017-01-01 04:55:59') + 5, 7) AS `DayOfWeekIndex_datetime_datetime`
+SELECT MOD(EXTRACT(DAYOFWEEK FROM TIMESTAMP '2017-01-01 04:55:59') + 5, 7) AS `DayOfWeekIndex_datetime_datetime_2017_ 1_ 1_ 4_ 55_ 59`

@@ -1 +1 @@
-SELECT INITCAP(CAST(TIMESTAMP '2017-01-01 04:55:59' AS STRING FORMAT 'DAY')) AS `DayOfWeekName_datetime_datetime`
+SELECT INITCAP(CAST(TIMESTAMP '2017-01-01 04:55:59' AS STRING FORMAT 'DAY')) AS `DayOfWeekName_datetime_datetime_2017_ 1_ 1_ 4_ 55_ 59`

@@ -1 +1 @@
-SELECT MOD(EXTRACT(DAYOFWEEK FROM TIMESTAMP '2017-01-01 04:55:59') + 5, 7) AS `DayOfWeekIndex_datetime_datetime`
+SELECT MOD(EXTRACT(DAYOFWEEK FROM TIMESTAMP '2017-01-01 04:55:59') + 5, 7) AS `DayOfWeekIndex_datetime_datetime_2017_ 1_ 1_ 4_ 55_ 59`

@@ -1 +1 @@
-SELECT INITCAP(CAST(TIMESTAMP '2017-01-01 04:55:59' AS STRING FORMAT 'DAY')) AS `DayOfWeekName_datetime_datetime`
+SELECT INITCAP(CAST(TIMESTAMP '2017-01-01 04:55:59' AS STRING FORMAT 'DAY')) AS `DayOfWeekName_datetime_datetime_2017_ 1_ 1_ 4_ 55_ 59`

@@ -1 +1 @@
-SELECT MOD(EXTRACT(DAYOFWEEK FROM DATE '2017-01-01') + 5, 7) AS `DayOfWeekIndex_datetime_date`
+SELECT MOD(EXTRACT(DAYOFWEEK FROM DATE '2017-01-01') + 5, 7) AS `DayOfWeekIndex_datetime_date_2017_ 1_ 1`

@@ -1 +1 @@
-SELECT INITCAP(CAST(DATE '2017-01-01' AS STRING FORMAT 'DAY')) AS `DayOfWeekName_datetime_date`
+SELECT INITCAP(CAST(DATE '2017-01-01' AS STRING FORMAT 'DAY')) AS `DayOfWeekName_datetime_date_2017_ 1_ 1`

@@ -1,2 +1,2 @@
-SELECT CAST(FLOOR(IEEE_DIVIDE(t0.`double_col`, 0)) AS INT64) AS `FloorDivide_double_col`
+SELECT CAST(FLOOR(IEEE_DIVIDE(t0.`double_col`, 0)) AS INT64) AS `FloorDivide_double_col_ 0`
 FROM functional_alltypes t0

@@ -1,2 +1,2 @@
-SELECT IEEE_DIVIDE(t0.`double_col`, 0) AS `Divide_double_col`
+SELECT IEEE_DIVIDE(t0.`double_col`, 0) AS `Divide_double_col_ 0`
 FROM functional_alltypes t0

@@ -1 +1 @@
-SELECT farm_fingerprint(FROM_BASE64('dGVzdCBvZiBoYXNo')) AS `Hash_b_test_of_hash`
+SELECT farm_fingerprint(FROM_BASE64('dGVzdCBvZiBoYXNo')) AS `Hash_b'test of hash'`

@@ -1 +1 @@
-SELECT farm_fingerprint('test of hash') AS `Hash_test_of_hash`
+SELECT farm_fingerprint('test of hash') AS `Hash_'test of hash'`

@@ -1 +1 @@
-SELECT EXTRACT(year from DATE '2017-01-01') AS `ExtractYear_datetime_date`
+SELECT EXTRACT(year from DATE '2017-01-01') AS `ExtractYear_datetime_date_2017_ 1_ 1`

@@ -1 +1 @@
-SELECT EXTRACT(year from TIMESTAMP '2017-01-01 04:55:59') AS `ExtractYear_datetime_datetime`
+SELECT EXTRACT(year from TIMESTAMP '2017-01-01 04:55:59') AS `ExtractYear_datetime_datetime_2017_ 1_ 1_ 4_ 55_ 59`

@@ -1 +1 @@
-SELECT EXTRACT(year from DATE '2017-01-01') AS `ExtractYear_datetime_date`
+SELECT EXTRACT(year from DATE '2017-01-01') AS `ExtractYear_datetime_date_2017_ 1_ 1`

@@ -1 +1 @@
-SELECT EXTRACT(year from TIMESTAMP '2017-01-01 04:55:59') AS `ExtractYear_datetime_datetime`
+SELECT EXTRACT(year from TIMESTAMP '2017-01-01 04:55:59') AS `ExtractYear_datetime_datetime_2017_ 1_ 1_ 4_ 55_ 59`

@@ -1 +1 @@
-SELECT EXTRACT(year from TIMESTAMP '2017-01-01 04:55:59') AS `ExtractYear_datetime_datetime`
+SELECT EXTRACT(year from TIMESTAMP '2017-01-01 04:55:59') AS `ExtractYear_datetime_datetime_2017_ 1_ 1_ 4_ 55_ 59`

@@ -1 +1 @@
-SELECT EXTRACT(year from DATE '2017-01-01') AS `ExtractYear_datetime_date`
+SELECT EXTRACT(year from DATE '2017-01-01') AS `ExtractYear_datetime_date_2017_ 1_ 1`

@@ -1,2 +1,2 @@
-SELECT PARSE_TIMESTAMP('%F', t0.`date_string_col`) AS `StringToTimestamp_date_string_col_F`
+SELECT PARSE_TIMESTAMP('%F', t0.`date_string_col`) AS `StringToTimestamp_date_string_col_ '%F'`
 FROM functional_alltypes t0

@@ -1,2 +1,2 @@
-SELECT PARSE_TIMESTAMP('%F %Z', CONCAT(t0.`date_string_col`, ' America/New_York')) AS `StringToTimestamp_StringConcat_F_Z`
+SELECT PARSE_TIMESTAMP('%F %Z', CONCAT(t0.`date_string_col`, ' America/New_York')) AS `StringToTimestamp_StringConcat_ '%F %Z'`
 FROM functional_alltypes t0

@@ -18,4 +18,4 @@ function my_len(s) {
 return my_len(s);
 """;

-SELECT (my_len_0('abcd') + my_len_0('abcd')) + my_len_1('abcd') AS `Add_Add_my_len_0_abcd_my_len_0_abcd_my_len_1_abcd`
+SELECT (my_len_0('abcd') + my_len_0('abcd')) + my_len_1('abcd') AS `Add_Add_my_len_0_'abcd'_ my_len_0_'abcd'_ my_len_1_'abcd'`

@@ -9,4 +9,4 @@ function my_len(s) {
 return my_len(s);
 """;

-SELECT my_len_0('abcd') AS `my_len_0_abcd`
+SELECT my_len_0('abcd') AS `my_len_0_'abcd'`

@@ -8,4 +8,4 @@ function my_len(s) {
 return my_len(s);
 """;

-SELECT my_len_0('abcd') AS `my_len_0_abcd`
+SELECT my_len_0('abcd') AS `my_len_0_'abcd'`

@@ -9,4 +9,4 @@ function my_len(s) {
 return my_len(s);
 """;

-SELECT my_len_0('abcd') AS `my_len_0_abcd`
+SELECT my_len_0('abcd') AS `my_len_0_'abcd'`

@@ -2,4 +2,4 @@ CREATE TEMPORARY FUNCTION format_t_0(input STRING)
 RETURNS FLOAT64
 AS (FORMAT('%T', input));

-SELECT format_t_0('abcd') AS `format_t_0_abcd`
+SELECT format_t_0('abcd') AS `format_t_0_'abcd'`

@@ -9,5 +9,5 @@ SELECT
 WHERE
 t0.`x` > 2
 ) AS t1
-) AS `Contains_x_x`
+) AS `Contains_x_ x`
 FROM t AS t0

22 changes: 4 additions & 18 deletions ibis/backends/tests/test_aggregation.py
@@ -1234,30 +1234,21 @@ def test_group_concat(
     backend.assert_frame_equal(result, expected)


-@pytest.mark.parametrize(
-    ('result_fn', 'expected_fn'),
-    [
-        param(
-            lambda t: t.string_col.topk(3),
-            lambda t: t.groupby('string_col')['string_col'].count().head(3),
-            id='string_col_top3',
-        )
-    ],
-)
 @mark.notimpl(
     ["dask"],
     raises=NotImplementedError,
     reason="sorting on aggregations not yet implemented",
 )
-def test_topk_op(alltypes, df, result_fn, expected_fn):
+def test_topk_op(alltypes, df):
     # TopK expression will order rows by "count" but each backend
     # can have different result for that.
     # Note: Maybe would be good if TopK could order by "count"
     # and the field used by TopK
     t = alltypes.order_by(alltypes.string_col)
     df = df.sort_values('string_col')
-    result = result_fn(t).execute()
-    expected = expected_fn(df)
+    expr = t.string_col.topk(3)
+    result = expr.execute()
+    expected = df.groupby('string_col')['string_col'].count().head(3)
     assert all(result.iloc[:, 1].values == expected.values)


@@ -1489,11 +1480,6 @@ def test_group_concat_over_window(backend, con):
     backend.assert_frame_equal(result, expected)


-@pytest.mark.broken(
-    ["bigquery"],
-    reason="auto generated names are still invalid for outer query",
-    raises=GoogleBadRequest,
-)
 @pytest.mark.notimpl(["dask"], raises=NotImplementedError)
 @pytest.mark.notyet(
     ["polars"],
