From 8929fef71ec7ab1b2794ad51576c2dc4e0116c15 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 22 Sep 2023 16:04:59 +0000 Subject: [PATCH 01/18] deps: migrate to `ibis-framework >= "7.0.0"` This should unlock some bug fixes as well as potential `UNNEST` support in a future change. --- bigframes/remote_function.py | 9 +++------ noxfile.py | 6 +++--- setup.py | 3 +-- .../ibis/expr/operations/analytic.py | 12 +++++++----- .../ibis/expr/operations/reductions.py | 8 ++++---- 5 files changed, 18 insertions(+), 20 deletions(-) diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py index 6fc2f8e59f..fbfbc45e51 100644 --- a/bigframes/remote_function.py +++ b/bigframes/remote_function.py @@ -485,17 +485,14 @@ def remote_function_node( """Creates an Ibis node representing a remote function call.""" fields = { - name: rlz.value(type_) if type_ else rlz.any + name: rlz.ValueOf(None if type_ == "ANY TYPE" else type_) for name, type_ in zip( ibis_signature.parameter_names, ibis_signature.input_types ) } - try: - fields["output_type"] = rlz.shape_like("args", dtype=ibis_signature.output_type) # type: ignore - except TypeError: - fields["output_dtype"] = property(lambda _: ibis_signature.output_type) - fields["output_shape"] = rlz.shape_like("args") + fields["dtype"] = ibis_signature.output_type + fields["shape"] = rlz.shape_like("args") node = type(routine_ref_to_string_for_query(routine_ref), (ops.ValueOp,), fields) # type: ignore diff --git a/noxfile.py b/noxfile.py index 033bbfefe4..5ceaa5d06c 100644 --- a/noxfile.py +++ b/noxfile.py @@ -84,9 +84,9 @@ "format", "docs", "docfx", - "unit", - "unit_noextras", - "system", + "unit_prerelease", + # "unit_noextras", + "system_prerelease", "doctest", "cover", "release_dry_run", diff --git a/setup.py b/setup.py index 29eacb74a9..063cdb480d 100644 --- a/setup.py +++ b/setup.py @@ -43,8 +43,7 @@ "google-cloud-iam >=2.12.1", "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", - # TODO: Relax upper bound once we have fixed `system_prerelease` tests. - "ibis-framework[bigquery] >=6.2.0,<7.0.0dev", + "ibis-framework[bigquery] >=7.0.0dev,<8.0.0dev", "pandas >=1.5.0", "pydata-google-auth >=1.8.2", "requests >=2.27.1", diff --git a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py index 038987cac9..9da6fe1115 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py @@ -2,21 +2,23 @@ from __future__ import annotations -from ibis.expr.operations.analytic import Analytic +import ibis.expr.datatypes as dt +import ibis.expr.operations.analytic as ibis_ops_analytic +import ibis.expr.operations.core as ibis_ops_core import ibis.expr.rules as rlz -class FirstNonNullValue(Analytic): +class FirstNonNullValue(ibis_ops_analytic.Analytic): """Retrieve the first element.""" - arg = rlz.column(rlz.any) + arg: ibis_ops_core.Column[dt.Any] output_dtype = rlz.dtype_like("arg") -class LastNonNullValue(Analytic): +class LastNonNullValue(ibis_ops_analytic.Analytic): """Retrieve the last element.""" - arg = rlz.column(rlz.any) + arg: ibis_ops_core.Column[dt.Any] output_dtype = rlz.dtype_like("arg") diff --git a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py index 5e6ad9ecf2..cdbb47b787 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py @@ -3,8 +3,8 @@ from __future__ import annotations import ibis.expr.datatypes as dt +import ibis.expr.operations.core as ibis_ops_core from ibis.expr.operations.reductions import Filterable, Reduction -import ibis.expr.rules as rlz class ApproximateMultiQuantile(Filterable, Reduction): @@ -13,9 +13,9 @@ class ApproximateMultiQuantile(Filterable, Reduction): See: https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions#approx_quantiles """ - arg = rlz.any - num_bins = rlz.value(dt.int64) - output_dtype = dt.Array(dt.float64) + arg: ibis_ops_core.Value + num_bins: ibis_ops_core.Value[dt.Int64] + output_dtype: ibis_ops_core.Value[dt.Array[dt.Float64]] __all__ = [ From a4e995d2d977712f6c22d556c632b1bdaeb4ceeb Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 22 Sep 2023 16:22:09 +0000 Subject: [PATCH 02/18] use dtype instead of output_dtype in custom ops --- .../bigframes_vendored/ibis/expr/operations/analytic.py | 4 ++-- .../bigframes_vendored/ibis/expr/operations/reductions.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py index 9da6fe1115..78aeb67a37 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py @@ -12,14 +12,14 @@ class FirstNonNullValue(ibis_ops_analytic.Analytic): """Retrieve the first element.""" arg: ibis_ops_core.Column[dt.Any] - output_dtype = rlz.dtype_like("arg") + dtype = rlz.dtype_like("arg") class LastNonNullValue(ibis_ops_analytic.Analytic): """Retrieve the last element.""" arg: ibis_ops_core.Column[dt.Any] - output_dtype = rlz.dtype_like("arg") + dtype = rlz.dtype_like("arg") __all__ = [ diff --git a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py index cdbb47b787..76aa34466a 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py @@ -15,7 +15,7 @@ class ApproximateMultiQuantile(Filterable, Reduction): arg: ibis_ops_core.Value num_bins: ibis_ops_core.Value[dt.Int64] - output_dtype: ibis_ops_core.Value[dt.Array[dt.Float64]] + dtype: ibis_ops_core.Value[dt.Array[dt.Float64]] __all__ = [ From 8fd8fd4ed9d2b1c4e50a8e99b8047aab990cca56 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 22 Sep 2023 16:45:12 +0000 Subject: [PATCH 03/18] adjust type annotations --- bigframes/core/__init__.py | 3 +-- bigframes/operations/__init__.py | 13 ++++++++++--- bigframes/operations/aggregations.py | 4 ++-- bigframes/remote_function.py | 2 +- tests/system/small/test_ibis.py | 5 +++-- .../ibis/expr/operations/reductions.py | 2 +- 6 files changed, 18 insertions(+), 11 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 3b3754642e..d84ebd0b83 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -691,8 +691,7 @@ def project_window_op( case_statement = ibis.case() for clause in clauses: case_statement = case_statement.when(clause[0], clause[1]) - case_statement = case_statement.else_(window_op).end() - window_op = case_statement + window_op = case_statement.else_(window_op).end() result = self._set_or_replace_by_id(output_name or column_name, window_op) # TODO(tbergeron): Automatically track analytic expression usage and defer reprojection until required for valid query generation. diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index bc08298eb7..bc1b44eac3 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -18,6 +18,7 @@ import typing import ibis +import ibis.common.annotations import ibis.common.exceptions import ibis.expr.datatypes as ibis_dtypes import ibis.expr.operations.generic @@ -737,9 +738,15 @@ def add_op( ): if isinstance(x, ibis_types.NullScalar) or isinstance(x, ibis_types.NullScalar): return - return typing.cast(ibis_types.NumericValue, x) + typing.cast( - ibis_types.NumericValue, y - ) + try: + # Could be string concatenation or numeric addition. + return x + y # type: ignore + except ibis.common.annotations.SignatureValidationError as exc: + left_type = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(x.type()) + right_type = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(y.type()) + raise TypeError( + f"Cannot add {repr(left_type)} and {repr(right_type)}. {constants.FEEDBACK_LINK}" + ) from exc @short_circuit_nulls() diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 23271e8220..dd458c9e26 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -74,7 +74,7 @@ def _as_ibis( # Will be null if all inputs are null. Pandas defaults to zero sum though. bq_sum = _apply_window_if_present(column.sum(), window) return ( - ibis.case().when(bq_sum.isnull(), ibis_types.literal(0)).else_(bq_sum).end() + ibis.case().when(bq_sum.isnull(), ibis_types.literal(0)).else_(bq_sum).end() # type: ignore ) @@ -167,7 +167,7 @@ def _as_ibis( .else_(magnitude * pow(-1, negative_count_parity)) .end() ) - return float_result.cast(column.type()) + return float_result.cast(column.type()) # type: ignore class MaxOp(AggregateOp): diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py index fbfbc45e51..5c3f07db6b 100644 --- a/bigframes/remote_function.py +++ b/bigframes/remote_function.py @@ -491,7 +491,7 @@ def remote_function_node( ) } - fields["dtype"] = ibis_signature.output_type + fields["dtype"] = ibis_signature.output_type # type: ignore fields["shape"] = rlz.shape_like("args") node = type(routine_ref_to_string_for_query(routine_ref), (ops.ValueOp,), fields) # type: ignore diff --git a/tests/system/small/test_ibis.py b/tests/system/small/test_ibis.py index 58b78e0048..a8927d2f2b 100644 --- a/tests/system/small/test_ibis.py +++ b/tests/system/small/test_ibis.py @@ -26,8 +26,9 @@ def test_approximate_quantiles(session: bigframes.Session, scalars_table_id: str _, dataset, table_id = scalars_table_id.split(".") ibis_table: ibis_types.Table = ibis_client.table(table_id, database=dataset) ibis_column: ibis_types.NumericColumn = ibis_table["int64_col"] - quantiles: ibis_types.ArrayScalar = vendored_ibis_ops.ApproximateMultiQuantile( # type: ignore - ibis_column, num_bins=num_bins + quantiles: ibis_types.ArrayScalar = vendored_ibis_ops.ApproximateMultiQuantile( + ibis_column, # type: ignore + num_bins=num_bins, # type: ignore ).to_expr() value = quantiles[1] num_edges = quantiles.length() diff --git a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py index 76aa34466a..e6644f477a 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py @@ -15,7 +15,7 @@ class ApproximateMultiQuantile(Filterable, Reduction): arg: ibis_ops_core.Value num_bins: ibis_ops_core.Value[dt.Int64] - dtype: ibis_ops_core.Value[dt.Array[dt.Float64]] + dtype = dt.Array(dt.float64) __all__ = [ From 14fb7c8b80b712fda7d88bafda6a627283a130a6 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 13 Nov 2023 10:43:43 -0600 Subject: [PATCH 04/18] Update noxfile.py --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index 14b00ed8a7..58e9a0bf97 100644 --- a/noxfile.py +++ b/noxfile.py @@ -85,7 +85,7 @@ "docs", "docfx", "unit_prerelease", - # "unit_noextras", + "unit_noextras", "system_prerelease", "doctest", "cover", From b1824539e33ce749a66fae49a8848e590fe74063 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 13 Nov 2023 22:30:33 +0000 Subject: [PATCH 05/18] update type annotations --- bigframes/core/compile/compiled.py | 2 +- bigframes/operations/__init__.py | 25 ++++++++++++++++++------- bigframes/operations/aggregations.py | 4 ++-- setup.py | 2 +- testing/constraints-3.9.txt | 4 ++-- tests/unit/resources.py | 2 +- 6 files changed, 25 insertions(+), 14 deletions(-) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 4ba5e6bd08..1e74e13cdd 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -862,7 +862,7 @@ def project_window_op( case_statement = ibis.case() for clause in clauses: case_statement = case_statement.when(clause[0], clause[1]) - case_statement = case_statement.else_(window_op).end() + case_statement = case_statement.else_(window_op).end() # type: ignore window_op = case_statement result = self._set_or_replace_by_id(output_name or column_name, window_op) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 8c42b6c56a..ec3816ad11 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -353,14 +353,23 @@ def _as_ibis(self, x: ibis_types.Value): str_val = typing.cast(ibis_types.StringValue, x) # SQL pad operations will truncate, we do not want to truncate though. - pad_length = ibis.greatest(str_val.length(), self._length) + pad_length = typing.cast( + ibis_types.IntegerValue, ibis.greatest(str_val.length(), self._length) + ) if self._side == "left": return str_val.lpad(pad_length, self._fillchar) elif self._side == "right": return str_val.rpad(pad_length, self._fillchar) else: # side == both # Pad more on right side if can't pad both sides equally - lpad_amount = ((pad_length - str_val.length()) // 2) + str_val.length() + lpad_amount = typing.cast( + ibis_types.IntegerValue, + ( + (pad_length - str_val.length()) + // typing.cast(ibis_types.NumericValue, ibis.literal(2)) + ) + + str_val.length(), + ) return str_val.lpad(lpad_amount, self._fillchar).rpad( pad_length, self._fillchar ) @@ -1054,7 +1063,7 @@ def where_op( replacement: ibis_types.Value, ) -> ibis_types.Value: """Returns x if y is true, otherwise returns z.""" - return ibis.case().when(condition, original).else_(replacement).end() + return ibis.case().when(condition, original).else_(replacement).end() # type: ignore def clip_op( @@ -1067,7 +1076,7 @@ def clip_op( not isinstance(upper, ibis_types.NullScalar) ): return ( - ibis.case() + ibis.case() # type: ignore .when(upper.isnull() | (original > upper), upper) .else_(original) .end() @@ -1076,7 +1085,7 @@ def clip_op( upper, ibis_types.NullScalar ): return ( - ibis.case() + ibis.case() # type: ignore .when(lower.isnull() | (original < lower), lower) .else_(original) .end() @@ -1086,9 +1095,11 @@ def clip_op( ): return original else: - # Note: Pandas has unchanged behavior when upper bound and lower bound are flipped. This implementation requires that lower_bound < upper_bound + # Note: Pandas has unchanged behavior when upper bound and lower bound + # are flipped. + # This implementation requires that lower_bound < upper_bound. return ( - ibis.case() + ibis.case() # type: ignore .when(lower.isnull() | (original < lower), lower) .when(upper.isnull() | (original > upper), upper) .else_(original) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index f69d41afe6..363dfe819d 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -290,7 +290,7 @@ def _as_ibis( dtypes.literal_to_ibis_scalar(bucket_n, force_dtype=Int64Dtype()), ) out = out.else_(None) - return out.end() + return out.end() # type: ignore @property def skips_nulls(self): @@ -482,7 +482,7 @@ def _map_to_literal( original: ibis_types.Value, literal: ibis_types.Scalar ) -> ibis_types.Column: # Hack required to perform aggregations on literals in ibis, even though bigquery will let you directly aggregate literals (eg. 'SELECT COUNT(1) from table1') - return ibis.ifelse(original.isnull(), literal, literal) + return ibis.ifelse(original.isnull(), literal, literal) # type: ignore sum_op = SumOp() diff --git a/setup.py b/setup.py index 063cdb480d..1ad795466e 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ "google-cloud-iam >=2.12.1", "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", - "ibis-framework[bigquery] >=7.0.0dev,<8.0.0dev", + "ibis-framework[bigquery] >=7.0.0,<8.0.0dev", "pandas >=1.5.0", "pydata-google-auth >=1.8.2", "requests >=2.27.1", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index f43d3b4ca0..63e3b43cc4 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -45,7 +45,7 @@ greenlet==2.0.2 grpc-google-iam-v1==0.12.6 grpcio==1.53.0 grpcio-status==1.48.2 -ibis-framework==6.2.0 +ibis-framework==7.0.0 humanize==4.6.0 identify==2.5.22 idna==3.4 @@ -107,7 +107,7 @@ scikit-learn==1.2.2 SecretStorage==3.3.3 six==1.16.0 SQLAlchemy==1.4.0 -sqlglot==10.6.4 +sqlglot==18.7.0 tomli==2.0.1 toolz==0.12.0 tqdm==4.65.0 diff --git a/tests/unit/resources.py b/tests/unit/resources.py index 8fc8acd175..6f74d1c8d9 100644 --- a/tests/unit/resources.py +++ b/tests/unit/resources.py @@ -80,7 +80,7 @@ def create_dataframe( # might not actually be used. Mock out the global session, too. monkeypatch.setattr(bigframes.core.global_session, "_global_session", session) bigframes.options.bigquery._session_started = True - return bigframes.dataframe.DataFrame({}, session=session) + return bigframes.dataframe.DataFrame({"col": []}, session=session) def create_pandas_session(tables: Dict[str, pandas.DataFrame]) -> bigframes.Session: From 0ab94cc33c1ea7b4f80f51ba018bf0368605b64f Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 27 Nov 2023 20:19:01 +0000 Subject: [PATCH 06/18] fix for deferred values --- bigframes/core/compile/compiled.py | 19 +++++++++++++++++-- bigframes/session/__init__.py | 6 ++++-- tests/system/small/test_ibis.py | 8 ++++++-- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 07bca1eca4..9593725396 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -21,6 +21,7 @@ import ibis import ibis.backends.bigquery as ibis_bigquery +import ibis.common.deferred import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types import pandas @@ -62,7 +63,14 @@ def __init__( self._columns = tuple(columns) # To allow for more efficient lookup by column name, create a # dictionary mapping names to column values. - self._column_names = {column.get_name(): column for column in self._columns} + self._column_names = { + ( + column.resolve(table) + if isinstance(column, ibis.common.deferred.Deferred) + else column + ).get_name(): column + for column in self._columns + } @property def columns(self) -> typing.Tuple[ibis_types.Value, ...]: @@ -643,7 +651,14 @@ def __init__( # To allow for more efficient lookup by column name, create a # dictionary mapping names to column values. - self._column_names = {column.get_name(): column for column in self._columns} + self._column_names = { + ( + column.resolve(table) + if isinstance(column, ibis.common.deferred.Deferred) + else column + ).get_name(): column + for column in self._columns + } self._hidden_ordering_column_names = { column.get_name(): column for column in self._hidden_ordering_columns } diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 928123ce74..9211c4fa42 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -472,7 +472,8 @@ def _get_snapshot_sql_and_primary_key( ) table_expression = self.ibis_client.table( table_ref.table_id, - database=f"{table_ref.project}.{table_ref.dataset_id}", + schema=table_ref.dataset_id, + database=table_ref.project, ) # If there are primary keys defined, the query engine assumes these @@ -803,7 +804,8 @@ def _read_pandas( ) table_expression = self.ibis_client.table( load_table_destination.table_id, - database=f"{load_table_destination.project}.{load_table_destination.dataset_id}", + schema=load_table_destination.dataset_id, + database=load_table_destination.project, ) # b/297590178 Potentially a bug in bqclient.load_table_from_dataframe(), that only when the DF is empty, the index columns disappear in table_expression. diff --git a/tests/system/small/test_ibis.py b/tests/system/small/test_ibis.py index a8927d2f2b..86be81dc61 100644 --- a/tests/system/small/test_ibis.py +++ b/tests/system/small/test_ibis.py @@ -23,8 +23,12 @@ def test_approximate_quantiles(session: bigframes.Session, scalars_table_id: str): num_bins = 3 ibis_client = session.ibis_client - _, dataset, table_id = scalars_table_id.split(".") - ibis_table: ibis_types.Table = ibis_client.table(table_id, database=dataset) + project, dataset, table_id = scalars_table_id.split(".") + ibis_table: ibis_types.Table = ibis_client.table( + table_id, + schema=dataset, + database=project, + ) ibis_column: ibis_types.NumericColumn = ibis_table["int64_col"] quantiles: ibis_types.ArrayScalar = vendored_ibis_ops.ApproximateMultiQuantile( ibis_column, # type: ignore From 4931923acd7596a5aa926a325ae041f8f3966a74 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 27 Nov 2023 20:27:39 +0000 Subject: [PATCH 07/18] fix prerelease --- noxfile.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/noxfile.py b/noxfile.py index ec8642b5df..829d4d95ea 100644 --- a/noxfile.py +++ b/noxfile.py @@ -521,23 +521,11 @@ def prerelease(session: nox.sessions.Session, tests_path): ) already_installed.add("pandas") - # TODO(shobs): - # Commit https://github.com/ibis-project/ibis/commit/c20ba7feab6bdea6c299721310e04dbc10551cc2 - # introduced breaking change that removed the following: - # ibis.expr.rules.column - # ibis.expr.rules.value - # ibis.expr.rules.any - # Let's exclude ibis head from prerelease install list for now. Instead, use - # a working ibis-framework version resolved via setup.by (currently resolves - # to version 6.2.0 due to version requirement "6.2.0,<7.0.0dev"). - # We should enable the head back once bigframes support a version that - # includes the above commit. - # session.install( - # "--upgrade", - # "-e", # Use -e so that py.typed file is included. - # "git+https://github.com/ibis-project/ibis.git#egg=ibis-framework", - # ) - session.install("--no-deps", "ibis-framework==6.2.0") + session.install( + "--upgrade", + "-e", # Use -e so that py.typed file is included. + "git+https://github.com/ibis-project/ibis.git#egg=ibis-framework", + ) already_installed.add("ibis-framework") # Workaround https://github.com/googleapis/python-db-dtypes-pandas/issues/178 From 9df0816110521107a570c61a7ffcc930ae79e2a9 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 27 Nov 2023 20:42:53 +0000 Subject: [PATCH 08/18] minimum 7.1.0 --- bigframes/core/compile/compiled.py | 6 +++++- bigframes/session/__init__.py | 4 ++-- setup.py | 2 +- testing/constraints-3.9.txt | 2 +- tests/system/small/test_ibis.py | 2 +- 5 files changed, 10 insertions(+), 6 deletions(-) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 9593725396..537d9c8b52 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -21,7 +21,7 @@ import ibis import ibis.backends.bigquery as ibis_bigquery -import ibis.common.deferred +import ibis.common.deferred # type: ignore import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types import pandas @@ -66,6 +66,8 @@ def __init__( self._column_names = { ( column.resolve(table) + # TODO(https://github.com/ibis-project/ibis/issues/7613): use + # public API to refer to Deferred type. if isinstance(column, ibis.common.deferred.Deferred) else column ).get_name(): column @@ -654,6 +656,8 @@ def __init__( self._column_names = { ( column.resolve(table) + # TODO(https://github.com/ibis-project/ibis/issues/7613): use + # public API to refer to Deferred type. if isinstance(column, ibis.common.deferred.Deferred) else column ).get_name(): column diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 9211c4fa42..6045caa899 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -470,7 +470,7 @@ def _get_snapshot_sql_and_primary_key( ), None, ) - table_expression = self.ibis_client.table( + table_expression = self.ibis_client.table( # type: ignore table_ref.table_id, schema=table_ref.dataset_id, database=table_ref.project, @@ -802,7 +802,7 @@ def _read_pandas( total_ordering_columns=frozenset([ordering_col]), integer_encoding=IntegerEncoding(True, is_sequential=True), ) - table_expression = self.ibis_client.table( + table_expression = self.ibis_client.table( # type: ignore load_table_destination.table_id, schema=load_table_destination.dataset_id, database=load_table_destination.project, diff --git a/setup.py b/setup.py index 1ad795466e..153ce79782 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ "google-cloud-iam >=2.12.1", "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", - "ibis-framework[bigquery] >=7.0.0,<8.0.0dev", + "ibis-framework[bigquery] >=7.1.0,<8.0.0dev", "pandas >=1.5.0", "pydata-google-auth >=1.8.2", "requests >=2.27.1", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 63e3b43cc4..011aafdee9 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -45,7 +45,7 @@ greenlet==2.0.2 grpc-google-iam-v1==0.12.6 grpcio==1.53.0 grpcio-status==1.48.2 -ibis-framework==7.0.0 +ibis-framework==7.1.0 humanize==4.6.0 identify==2.5.22 idna==3.4 diff --git a/tests/system/small/test_ibis.py b/tests/system/small/test_ibis.py index 86be81dc61..9fe1176068 100644 --- a/tests/system/small/test_ibis.py +++ b/tests/system/small/test_ibis.py @@ -24,7 +24,7 @@ def test_approximate_quantiles(session: bigframes.Session, scalars_table_id: str num_bins = 3 ibis_client = session.ibis_client project, dataset, table_id = scalars_table_id.split(".") - ibis_table: ibis_types.Table = ibis_client.table( + ibis_table: ibis_types.Table = ibis_client.table( # type: ignore table_id, schema=dataset, database=project, From 71f48891ff04792037bc8bfdd21eb79c856109ba Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 27 Nov 2023 20:48:06 +0000 Subject: [PATCH 09/18] mypy --- mypy.ini | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mypy.ini b/mypy.ini index 901394813a..3809f8e241 100644 --- a/mypy.ini +++ b/mypy.ini @@ -24,5 +24,8 @@ ignore_missing_imports = True [mypy-pyarrow] ignore_missing_imports = True +[mypy-ibis.*] +ignore_missing_imports = True + [mypy-ipywidgets] ignore_missing_imports = True From 6c64ec59169d7b4e70850827ba0bc5cdd920e7ed Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 27 Nov 2023 23:33:58 +0000 Subject: [PATCH 10/18] revert presubmit changes --- bigframes/session/__init__.py | 8 ++++---- noxfile.py | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 6045caa899..a9e65dc171 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -472,8 +472,8 @@ def _get_snapshot_sql_and_primary_key( ) table_expression = self.ibis_client.table( # type: ignore table_ref.table_id, - schema=table_ref.dataset_id, - database=table_ref.project, + # TODO: use "dataset_id" as the "schema" + database=f"{table_ref.project}.{table_ref.dataset_id}", ) # If there are primary keys defined, the query engine assumes these @@ -804,8 +804,8 @@ def _read_pandas( ) table_expression = self.ibis_client.table( # type: ignore load_table_destination.table_id, - schema=load_table_destination.dataset_id, - database=load_table_destination.project, + # TODO: use "dataset_id" as the "schema" + database=f"{load_table_destination.project}.{load_table_destination.dataset_id}", ) # b/297590178 Potentially a bug in bqclient.load_table_from_dataframe(), that only when the DF is empty, the index columns disappear in table_expression. diff --git a/noxfile.py b/noxfile.py index 829d4d95ea..0fe059580d 100644 --- a/noxfile.py +++ b/noxfile.py @@ -84,9 +84,9 @@ "format", "docs", "docfx", - "unit_prerelease", + "unit", "unit_noextras", - "system_prerelease", + "system", "doctest", "cover", ] @@ -521,10 +521,18 @@ def prerelease(session: nox.sessions.Session, tests_path): ) already_installed.add("pandas") + # Ibis has introduced breaking changes. Let's exclude ibis head + # from prerelease install list for now. We should enable the head back + # once bigframes supports the version at HEAD. + # session.install( + # "--upgrade", + # "-e", # Use -e so that py.typed file is included. + # "git+https://github.com/ibis-project/ibis.git@7.x.x#egg=ibis-framework", + # ) session.install( "--upgrade", - "-e", # Use -e so that py.typed file is included. - "git+https://github.com/ibis-project/ibis.git#egg=ibis-framework", + "--pre", + "ibis-framework", ) already_installed.add("ibis-framework") From e37571c3771c1f0c2a47ea40f3206d224a9a08c2 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 27 Nov 2023 23:39:05 +0000 Subject: [PATCH 11/18] fix minimum sqlglot --- testing/constraints-3.9.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 011aafdee9..218255c77e 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -107,7 +107,7 @@ scikit-learn==1.2.2 SecretStorage==3.3.3 six==1.16.0 SQLAlchemy==1.4.0 -sqlglot==18.7.0 +sqlglot==18.12.0 tomli==2.0.1 toolz==0.12.0 tqdm==4.65.0 From fad36a8cd48275b8701102309754bf0685707437 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 28 Nov 2023 18:41:06 +0000 Subject: [PATCH 12/18] fix custom op --- third_party/bigframes_vendored/ibis/expr/operations/json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/json.py b/third_party/bigframes_vendored/ibis/expr/operations/json.py index dbb3fa3066..772c2e8ff4 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/json.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/json.py @@ -6,4 +6,4 @@ class ToJsonString(Unary): - output_dtype = dt.string + dtype = dt.string From c318b186647290c71fb35bf5f4234f530f6b2780 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 14 Dec 2023 23:23:11 +0000 Subject: [PATCH 13/18] hack InMemoryTable formatter back in --- bigframes/session/__init__.py | 4 +- .../ibis/backends/bigquery/__init__.py | 3 + .../ibis/backends/bigquery/compiler.py | 59 +++++++++++++++++++ 3 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 third_party/bigframes_vendored/ibis/backends/bigquery/compiler.py diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index a794e709ff..fb5fab86ce 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -79,9 +79,9 @@ import bigframes.session.clients import bigframes.version -# Even though the ibis.backends.bigquery.registry import is unused, it's needed +# Even though the ibis.backends.bigquery import is unused, it's needed # to register new and replacement ops with the Ibis BigQuery backend. -import third_party.bigframes_vendored.ibis.backends.bigquery.registry # noqa +import third_party.bigframes_vendored.ibis.backends.bigquery # noqa import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import third_party.bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import third_party.bigframes_vendored.pandas.io.parquet as third_party_pandas_parquet diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py index e69de29bb2..43508fab11 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py @@ -0,0 +1,3 @@ +# Import all sub-modules to monkeypatch everything. +import third_party.bigframes_vendored.ibis.backends.bigquery.compiler # noqa +import third_party.bigframes_vendored.ibis.backends.bigquery.registry # noqa diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/compiler.py b/third_party/bigframes_vendored/ibis/backends/bigquery/compiler.py new file mode 100644 index 0000000000..414f0a7c81 --- /dev/null +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/compiler.py @@ -0,0 +1,59 @@ +# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/backends/bigquery/compiler.py +"""Module to convert from Ibis expression to SQL string.""" + +from __future__ import annotations + +import re + +from ibis.backends.base.sql import compiler as sql_compiler +import ibis.backends.bigquery.compiler +from ibis.backends.bigquery.datatypes import BigQueryType +import ibis.expr.datatypes as dt +import ibis.expr.operations as ops + +_NAME_REGEX = re.compile(r'[^!"$()*,./;?@[\\\]^`{}~\n]+') +_EXACT_NAME_REGEX = re.compile(f"^{_NAME_REGEX.pattern}$") + + +class BigQueryTableSetFormatter(sql_compiler.TableSetFormatter): + def _quote_identifier(self, name): + """Restore 6.x version of identifier quoting. + + 7.x uses sqlglot which as of December 2023 doesn't know about the + extended unicode names for BigQuery yet. + """ + if _EXACT_NAME_REGEX.match(name) is not None: + return name + return f"`{name}`" + + def _format_in_memory_table(self, op): + """Restore 6.x version of InMemoryTable. + + BigQuery DataFrames explicitly uses InMemoryTable only when we know + the data is small enough to embed in SQL. + """ + schema = op.schema + names = schema.names + types = schema.types + + raw_rows = [] + for row in op.data.to_frame().itertuples(index=False): + raw_row = ", ".join( + f"{self._translate(lit)} AS {name}" + for lit, name in zip( + map(ops.Literal, row, types), map(self._quote_identifier, names) + ) + ) + raw_rows.append(f"STRUCT({raw_row})") + array_type = BigQueryType.from_ibis(dt.Array(op.schema.as_struct())) + + return f"UNNEST({array_type}[{', '.join(raw_rows)}])" + + +# Override implementation. +ibis.backends.bigquery.compiler.BigQueryTableSetFormatter._quote_identifier = ( + BigQueryTableSetFormatter._quote_identifier +) +ibis.backends.bigquery.compiler.BigQueryTableSetFormatter._format_in_memory_table = ( + BigQueryTableSetFormatter._format_in_memory_table +) From d3304b2897cd88097cb115287e05a30e88f490b8 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 15 Dec 2023 16:08:05 +0000 Subject: [PATCH 14/18] use ops module to avoid breaking changes if ops move around --- .../ibis/expr/operations/analytic.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py index 78aeb67a37..3d6a3b37b1 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py @@ -2,23 +2,21 @@ from __future__ import annotations -import ibis.expr.datatypes as dt -import ibis.expr.operations.analytic as ibis_ops_analytic -import ibis.expr.operations.core as ibis_ops_core +import ibis.expr.operations as ops import ibis.expr.rules as rlz -class FirstNonNullValue(ibis_ops_analytic.Analytic): +class FirstNonNullValue(ops.Analytic): """Retrieve the first element.""" - arg: ibis_ops_core.Column[dt.Any] + arg: ops.Column dtype = rlz.dtype_like("arg") -class LastNonNullValue(ibis_ops_analytic.Analytic): +class LastNonNullValue(ops.Analytic): """Retrieve the last element.""" - arg: ibis_ops_core.Column[dt.Any] + arg: ops.Column dtype = rlz.dtype_like("arg") From 33bd2e0a99e24c492646846ad22aaa0a160c45d4 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 15 Dec 2023 17:04:00 +0000 Subject: [PATCH 15/18] workaround nullscalar issue --- bigframes/operations/__init__.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index ec3816ad11..0655aafdb3 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -732,10 +732,29 @@ def ne_op( return x != y +def _null_or_value(value: ibis_types.Value, where_value: ibis_types.BooleanValue): + return ibis.where( + where_value, + value, + ibis.null(), + ) + + def and_op( x: ibis_types.Value, y: ibis_types.Value, ): + # Workaround issue https://github.com/ibis-project/ibis/issues/7775 by + # implementing three-valued logic ourselves. For AND, when we encounter a + # NULL value, we only know when the result is FALSE, otherwise the result + # is unknown (NULL). See: truth table at + # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR + if isinstance(x, ibis_types.NullScalar): + return _null_or_value(y, y == ibis.literal(False)) + + if isinstance(y, ibis_types.NullScalar): + return _null_or_value(x, x == ibis.literal(False)) + return typing.cast(ibis_types.BooleanValue, x) & typing.cast( ibis_types.BooleanValue, y ) @@ -745,6 +764,17 @@ def or_op( x: ibis_types.Value, y: ibis_types.Value, ): + # Workaround issue https://github.com/ibis-project/ibis/issues/7775 by + # implementing three-valued logic ourselves. For OR, when we encounter a + # NULL value, we only know when the result is TRUE, otherwise the result + # is unknown (NULL). See: truth table at + # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR + if isinstance(x, ibis_types.NullScalar): + return _null_or_value(y, y == ibis.literal(True)) + + if isinstance(y, ibis_types.NullScalar): + return _null_or_value(x, x == ibis.literal(True)) + return typing.cast(ibis_types.BooleanValue, x) | typing.cast( ibis_types.BooleanValue, y ) @@ -756,7 +786,7 @@ def add_op( y: ibis_types.Value, ): if isinstance(x, ibis_types.NullScalar) or isinstance(x, ibis_types.NullScalar): - return + return ibis.null() try: # Could be string concatenation or numeric addition. return x + y # type: ignore From c373dc0edda6a41739b1985b685d95612d7f7f27 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 15 Dec 2023 17:40:14 +0000 Subject: [PATCH 16/18] update usage of percent_rank to explicitly order by the value --- bigframes/core/reshape/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index dc61c3baad..24c1bff309 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -18,6 +18,7 @@ import bigframes.constants as constants import bigframes.core as core +import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.dataframe import bigframes.operations as ops @@ -145,7 +146,10 @@ def qcut( block, result = block.apply_window_op( x._value_column, agg_ops.QcutOp(q), - window_spec=core.WindowSpec(grouping_keys=(nullity_id,)), + window_spec=core.WindowSpec( + grouping_keys=(nullity_id,), + ordering=(order.OrderingColumnReference(x._value_column),), + ), ) block, result = block.apply_binary_op( result, nullity_id, ops.partial_arg3(ops.where_op, None), result_label=label From 41364b591b14925ef113f7fd590b24a987059302 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 15 Dec 2023 18:26:44 +0000 Subject: [PATCH 17/18] disable ibis prerelease tests for now --- noxfile.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/noxfile.py b/noxfile.py index 5416a29c9a..4b0f6cdb02 100644 --- a/noxfile.py +++ b/noxfile.py @@ -532,12 +532,12 @@ def prerelease(session: nox.sessions.Session, tests_path): # "-e", # Use -e so that py.typed file is included. # "git+https://github.com/ibis-project/ibis.git@7.x.x#egg=ibis-framework", # ) - session.install( - "--upgrade", - "--pre", - "ibis-framework", - ) - already_installed.add("ibis-framework") + # session.install( + # "--upgrade", + # "--pre", + # "ibis-framework", + # ) + # already_installed.add("ibis-framework") # Workaround https://github.com/googleapis/python-db-dtypes-pandas/issues/178 session.install("--no-deps", "db-dtypes") From 7a8784c2c0bbad63e73de9c799223220230e8233 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 15 Dec 2023 19:12:31 +0000 Subject: [PATCH 18/18] fix unit_prerelease --- noxfile.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/noxfile.py b/noxfile.py index 4b0f6cdb02..c0ec3b0c54 100644 --- a/noxfile.py +++ b/noxfile.py @@ -532,12 +532,12 @@ def prerelease(session: nox.sessions.Session, tests_path): # "-e", # Use -e so that py.typed file is included. # "git+https://github.com/ibis-project/ibis.git@7.x.x#egg=ibis-framework", # ) - # session.install( - # "--upgrade", - # "--pre", - # "ibis-framework", - # ) - # already_installed.add("ibis-framework") + session.install( + "--upgrade", + # "--pre", + "ibis-framework>=7.1.0,<8.0.0dev", + ) + already_installed.add("ibis-framework") # Workaround https://github.com/googleapis/python-db-dtypes-pandas/issues/178 session.install("--no-deps", "db-dtypes")