From f5d9084079b25c38077a09da5ee9efb316e72ccd Mon Sep 17 00:00:00 2001 From: Gil Forsyth Date: Tue, 30 Apr 2024 07:32:00 -0400 Subject: [PATCH] refactor(udf): remove hierarchical usage of schema (#9078) I did not deprecate `schema` as we have elsewhere, because I don't think we can determine the quoting behavior of the backend at UDF definition time, which makes accepting dotted string paths harder. BREAKING CHANGE: The `schema` parameter for UDF definition has been removed. A new `catalog` parameter has been added. Ibis uses the word database to refer to a collection of tables, and the word catalog to refer to a collection of databases. You can use a combination of `catalog` and `database` to specify a hierarchical location for the UDF. --- .../bigquery/tests/unit/udf/test_builtin.py | 2 +- ibis/backends/impala/udf.py | 4 +- ibis/backends/postgres/__init__.py | 10 +- ibis/backends/postgres/tests/test_udf.py | 12 +-- ibis/backends/sql/__init__.py | 7 +- ibis/expr/operations/udf.py | 94 +++++++++++++------ 6 files changed, 86 insertions(+), 43 deletions(-) diff --git a/ibis/backends/bigquery/tests/unit/udf/test_builtin.py b/ibis/backends/bigquery/tests/unit/udf/test_builtin.py index 711a728fa3fb..12468843c499 100644 --- a/ibis/backends/bigquery/tests/unit/udf/test_builtin.py +++ b/ibis/backends/bigquery/tests/unit/udf/test_builtin.py @@ -9,7 +9,7 @@ def farm_fingerprint(value: bytes) -> int: ... -@ibis.udf.scalar.builtin(schema="fn", database="bqutil") +@ibis.udf.scalar.builtin(database="fn", catalog="bqutil") def from_hex(value: str) -> int: """Community function to convert from hex string to integer. diff --git a/ibis/backends/impala/udf.py b/ibis/backends/impala/udf.py index 02801b3df167..32304c65dd35 100644 --- a/ibis/backends/impala/udf.py +++ b/ibis/backends/impala/udf.py @@ -68,7 +68,7 @@ def _create_operation_class(self): fn=self._make_fn(), name=self.name, signature=(self.inputs, self.output), - schema=self.database, + database=self.database, ) @@ -78,7 +78,7 @@ def _create_operation_class(self): fn=self._make_fn(), name=self.name, signature=(self.inputs, self.output), - schema=self.database, + database=self.database, ) diff --git a/ibis/backends/postgres/__init__.py b/ibis/backends/postgres/__init__.py index b3631cbffde7..e7f54864fb48 100644 --- a/ibis/backends/postgres/__init__.py +++ b/ibis/backends/postgres/__init__.py @@ -414,15 +414,15 @@ def current_database(self) -> str: (schema,) = cur.fetchone() return schema - def function(self, name: str, *, schema: str | None = None) -> Callable: + def function(self, name: str, *, database: str | None = None) -> Callable: n = ColGen(table="n") p = ColGen(table="p") f = self.compiler.f predicates = [p.proname.eq(name)] - if schema is not None: - predicates.append(n.nspname.rlike(sge.convert(f"^({schema})$"))) + if database is not None: + predicates.append(n.nspname.rlike(sge.convert(f"^({database})$"))) query = ( sg.select( @@ -448,7 +448,7 @@ def split_name_type(arg: str) -> tuple[str, dt.DataType]: rows = cur.fetchall() if not rows: - name = f"{schema}.{name}" if schema else name + name = f"{database}.{name}" if database else name raise exc.MissingUDFError(name) elif len(rows) > 1: raise exc.AmbiguousUDFError(name) @@ -471,7 +471,7 @@ def fake_func(*args, **kwargs): ... return_annotation=return_type, ) fake_func.__annotations__ = {"return": return_type, **dict(signature)} - op = ops.udf.scalar.builtin(fake_func, schema=schema) + op = ops.udf.scalar.builtin(fake_func, database=database) return op def _get_udf_source(self, udf_node: ops.ScalarUDF): diff --git a/ibis/backends/postgres/tests/test_udf.py b/ibis/backends/postgres/tests/test_udf.py index eff3f73f18b4..c80112cc09e1 100644 --- a/ibis/backends/postgres/tests/test_udf.py +++ b/ibis/backends/postgres/tests/test_udf.py @@ -85,7 +85,7 @@ def table(con_for_udf, table_name, test_database): def test_existing_sql_udf(con_for_udf, test_database, table): """Test creating ibis UDF object based on existing UDF in the database.""" # Create ibis UDF objects referring to UDFs already created in the database - custom_length_udf = con_for_udf.function("custom_len", schema=test_database) + custom_length_udf = con_for_udf.function("custom_len", database=test_database) result_obj = table[table, custom_length_udf(table["user_name"]).name("custom_len")] result = result_obj.execute() assert result["custom_len"].sum() == result["name_length"].sum() @@ -93,7 +93,7 @@ def test_existing_sql_udf(con_for_udf, test_database, table): def test_existing_plpython_udf(con_for_udf, test_database, table): # Create ibis UDF objects referring to UDFs already created in the database - py_length_udf = con_for_udf.function("pylen", schema=test_database) + py_length_udf = con_for_udf.function("pylen", database=test_database) result_obj = table[table, py_length_udf(table["user_name"]).name("custom_len")] result = result_obj.execute() assert result["custom_len"].sum() == result["name_length"].sum() @@ -103,7 +103,7 @@ def test_udf(test_database, table): """Test creating a UDF in database based on Python function and then creating an ibis UDF object based on that.""" - @udf.scalar.python(schema=test_database) + @udf.scalar.python(database=test_database) def mult_a_b(a: int, b: int) -> int: return a * b @@ -129,7 +129,7 @@ def test_array_type(test_database, table): instantiated specifying the datatype of the elements of the array. """ - @udf.scalar.python(schema=test_database) + @udf.scalar.python(database=test_database) def pysplit(text: str, split: str) -> list[str]: return text.split(split) @@ -142,7 +142,7 @@ def test_client_udf_api(test_database, table): """Test creating a UDF in database based on Python function using an ibis client method.""" - @udf.scalar.python(schema=test_database) + @udf.scalar.python(database=test_database) def multiply(a: int, b: int) -> int: return a * b @@ -171,7 +171,7 @@ def wrapped(*args, **kwds): return wrapped @decorator - @udf.scalar.python(schema=test_database) + @udf.scalar.python(database=test_database) def multiply(a: int, b: int) -> int: return a * b diff --git a/ibis/backends/sql/__init__.py b/ibis/backends/sql/__init__.py index 3ad0ead0387c..537a29a2d15c 100644 --- a/ibis/backends/sql/__init__.py +++ b/ibis/backends/sql/__init__.py @@ -521,7 +521,7 @@ def _to_catalog_db_tuple(self, table_loc: sge.Table): def _to_sqlglot_table(self, database): if database is None: return None - elif isinstance(database, tuple): + elif isinstance(database, (list, tuple)): if len(database) > 2: raise ValueError( "Only database hierarchies of two or fewer levels are supported." @@ -558,6 +558,9 @@ def _to_sqlglot_table(self, database): db = table.args["this"] database = sg.exp.Table(catalog=catalog, db=db) else: - raise ValueError("oops") + raise ValueError( + """Invalid database hierarchy format. Please use either dotted + strings ('catalog.database') or tuples ('catalog', 'database').""" + ) return database diff --git a/ibis/expr/operations/udf.py b/ibis/expr/operations/udf.py index c5df57b0ee52..a5aa94a1d0fe 100644 --- a/ibis/expr/operations/udf.py +++ b/ibis/expr/operations/udf.py @@ -101,12 +101,17 @@ def _make_node( fn: Callable, input_type: InputType, name: str | None = None, - schema: str | None = None, database: str | None = None, + catalog: str | None = None, signature: tuple[tuple, Any] | None = None, **kwargs, ) -> type[S]: """Construct a scalar user-defined function that is built-in to the backend.""" + if "schema" in kwargs: + raise exc.UnsupportedArgumentError( + """schema` is not a valid argument. + You can use the `catalog` and `database` keywords to specify a UDF location.""" + ) if signature is None: annotations = typing.get_type_hints(fn) @@ -139,7 +144,7 @@ def _make_node( # method "__func__": property(fget=lambda _, fn=fn: fn), "__config__": FrozenDict(kwargs), - "__udf_namespace__": ops.Namespace(database=schema, catalog=database), + "__udf_namespace__": ops.Namespace(database=database, catalog=catalog), "__module__": fn.__module__, "__func_name__": func_name, } @@ -181,8 +186,8 @@ def builtin( cls, *, name: str | None = None, - schema: str | None = None, database: str | None = None, + catalog: str | None = None, signature: tuple[tuple[Any, ...], Any] | None = None, **kwargs: Any, ) -> Callable[[Callable], Callable[..., ir.Value]]: ... @@ -190,7 +195,14 @@ def builtin( @util.experimental @classmethod def builtin( - cls, fn=None, *, name=None, schema=None, database=None, signature=None, **kwargs + cls, + fn=None, + *, + name=None, + database=None, + catalog=None, + signature=None, + **kwargs, ): """Construct a scalar user-defined function that is built-in to the backend. @@ -200,10 +212,10 @@ def builtin( The function to wrap. name The name of the UDF in the backend if different from the function name. - schema - The schema in which the builtin function resides. database The database in which the builtin function resides. + catalog + The catalog in which the builtin function resides. signature If present, a tuple of the form `((arg0type, arg1type, ...), returntype)`. For example, a function taking an int and a float and returning a @@ -233,8 +245,8 @@ def builtin( InputType.BUILTIN, fn, name=name, - schema=schema, database=database, + catalog=catalog, signature=signature, **kwargs, ) @@ -249,8 +261,8 @@ def python( cls, *, name: str | None = None, - schema: str | None = None, database: str | None = None, + catalog: str | None = None, signature: tuple[tuple[Any, ...], Any] | None = None, **kwargs: Any, ) -> Callable[[Callable], Callable[..., ir.Value]]: ... @@ -258,7 +270,14 @@ def python( @util.experimental @classmethod def python( - cls, fn=None, *, name=None, schema=None, database=None, signature=None, **kwargs + cls, + fn=None, + *, + name=None, + database=None, + catalog=None, + signature=None, + **kwargs, ): """Construct a **non-vectorized** scalar user-defined function that accepts Python scalar values as inputs. @@ -281,10 +300,10 @@ def python( The function to wrap. name The name of the UDF in the backend if different from the function name. - schema - The schema in which to create the UDF. database The database in which to create the UDF. + catalog + The catalog in which to create the UDF. signature If present, a tuple of the form `((arg0type, arg1type, ...), returntype)`. For example, a function taking an int and a float and returning a @@ -346,8 +365,8 @@ def python( InputType.PYTHON, fn, name=name, - schema=schema, database=database, + catalog=catalog, signature=signature, **kwargs, ) @@ -362,8 +381,8 @@ def pandas( cls, *, name: str | None = None, - schema: str | None = None, database: str | None = None, + catalog: str | None = None, signature: tuple[tuple[Any, ...], Any] | None = None, **kwargs: Any, ) -> Callable[[Callable], Callable[..., ir.Value]]: ... @@ -371,7 +390,14 @@ def pandas( @util.experimental @classmethod def pandas( - cls, fn=None, *, name=None, schema=None, database=None, signature=None, **kwargs + cls, + fn=None, + *, + name=None, + database=None, + catalog=None, + signature=None, + **kwargs, ): """Construct a **vectorized** scalar user-defined function that accepts pandas Series' as inputs. @@ -381,10 +407,10 @@ def pandas( The function to wrap. name The name of the UDF in the backend if different from the function name. - schema - The schema in which to create the UDF. database The database in which to create the UDF. + catalog + The catalog in which to create the UDF. signature If present, a tuple of the form `((arg0type, arg1type, ...), returntype)`. For example, a function taking an int and a float and returning a @@ -436,8 +462,8 @@ def pandas( InputType.PANDAS, fn, name=name, - schema=schema, database=database, + catalog=catalog, signature=signature, **kwargs, ) @@ -452,8 +478,8 @@ def pyarrow( cls, *, name: str | None = None, - schema: str | None = None, database: str | None = None, + catalog: str | None = None, signature: tuple[tuple[Any, ...], Any] | None = None, **kwargs: Any, ) -> Callable[[Callable], Callable[..., ir.Value]]: ... @@ -461,7 +487,14 @@ def pyarrow( @util.experimental @classmethod def pyarrow( - cls, fn=None, *, name=None, schema=None, database=None, signature=None, **kwargs + cls, + fn=None, + *, + name=None, + database=None, + catalog=None, + signature=None, + **kwargs, ): """Construct a **vectorized** scalar user-defined function that accepts PyArrow Arrays as input. @@ -471,10 +504,10 @@ def pyarrow( The function to wrap. name The name of the UDF in the backend if different from the function name. - schema - The schema in which to create the UDF. database The database in which to create the UDF. + catalog + The catalog in which to create the UDF. signature If present, a tuple of the form `((arg0type, arg1type, ...), returntype)`. For example, a function taking an int and a float and returning a @@ -515,8 +548,8 @@ def pyarrow( InputType.PYARROW, fn, name=name, - schema=schema, database=database, + catalog=catalog, signature=signature, **kwargs, ) @@ -538,8 +571,8 @@ def builtin( cls, *, name: str | None = None, - schema: str | None = None, database: str | None = None, + catalog: str | None = None, signature: tuple[tuple[Any, ...], Any] | None = None, **kwargs: Any, ) -> Callable[[Callable], Callable[..., ir.Value]]: ... @@ -547,7 +580,14 @@ def builtin( @util.experimental @classmethod def builtin( - cls, fn=None, *, name=None, schema=None, database=None, signature=None, **kwargs + cls, + fn=None, + *, + name=None, + database=None, + catalog=None, + signature=None, + **kwargs, ): """Construct an aggregate user-defined function that is built-in to the backend. @@ -557,10 +597,10 @@ def builtin( The function to wrap. name The name of the UDF in the backend if different from the function name. - schema - The schema in which the builtin function resides. database The database in which the builtin function resides. + catalog + The catalog in which the builtin function resides. signature If present, a tuple of the form `((arg0type, arg1type, ...), returntype)`. For example, a function taking an int and a float and returning a @@ -587,8 +627,8 @@ def builtin( InputType.BUILTIN, fn, name=name, - schema=schema, database=database, + catalog=catalog, signature=signature, **kwargs, )