From 00e80871bfc80fb060dbf54efa477cb5c894a6ad Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Mon, 4 Apr 2022 10:23:43 -0400 Subject: [PATCH] feat(duckdb/postgres/mysql/pyspark): implement `.sql` on tables for mixing sql and expressions --- ibis/backends/base/sql/__init__.py | 6 + ibis/backends/base/sql/alchemy/__init__.py | 31 +++++ .../base/sql/alchemy/query_builder.py | 20 ++- .../base/sql/compiler/extract_subqueries.py | 12 +- ibis/backends/base/sql/compiler/translator.py | 2 +- ibis/backends/conftest.py | 24 +++- ibis/backends/duckdb/__init__.py | 15 ++- ibis/backends/mysql/__init__.py | 17 +++ ibis/backends/postgres/__init__.py | 77 +---------- ibis/backends/postgres/datatypes.py | 73 +++++++++++ ibis/backends/pyspark/compiler.py | 25 ++++ ibis/backends/pyspark/tests/conftest.py | 9 +- ibis/backends/tests/base.py | 3 + ibis/backends/tests/test_dot_sql.py | 123 ++++++++++++++++++ ibis/expr/format.py | 34 ++++- ibis/expr/operations/relations.py | 26 ++++ ibis/expr/types/relations.py | 24 ++++ ibis/tests/expr/test_format.py | 6 +- 18 files changed, 441 insertions(+), 86 deletions(-) create mode 100644 ibis/backends/postgres/datatypes.py create mode 100644 ibis/backends/tests/test_dot_sql.py diff --git a/ibis/backends/base/sql/__init__.py b/ibis/backends/base/sql/__init__.py index 496b5c9a92ac..f74b7ad83c3b 100644 --- a/ibis/backends/base/sql/__init__.py +++ b/ibis/backends/base/sql/__init__.py @@ -263,3 +263,9 @@ def has_operation(cls, operation: type[ops.ValueOp]) -> bool: translator = cls.compiler.translator_class op_classes = translator._registry.keys() | translator._rewrites.keys() return operation in op_classes + + def _create_temp_view(self, view, definition): + raise NotImplementedError( + f"The {self.name} backend does not implement temporary view " + "creation" + ) diff --git a/ibis/backends/base/sql/alchemy/__init__.py b/ibis/backends/base/sql/alchemy/__init__.py index ad6c8e9c935b..66adef818abb 100644 --- a/ibis/backends/base/sql/alchemy/__init__.py +++ b/ibis/backends/base/sql/alchemy/__init__.py @@ -86,6 +86,7 @@ def do_connect(self, con: sa.engine.Engine) -> None: self._inspector = sa.inspect(self.con) self.meta = sa.MetaData(bind=self.con) self._schemas: dict[str, sch.Schema] = {} + self._temp_views: set[str] = set() @property def version(self): @@ -478,3 +479,33 @@ def insert( "is not a pandas DataFrame or is not a ibis TableExpr." f"The given obj is of type {type(obj).__name__} ." ) + + def _get_temp_view_definition( + self, + name: str, + definition: sa.sql.compiler.Compiled, + ) -> str: + raise NotImplementedError( + f"The {self.name} backend does not implement temporary view " + "creation" + ) + + def _register_temp_view_cleanup(self, name: str, raw_name: str) -> None: + pass + + def _create_temp_view( + self, + view: sa.Table, + definition: sa.sql.Selectable, + ) -> None: + raw_name = view.name + if raw_name not in self._temp_views and raw_name in self.list_tables(): + raise ValueError(f"{raw_name} already exists as a table or view") + + name = self.con.dialect.identifier_preparer.quote_identifier(raw_name) + compiled = definition.compile() + defn = self._get_temp_view_definition(name, definition=compiled) + query = sa.text(defn).bindparams(**compiled.params) + self.con.execute(query, definition) + self._temp_views.add(raw_name) + self._register_temp_view_cleanup(name, raw_name) diff --git a/ibis/backends/base/sql/alchemy/query_builder.py b/ibis/backends/base/sql/alchemy/query_builder.py index b7b9557d8c05..a3ded3fab1e6 100644 --- a/ibis/backends/base/sql/alchemy/query_builder.py +++ b/ibis/backends/base/sql/alchemy/query_builder.py @@ -1,9 +1,12 @@ +from __future__ import annotations + import functools import sqlalchemy as sa import sqlalchemy.sql as sql import ibis.expr.operations as ops +import ibis.expr.schema as sch import ibis.expr.types as ir from ibis.backends.base.sql.compiler import ( Compiler, @@ -18,6 +21,10 @@ from .translator import AlchemyContext, AlchemyExprTranslator +def _schema_to_sqlalchemy_columns(schema: sch.Schema) -> list[sa.Column]: + return [sa.column(n, to_sqla_type(t)) for n, t in schema.items()] + + class _AlchemyTableSetFormatter(TableSetFormatter): def get_result(self): # Got to unravel the join stack; the nesting order could be @@ -85,8 +92,19 @@ def _format_table(self, expr): schema = ref_op.schema result = sa.table( ref_op.name, - *(sa.column(n, to_sqla_type(t)) for n, t in schema.items()), + *_schema_to_sqlalchemy_columns(schema), + ) + elif isinstance(ref_op, ops.SQLStringView): + columns = _schema_to_sqlalchemy_columns(ref_op.schema) + result = sa.text(ref_op.query).columns(*columns).cte(ref_op.name) + elif isinstance(ref_op, ops.View): + definition = ref_op.child.compile() + result = sa.table( + ref_op.name, + *_schema_to_sqlalchemy_columns(ref_op.schema), ) + backend = ref_op.child._find_backend() + backend._create_temp_view(view=result, definition=definition) else: # A subquery if ctx.is_extracted(ref_expr): diff --git a/ibis/backends/base/sql/compiler/extract_subqueries.py b/ibis/backends/base/sql/compiler/extract_subqueries.py index 03efe4721a0b..1b97526512a2 100644 --- a/ibis/backends/base/sql/compiler/extract_subqueries.py +++ b/ibis/backends/base/sql/compiler/extract_subqueries.py @@ -109,10 +109,6 @@ def visit_Difference(self, expr): self.visit(op.right) self.observe(expr) - def visit_MaterializedJoin(self, expr): - self.visit(expr.op().join) - self.observe(expr) - def visit_Selection(self, expr): self.visit(expr.op().table) self.observe(expr) @@ -120,6 +116,14 @@ def visit_Selection(self, expr): def visit_SQLQueryResult(self, expr): self.observe(expr) + def visit_View(self, expr): + self.visit(expr.op().child) + self.observe(expr) + + def visit_SQLStringView(self, expr): + self.visit(expr.op().child) + self.observe(expr) + def visit_TableColumn(self, expr): table = expr.op().table if not self.seen(table): diff --git a/ibis/backends/base/sql/compiler/translator.py b/ibis/backends/base/sql/compiler/translator.py index 5732cd1f84b5..905cfcd49520 100644 --- a/ibis/backends/base/sql/compiler/translator.py +++ b/ibis/backends/base/sql/compiler/translator.py @@ -75,7 +75,7 @@ def get_compiled_expr(self, expr): pass op = expr.op() - if isinstance(op, ops.SQLQueryResult): + if isinstance(op, (ops.SQLQueryResult, ops.SQLStringView)): result = op.query else: result = self._compile_subquery(expr) diff --git a/ibis/backends/conftest.py b/ibis/backends/conftest.py index 0ad3d24f1977..b74a5515658b 100644 --- a/ibis/backends/conftest.py +++ b/ibis/backends/conftest.py @@ -262,7 +262,11 @@ def pytest_runtest_call(item): def backend(request, data_directory): """Return an instance of BackendTest.""" cls = _get_backend_conf(request.param) - return cls(data_directory) + result = cls(data_directory) + try: + yield result + finally: + result.cleanup() @pytest.fixture(scope='session') @@ -286,7 +290,11 @@ def ddl_backend(request, data_directory): (sqlite, postgres, mysql, datafusion, clickhouse, pyspark, impala) """ cls = _get_backend_conf(request.param) - return cls(data_directory) + result = cls(data_directory) + try: + yield result + finally: + result.cleanup() @pytest.fixture(scope='session') @@ -315,7 +323,11 @@ def alchemy_backend(request, data_directory): ) else: cls = _get_backend_conf(request.param) - return cls(data_directory) + result = cls(data_directory) + try: + yield result + finally: + result.cleanup() @pytest.fixture(scope='session') @@ -335,7 +347,11 @@ def udf_backend(request, data_directory): Runs the UDF-supporting backends """ cls = _get_backend_conf(request.param) - return cls(data_directory) + result = cls(data_directory) + try: + yield result + finally: + result.cleanup() @pytest.fixture(scope='session') diff --git a/ibis/backends/duckdb/__init__.py b/ibis/backends/duckdb/__init__.py index d256bc420fda..cd43c23132bf 100644 --- a/ibis/backends/duckdb/__init__.py +++ b/ibis/backends/duckdb/__init__.py @@ -14,6 +14,7 @@ from ibis.backends.base.sql.alchemy import BaseAlchemyBackend from .compiler import DuckDBSQLCompiler +from .datatypes import parse_type class Backend(BaseAlchemyBackend): @@ -69,4 +70,16 @@ def _get_schema_using_query(self, query: str) -> sch.Schema: """Return an ibis Schema from a SQL string.""" with self.con.connect() as con: rel = con.connection.c.query(query) - return sch.infer(rel) + return sch.Schema.from_dict( + { + name: parse_type(type) + for name, type in zip(rel.columns, rel.types) + } + ) + + def _get_temp_view_definition( + self, + name: str, + definition: sa.sql.compiler.Compiled, + ) -> str: + return f"CREATE OR REPLACE TEMPORARY VIEW {name} AS {definition}" diff --git a/ibis/backends/mysql/__init__.py b/ibis/backends/mysql/__init__.py index fb47b11b8492..3c0cdd2fc79d 100644 --- a/ibis/backends/mysql/__init__.py +++ b/ibis/backends/mysql/__init__.py @@ -2,6 +2,7 @@ from __future__ import annotations +import atexit import contextlib import warnings from typing import Literal @@ -133,6 +134,22 @@ def _get_schema_using_query(self, query: str) -> sch.Schema: ] return sch.Schema.from_tuples(fields) + def _get_temp_view_definition( + self, + name: str, + definition: sa.sql.compiler.Compiled, + ) -> str: + return f"CREATE OR REPLACE VIEW {name} AS {definition}" + + def _register_temp_view_cleanup(self, name: str, raw_name: str) -> None: + query = f"DROP VIEW IF EXISTS {name}" + + def drop(self, raw_name: str, query: str): + self.con.execute(query) + self._temp_views.discard(raw_name) + + atexit.register(drop, self, raw_name, query) + # TODO(kszucs): unsigned integers diff --git a/ibis/backends/postgres/__init__.py b/ibis/backends/postgres/__init__.py index d2057473ed1b..2a3acb8f2f42 100644 --- a/ibis/backends/postgres/__init__.py +++ b/ibis/backends/postgres/__init__.py @@ -7,13 +7,12 @@ import sqlalchemy as sa -import ibis.backends.duckdb.datatypes as ddb -import ibis.expr.datatypes as dt import ibis.expr.schema as sch from ibis import util from ibis.backends.base.sql.alchemy import BaseAlchemyBackend from .compiler import PostgreSQLCompiler +from .datatypes import _get_type from .udf import udf @@ -205,71 +204,9 @@ def _get_schema_using_query(self, query: str) -> sch.Schema: tuples = [(col, _get_type(typestr)) for col, typestr in type_info] return sch.Schema.from_tuples(tuples) - -def _get_type(typestr: str) -> dt.DataType: - try: - return _type_mapping[typestr] - except KeyError: - return ddb.parse_type(typestr) - - -_type_mapping = { - "boolean": dt.bool, - "boolean[]": dt.Array(dt.bool), - "bytea": dt.binary, - "bytea[]": dt.Array(dt.binary), - "character(1)": dt.string, - "character(1)[]": dt.Array(dt.string), - "bigint": dt.int64, - "bigint[]": dt.Array(dt.int64), - "smallint": dt.int16, - "smallint[]": dt.Array(dt.int16), - "integer": dt.int32, - "integer[]": dt.Array(dt.int32), - "text": dt.string, - "text[]": dt.Array(dt.string), - "json": dt.json, - "json[]": dt.Array(dt.json), - "point": dt.point, - "point[]": dt.Array(dt.point), - "polygon": dt.polygon, - "polygon[]": dt.Array(dt.polygon), - "line": dt.linestring, - "line[]": dt.Array(dt.linestring), - "real": dt.float32, - "real[]": dt.Array(dt.float32), - "double precision": dt.float64, - "double precision[]": dt.Array(dt.float64), - "macaddr8": dt.macaddr, - "macaddr8[]": dt.Array(dt.macaddr), - "macaddr": dt.macaddr, - "macaddr[]": dt.Array(dt.macaddr), - "inet": dt.inet, - "inet[]": dt.Array(dt.inet), - "character": dt.string, - "character[]": dt.Array(dt.string), - "character varying": dt.string, - "character varying[]": dt.Array(dt.string), - "date": dt.date, - "date[]": dt.Array(dt.date), - "time without time zone": dt.time, - "time without time zone[]": dt.Array(dt.time), - "timestamp without time zone": dt.timestamp, - "timestamp without time zone[]": dt.Array(dt.timestamp), - "timestamp with time zone": dt.Timestamp("UTC"), - "timestamp with time zone[]": dt.Array(dt.Timestamp("UTC")), - "interval": dt.interval, - "interval[]": dt.Array(dt.interval), - # NB: this isn"t correct, but we try not to fail - "time with time zone": "time", - "numeric": dt.decimal, - "numeric[]": dt.Array(dt.decimal), - "uuid": dt.uuid, - "uuid[]": dt.Array(dt.uuid), - "jsonb": dt.jsonb, - "jsonb[]": dt.Array(dt.jsonb), - "geometry": dt.geometry, - "geometry[]": dt.Array(dt.geometry), - "geography": dt.geography, - "geography[]": dt.Array(dt.geography), -} + def _get_temp_view_definition( + self, + name: str, + definition: sa.sql.compiler.Compiled, + ) -> str: + return f"CREATE OR REPLACE TEMPORARY VIEW {name} AS {definition}" diff --git a/ibis/backends/postgres/datatypes.py b/ibis/backends/postgres/datatypes.py new file mode 100644 index 000000000000..dc02cf516e7c --- /dev/null +++ b/ibis/backends/postgres/datatypes.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +import ibis.backends.duckdb.datatypes as ddb +import ibis.expr.datatypes as dt + + +def _get_type(typestr: str) -> dt.DataType: + try: + return _type_mapping[typestr] + except KeyError: + return ddb.parse_type(typestr) + + +_type_mapping = { + "boolean": dt.bool, + "boolean[]": dt.Array(dt.bool), + "bytea": dt.binary, + "bytea[]": dt.Array(dt.binary), + "character(1)": dt.string, + "character(1)[]": dt.Array(dt.string), + "bigint": dt.int64, + "bigint[]": dt.Array(dt.int64), + "smallint": dt.int16, + "smallint[]": dt.Array(dt.int16), + "integer": dt.int32, + "integer[]": dt.Array(dt.int32), + "text": dt.string, + "text[]": dt.Array(dt.string), + "json": dt.json, + "json[]": dt.Array(dt.json), + "point": dt.point, + "point[]": dt.Array(dt.point), + "polygon": dt.polygon, + "polygon[]": dt.Array(dt.polygon), + "line": dt.linestring, + "line[]": dt.Array(dt.linestring), + "real": dt.float32, + "real[]": dt.Array(dt.float32), + "double precision": dt.float64, + "double precision[]": dt.Array(dt.float64), + "macaddr8": dt.macaddr, + "macaddr8[]": dt.Array(dt.macaddr), + "macaddr": dt.macaddr, + "macaddr[]": dt.Array(dt.macaddr), + "inet": dt.inet, + "inet[]": dt.Array(dt.inet), + "character": dt.string, + "character[]": dt.Array(dt.string), + "character varying": dt.string, + "character varying[]": dt.Array(dt.string), + "date": dt.date, + "date[]": dt.Array(dt.date), + "time without time zone": dt.time, + "time without time zone[]": dt.Array(dt.time), + "timestamp without time zone": dt.timestamp, + "timestamp without time zone[]": dt.Array(dt.timestamp), + "timestamp with time zone": dt.Timestamp("UTC"), + "timestamp with time zone[]": dt.Array(dt.Timestamp("UTC")), + "interval": dt.interval, + "interval[]": dt.Array(dt.interval), + # NB: this isn"t correct, but we try not to fail + "time with time zone": "time", + "numeric": dt.decimal, + "numeric[]": dt.Array(dt.decimal), + "uuid": dt.uuid, + "uuid[]": dt.Array(dt.uuid), + "jsonb": dt.jsonb, + "jsonb[]": dt.Array(dt.jsonb), + "geometry": dt.geometry, + "geometry[]": dt.Array(dt.geometry), + "geography": dt.geography, + "geography[]": dt.Array(dt.geography), +} diff --git a/ibis/backends/pyspark/compiler.py b/ibis/backends/pyspark/compiler.py index 74aa812500f6..065f62af6139 100644 --- a/ibis/backends/pyspark/compiler.py +++ b/ibis/backends/pyspark/compiler.py @@ -1873,3 +1873,28 @@ def compile_searched_case(t, expr, scope, timecontext, **kwargs): return existing_when.otherwise( t.translate(op.default, scope, timecontext, **kwargs) ) + + +@compiles(ops.View) +def compile_view(t, expr, scope, timecontext, **kwargs): + op = expr.op() + name = op.name + child = op.child + backend = child._find_backend() + tables = backend._session.catalog.listTables() + if any(name == table.name and not table.isTemporary for table in tables): + raise ValueError( + f"table or non-temporary view `{name}` already exists" + ) + result = t.translate(child, scope, timecontext, **kwargs) + result.createOrReplaceTempView(name) + return result + + +@compiles(ops.SQLStringView) +def compile_sql_view(t, expr, scope, timecontext, **kwargs): + op = expr.op() + backend = op.child._find_backend() + result = backend._session.sql(op.query) + result.createOrReplaceTempView(op.name) + return result diff --git a/ibis/backends/pyspark/tests/conftest.py b/ibis/backends/pyspark/tests/conftest.py index 491675a7a0cb..7049f041f6e4 100644 --- a/ibis/backends/pyspark/tests/conftest.py +++ b/ibis/backends/pyspark/tests/conftest.py @@ -1,5 +1,7 @@ import os +import shutil from datetime import datetime, timezone +from pathlib import Path import numpy as np import pandas as pd @@ -115,7 +117,7 @@ def get_common_spark_testing_client(data_directory, connect): .repartition(num_partitions) .sort('playerID') ) - df_batting.createOrReplaceTempView('batting') + df_batting.write.saveAsTable("batting", format="parquet", mode="overwrite") df_awards_players = ( s.read.csv( @@ -217,6 +219,11 @@ class TestConf(BackendTest, RoundAwayFromZero): def connect(data_directory): return get_pyspark_testing_client(data_directory) + def cleanup(self): + (path,) = map(Path, ibis.__path__) + path = path.parent / "spark-warehouse" + shutil.rmtree(path, ignore_errors=True) + @pytest.fixture(scope='session') def client(data_directory): diff --git a/ibis/backends/tests/base.py b/ibis/backends/tests/base.py index b497234369b5..9aec04a8eb13 100644 --- a/ibis/backends/tests/base.py +++ b/ibis/backends/tests/base.py @@ -150,3 +150,6 @@ def make_context( self, params: Optional[Mapping[ir.ValueExpr, Any]] = None ): return self.api.compiler.make_context(params=params) + + def cleanup(self): + pass diff --git a/ibis/backends/tests/test_dot_sql.py b/ibis/backends/tests/test_dot_sql.py new file mode 100644 index 000000000000..ce39d212067d --- /dev/null +++ b/ibis/backends/tests/test_dot_sql.py @@ -0,0 +1,123 @@ +import pandas as pd +import pytest + +dot_sql_notimpl = pytest.mark.notimpl( + ["clickhouse", "datafusion", "impala", "sqlite"] +) +dot_sql_never = pytest.mark.never( + ["dask", "pandas"], + reason="dask and pandas do not accept SQL", +) + +pytestmark = pytest.mark.xdist_group("dot_sql") + + +@dot_sql_notimpl +@dot_sql_never +def test_dot_sql(backend, con): + alltypes = con.table("functional_alltypes") + t = ( + alltypes.sql( + """ + SELECT + string_col as s, + double_col + 1.0 AS new_col + FROM functional_alltypes + """ + ) + .group_by("s") # group by a column from SQL + .aggregate(fancy_af=lambda t: t.new_col.mean()) + .alias("awesome_t") # create a name for the aggregate + .sql("SELECT fancy_af AS yas FROM awesome_t ORDER BY fancy_af") + ) + + alltypes_df = alltypes.execute() + result = t.execute()["yas"] + expected = ( + alltypes_df.assign( + s=alltypes_df.string_col, new_col=alltypes_df.double_col + 1.0 + ) + .groupby("s") + .new_col.mean() + .rename("yas") + .reset_index() + .yas + ) + backend.assert_series_equal(result, expected) + + +@dot_sql_notimpl +@dot_sql_never +def test_dot_sql_with_join(backend, con): + alltypes = con.table("functional_alltypes") + t = ( + alltypes.sql( + """ + SELECT + string_col as s, + double_col + 1.0 AS new_col + FROM functional_alltypes + """ + ) + .alias("ft") + .group_by("s") # group by a column from SQL + .aggregate(fancy_af=lambda t: t.new_col.mean()) + .alias("awesome_t") # create a name for the aggregate + .sql( + """ + SELECT + l.fancy_af AS yas, + r.s + FROM awesome_t AS l + LEFT JOIN ft AS r + ON l.s = r.s + """ + ) + .sort_by(["s", "yas"]) + ) + + alltypes_df = alltypes.execute() + result = t.execute() + + ft = alltypes_df.assign( + s=alltypes_df.string_col, new_col=alltypes_df.double_col + 1.0 + ) + expected = pd.merge( + ft.groupby("s").new_col.mean().rename("yas").reset_index(), + ft[["s"]], + on=["s"], + how="left", + )[["yas", "s"]].sort_values(["s", "yas"]) + backend.assert_frame_equal(result, expected) + + +@dot_sql_notimpl +@dot_sql_never +def test_dot_sql_repr(con): + alltypes = con.table("functional_alltypes") + t = ( + alltypes.sql( + """ + SELECT + string_col as s, + double_col + 1.0 AS new_col + FROM functional_alltypes + """ + ) + .group_by("s") # group by a column from SQL + .aggregate(fancy_af=lambda t: t.new_col.mean()) + .alias("awesome_t") # create a name for the aggregate + .sql("SELECT fancy_af AS yas FROM awesome_t ORDER BY fancy_af") + ) + + assert repr(t) + + +@dot_sql_notimpl +@dot_sql_never +def test_dot_sql_does_not_clobber_existing_tables(con): + expr = con.table("functional_alltypes").sql( + "SELECT 1 as x FROM functional_alltypes" + ) + with pytest.raises(ValueError): + expr.alias("batting") diff --git a/ibis/expr/format.py b/ibis/expr/format.py index 41cfd0963136..a590de6455bb 100644 --- a/ibis/expr/format.py +++ b/ibis/expr/format.py @@ -138,7 +138,7 @@ def fmt_table_op(op: ops.TableNode, **_: Any) -> str: @fmt_table_op.register def _fmt_table_op_physical_table(op: ops.PhysicalTable, **_: Any) -> str: - top = f"{op.__class__.__name__}[{op.name}]" + top = f"{op.__class__.__name__}: {op.name}" formatted_schema = fmt_schema(op.schema) return f"{top}\n{formatted_schema}" @@ -188,6 +188,38 @@ def _fmt_table_op_sql_query_result(op: ops.SQLQueryResult, **_: Any) -> str: return f"{top}\n{util.indent(query, spaces=2)}\n{schema_field}" +@fmt_table_op.register +def _fmt_table_op_view(op: ops.View, *, aliases: Aliases, **_: Any) -> str: + top = op.__class__.__name__ + formatted_schema = fmt_schema(op.schema) + schema_field = util.indent(f"schema:\n{formatted_schema}", spaces=2) + return f"{top}[{aliases[op.child.op()]}]: {op.name}\n{schema_field}" + + +@fmt_table_op.register +def _fmt_table_op_sql_view( + op: ops.SQLStringView, + *, + aliases: Aliases, + **_: Any, +) -> str: + short_query = textwrap.shorten( + op.query, + ibis.options.repr.query_text_length, + placeholder=f" {util.HORIZONTAL_ELLIPSIS}", + ) + query = f"query: {short_query!r}" + top = op.__class__.__name__ + formatted_schema = fmt_schema(op.schema) + schema_field = util.indent(f"schema:\n{formatted_schema}", spaces=2) + components = [ + f"{top}[{aliases[op.child.op()]}]: {op.name}", + util.indent(query, spaces=2), + schema_field, + ] + return "\n".join(components) + + @functools.singledispatch def fmt_join(op: ops.Join, *, aliases: Aliases) -> tuple[str, str]: assert False, f"join type {type(op)} not implemented" diff --git a/ibis/expr/operations/relations.py b/ibis/expr/operations/relations.py index a3369816d9cd..e75d73162bd9 100644 --- a/ibis/expr/operations/relations.py +++ b/ibis/expr/operations/relations.py @@ -797,6 +797,32 @@ def schema(self): return self.table.schema() +@public +class View(PhysicalTable): + """A view created from an expression.""" + + child = rlz.table + name = rlz.instance_of(str) + + @property + def schema(self): + return self.child.schema() + + +@public +class SQLStringView(PhysicalTable): + """A view created from a SQL string.""" + + child = rlz.table + name = rlz.instance_of(str) + query = rlz.instance_of(str) + + @cached_property + def schema(self): + backend = self.child._find_backend() + return backend._get_schema_using_query(self.query) + + def _dedup_join_columns( expr: ir.TableExpr, *, diff --git a/ibis/expr/types/relations.py b/ibis/expr/types/relations.py index b2c7ed8bc5dd..aabc2bf6faa1 100644 --- a/ibis/expr/types/relations.py +++ b/ibis/expr/types/relations.py @@ -2,6 +2,7 @@ import collections import functools +import itertools import operator from typing import IO, TYPE_CHECKING, Any, Iterable, Literal, Mapping, Sequence @@ -22,6 +23,9 @@ from .groupby import GroupedTableExpr +_ALIASES = (f"_ibis_view_{n:d}" for n in itertools.count()) + + def _regular_join_method( name: str, how: Literal[ @@ -1124,6 +1128,26 @@ def prevent_rewrite(self, client=None) -> TableExpr: def materialize(self) -> TableExpr: return self + def alias(self, alias: str) -> ir.TableExpr: + import ibis.expr.operations as ops + + expr = ops.View(child=self, name=alias).to_expr() + + # NB: calling compile is necessary so that any temporary views are + # created so that we can infer the schema without executing the entire + # query + expr.compile() + return expr + + def sql(self, query: str) -> ir.TableExpr: + import ibis.expr.operations as ops + + return ops.SQLStringView( + child=self, + name=next(_ALIASES), + query=query, + ).to_expr() + def _resolve_predicates(table: TableExpr, predicates) -> list[ir.BooleanValue]: from .. import analysis as an diff --git a/ibis/tests/expr/test_format.py b/ibis/tests/expr/test_format.py index f22d22dd0f1b..4517006d2dfe 100644 --- a/ibis/tests/expr/test_format.py +++ b/ibis/tests/expr/test_format.py @@ -35,7 +35,7 @@ def test_table_type_output(): result = repr(expr) assert 'SelfReference[r0]' in result - assert 'UnboundTable[foo]' in result + assert 'UnboundTable: foo' in result def test_aggregate_arg_names(table): @@ -193,7 +193,7 @@ def test_same_column_multiple_aliases(): expr = table[table.col.name('fakealias1'), table.col.name('fakealias2')] result = repr(expr) - assert "UnboundTable[t]" in result + assert "UnboundTable: t" in result assert "col int64" in result assert "fakealias1: r0.col" in result assert "fakealias2: r0.col" in result @@ -217,7 +217,7 @@ def test_repr_exact(): ).mutate(col4=lambda t: t.col2.length()) result = repr(table) expected = """\ -r0 := UnboundTable[t] +r0 := UnboundTable: t col int64 col2 string col3 float64