diff --git a/ibis/backends/sql/compilers/base.py b/ibis/backends/sql/compilers/base.py index 190f9e533879..921a568d3041 100644 --- a/ibis/backends/sql/compilers/base.py +++ b/ibis/backends/sql/compilers/base.py @@ -293,7 +293,7 @@ class SQLGlotCompiler(abc.ABC): LOWERED_OPS: dict[type[ops.Node], pats.Replace | None] = { ops.Bucket: lower_bucket, ops.Capitalize: lower_capitalize, - ops.Sample: lower_sample(supports_methods=()), + ops.Sample: lower_sample(supported_methods=()), ops.StringSlice: lower_stringslice, } """A mapping from an operation class to either a rewrite rule for rewriting that diff --git a/ibis/backends/sql/compilers/bigquery/__init__.py b/ibis/backends/sql/compilers/bigquery/__init__.py index 0e8f3a7d3017..19cd463ab1a2 100644 --- a/ibis/backends/sql/compilers/bigquery/__init__.py +++ b/ibis/backends/sql/compilers/bigquery/__init__.py @@ -22,6 +22,7 @@ exclude_unsupported_window_frame_from_ops, exclude_unsupported_window_frame_from_rank, exclude_unsupported_window_frame_from_row_number, + lower_sample, split_select_distinct_with_order_by, ) from ibis.common.temporal import DateUnit, IntervalUnit, TimestampUnit, TimeUnit @@ -118,6 +119,14 @@ class BigQueryCompiler(SQLGlotCompiler): supports_qualify = True + LOWERED_OPS = { + ops.Sample: lower_sample( + supported_methods=("block",), + supports_seed=False, + physical_tables_only=True, + ), + } + UNSUPPORTED_OPS = ( ops.DateDiff, ops.ExtractAuthority, diff --git a/ibis/backends/sql/compilers/druid.py b/ibis/backends/sql/compilers/druid.py index 11a5bd536f62..4e2710b39992 100644 --- a/ibis/backends/sql/compilers/druid.py +++ b/ibis/backends/sql/compilers/druid.py @@ -65,6 +65,7 @@ class DruidCompiler(SQLGlotCompiler): ops.TypeOf, ops.Unnest, ops.Variance, + ops.Sample, ) SIMPLE_OPS = { diff --git a/ibis/backends/sql/compilers/impala.py b/ibis/backends/sql/compilers/impala.py index a21d89c3872b..06269cc4b0de 100644 --- a/ibis/backends/sql/compilers/impala.py +++ b/ibis/backends/sql/compilers/impala.py @@ 
-10,7 +10,7 @@ from ibis.backends.sql.compilers.base import NULL, STAR, SQLGlotCompiler from ibis.backends.sql.datatypes import ImpalaType from ibis.backends.sql.dialects import Impala -from ibis.backends.sql.rewrites import rewrite_empty_order_by_window +from ibis.backends.sql.rewrites import lower_sample, rewrite_empty_order_by_window class ImpalaCompiler(SQLGlotCompiler): @@ -23,6 +23,12 @@ class ImpalaCompiler(SQLGlotCompiler): *SQLGlotCompiler.rewrites, ) + LOWERED_OPS = { + ops.Sample: lower_sample( + supported_methods=("block",), physical_tables_only=True + ), + } + UNSUPPORTED_OPS = ( ops.ArgMax, ops.ArgMin, diff --git a/ibis/backends/sql/compilers/mssql.py b/ibis/backends/sql/compilers/mssql.py index dbb0e3f9fe2c..900877c5117a 100644 --- a/ibis/backends/sql/compilers/mssql.py +++ b/ibis/backends/sql/compilers/mssql.py @@ -22,6 +22,7 @@ from ibis.backends.sql.rewrites import ( exclude_unsupported_window_frame_from_ops, exclude_unsupported_window_frame_from_row_number, + lower_sample, p, replace, split_select_distinct_with_order_by, @@ -73,6 +74,12 @@ class MSSQLCompiler(SQLGlotCompiler): post_rewrites = (split_select_distinct_with_order_by,) copy_func_args = True + LOWERED_OPS = { + ops.Sample: lower_sample( + supported_methods=("block",), physical_tables_only=True + ), + } + UNSUPPORTED_OPS = ( ops.ApproxMedian, ops.ArgMax, diff --git a/ibis/backends/sql/compilers/oracle.py b/ibis/backends/sql/compilers/oracle.py index faee258b10e4..737a7515001a 100644 --- a/ibis/backends/sql/compilers/oracle.py +++ b/ibis/backends/sql/compilers/oracle.py @@ -16,6 +16,7 @@ exclude_unsupported_window_frame_from_row_number, lower_log2, lower_log10, + lower_sample, rewrite_empty_order_by_window, ) @@ -46,6 +47,7 @@ class OracleCompiler(SQLGlotCompiler): LOWERED_OPS = { ops.Log2: lower_log2, ops.Log10: lower_log10, + ops.Sample: lower_sample(physical_tables_only=True), } UNSUPPORTED_OPS = ( diff --git a/ibis/backends/sql/compilers/postgres.py 
b/ibis/backends/sql/compilers/postgres.py index 7197d6fd03df..ec9f4d374066 100644 --- a/ibis/backends/sql/compilers/postgres.py +++ b/ibis/backends/sql/compilers/postgres.py @@ -17,7 +17,7 @@ from ibis.backends.sql.compilers.base import NULL, STAR, AggGen, SQLGlotCompiler from ibis.backends.sql.datatypes import PostgresType from ibis.backends.sql.dialects import Postgres -from ibis.backends.sql.rewrites import split_select_distinct_with_order_by +from ibis.backends.sql.rewrites import lower_sample, split_select_distinct_with_order_by from ibis.common.exceptions import InvalidDecoratorError from ibis.util import gen_name @@ -50,6 +50,8 @@ class PostgresCompiler(SQLGlotCompiler): POS_INF = sge.Literal.number("'Inf'::double precision") NEG_INF = sge.Literal.number("'-Inf'::double precision") + LOWERED_OPS = {ops.Sample: lower_sample(physical_tables_only=True)} + UNSUPPORTED_OPS = ( ops.RowID, ops.TimeDelta, diff --git a/ibis/backends/sql/compilers/risingwave.py b/ibis/backends/sql/compilers/risingwave.py index 73f013eeb96a..c4baf94d6723 100644 --- a/ibis/backends/sql/compilers/risingwave.py +++ b/ibis/backends/sql/compilers/risingwave.py @@ -26,6 +26,7 @@ class RisingWaveCompiler(PostgresCompiler): ops.RandomUUID, ops.MultiQuantile, ops.ApproxMultiQuantile, + ops.Sample, *( op for op in ALL_OPERATIONS diff --git a/ibis/backends/sql/dialects.py b/ibis/backends/sql/dialects.py index f7a2eb38dca9..8f62c3c57f59 100644 --- a/ibis/backends/sql/dialects.py +++ b/ibis/backends/sql/dialects.py @@ -307,11 +307,22 @@ class Tokenizer(Hive.Tokenizer): STRING_ESCAPES = ["'"] +def tablesample_percent_to_int(self, expr): + """Impala's TABLESAMPLE only supports integer percentages.""" + expr = expr.copy() + expr.args["percent"] = sge.convert(round(float(expr.args["percent"].this))) + return self.tablesample_sql(expr) + + class Impala(Hive): NULL_ORDERING = "nulls_are_large" REGEXP_EXTRACT_DEFAULT_GROUP = 0 + TABLESAMPLE_SIZE_IS_PERCENT = True + ALIAS_POST_TABLESAMPLE = False class 
Generator(Hive.Generator): + TABLESAMPLE_WITH_METHOD = True + TRANSFORMS = Hive.Generator.TRANSFORMS.copy() | { sge.ApproxDistinct: rename_func("ndv"), sge.IsNan: rename_func("is_nan"), @@ -319,6 +330,7 @@ class Generator(Hive.Generator): sge.DayOfWeek: rename_func("dayofweek"), sge.Interval: lambda self, e: _interval(self, e, quote_arg=False), sge.CurrentDate: rename_func("current_date"), + sge.TableSample: tablesample_percent_to_int, } diff --git a/ibis/backends/sql/rewrites.py b/ibis/backends/sql/rewrites.py index 2f455893d528..ae4a6d80961c 100644 --- a/ibis/backends/sql/rewrites.py +++ b/ibis/backends/sql/rewrites.py @@ -593,7 +593,7 @@ def lower_capitalize(_, **kwargs): def lower_sample( - supports_methods=("row", "block"), + supported_methods=("row", "block"), supports_seed=True, physical_tables_only=False, ): @@ -605,7 +605,7 @@ def lower_sample( Parameters ---------- - supports_methods + supported_methods The sampling methods supported by the backend's native TABLESAMPLE operation. supports_seed Whether the backend's native TABLESAMPLE supports setting a `seed`. 
@@ -616,7 +616,7 @@ def lower_sample( @replace(p.Sample) def lower(_, **kwargs): if ( - (_.method not in supports_methods) + _.method not in supported_methods or (_.seed is not None and not supports_seed) or ( physical_tables_only diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/bigquery-subquery/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/bigquery-subquery/block.sql new file mode 100644 index 000000000000..0f6cc00d35ee --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/bigquery-subquery/block.sql @@ -0,0 +1,3 @@ +SELECT + * +FROM `test` AS `t0` TABLESAMPLE system (50.0 PERCENT) \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/bigquery-subquery/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/bigquery-subquery/row.sql new file mode 100644 index 000000000000..41fafb2da62d --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/bigquery-subquery/row.sql @@ -0,0 +1,5 @@ +SELECT + * +FROM `test` AS `t0` +WHERE + RAND() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/bigquery-table/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/bigquery-table/block.sql new file mode 100644 index 000000000000..0e8e7838e323 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/bigquery-table/block.sql @@ -0,0 +1,11 @@ +SELECT + * +FROM ( + SELECT + * + FROM `test` AS `t0` + WHERE + `t0`.`x` > 10 +) AS `t1` +WHERE + RAND() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/bigquery-table/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/bigquery-table/row.sql new file mode 100644 index 000000000000..0e8e7838e323 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/bigquery-table/row.sql @@ -0,0 +1,11 @@ +SELECT + * +FROM ( + SELECT + * + FROM `test` AS `t0` + WHERE + `t0`.`x` > 10 +) AS `t1` +WHERE + RAND() <= 
0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/clickhouse-subquery/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/clickhouse-subquery/block.sql new file mode 100644 index 000000000000..93ab234de3b5 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/clickhouse-subquery/block.sql @@ -0,0 +1,5 @@ +SELECT + * +FROM "test" AS "t0" +WHERE + randCanonical() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/clickhouse-subquery/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/clickhouse-subquery/row.sql new file mode 100644 index 000000000000..93ab234de3b5 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/clickhouse-subquery/row.sql @@ -0,0 +1,5 @@ +SELECT + * +FROM "test" AS "t0" +WHERE + randCanonical() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/clickhouse-table/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/clickhouse-table/block.sql new file mode 100644 index 000000000000..7f4590759a56 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/clickhouse-table/block.sql @@ -0,0 +1,11 @@ +SELECT + * +FROM ( + SELECT + * + FROM "test" AS "t0" + WHERE + "t0"."x" > 10 +) AS "t1" +WHERE + randCanonical() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/clickhouse-table/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/clickhouse-table/row.sql new file mode 100644 index 000000000000..7f4590759a56 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/clickhouse-table/row.sql @@ -0,0 +1,11 @@ +SELECT + * +FROM ( + SELECT + * + FROM "test" AS "t0" + WHERE + "t0"."x" > 10 +) AS "t1" +WHERE + randCanonical() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/datafusion-subquery/block.sql 
b/ibis/backends/tests/snapshots/test_sql/test_sample/datafusion-subquery/block.sql new file mode 100644 index 000000000000..f885113afef8 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/datafusion-subquery/block.sql @@ -0,0 +1,5 @@ +SELECT + * +FROM "test" AS "t0" +WHERE + RANDOM() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/datafusion-subquery/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/datafusion-subquery/row.sql new file mode 100644 index 000000000000..f885113afef8 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/datafusion-subquery/row.sql @@ -0,0 +1,5 @@ +SELECT + * +FROM "test" AS "t0" +WHERE + RANDOM() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/datafusion-table/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/datafusion-table/block.sql new file mode 100644 index 000000000000..0307d641ffea --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/datafusion-table/block.sql @@ -0,0 +1,11 @@ +SELECT + * +FROM ( + SELECT + * + FROM "test" AS "t0" + WHERE + "t0"."x" > 10 +) AS "t1" +WHERE + RANDOM() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/datafusion-table/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/datafusion-table/row.sql new file mode 100644 index 000000000000..0307d641ffea --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/datafusion-table/row.sql @@ -0,0 +1,11 @@ +SELECT + * +FROM ( + SELECT + * + FROM "test" AS "t0" + WHERE + "t0"."x" > 10 +) AS "t1" +WHERE + RANDOM() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/duckdb-subquery/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/duckdb-subquery/block.sql new file mode 100644 index 000000000000..95f441e59611 --- /dev/null +++ 
b/ibis/backends/tests/snapshots/test_sql/test_sample/duckdb-subquery/block.sql @@ -0,0 +1,3 @@ +SELECT + * +FROM "test" AS "t0" TABLESAMPLE system (50.0 PERCENT) \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/duckdb-subquery/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/duckdb-subquery/row.sql new file mode 100644 index 000000000000..e22aab901495 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/duckdb-subquery/row.sql @@ -0,0 +1,3 @@ +SELECT + * +FROM "test" AS "t0" TABLESAMPLE bernoulli (50.0 PERCENT) \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/duckdb-table/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/duckdb-table/block.sql new file mode 100644 index 000000000000..5b495ba2d0f1 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/duckdb-table/block.sql @@ -0,0 +1,9 @@ +SELECT + * +FROM ( + SELECT + * + FROM "test" AS "t0" + WHERE + "t0"."x" > 10 +) AS "t1" TABLESAMPLE system (50.0 PERCENT) \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/duckdb-table/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/duckdb-table/row.sql new file mode 100644 index 000000000000..0542a60cfdec --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/duckdb-table/row.sql @@ -0,0 +1,9 @@ +SELECT + * +FROM ( + SELECT + * + FROM "test" AS "t0" + WHERE + "t0"."x" > 10 +) AS "t1" TABLESAMPLE bernoulli (50.0 PERCENT) \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/exasol-subquery/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/exasol-subquery/block.sql new file mode 100644 index 000000000000..f885113afef8 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/exasol-subquery/block.sql @@ -0,0 +1,5 @@ +SELECT + * +FROM "test" AS "t0" +WHERE + RANDOM() <= 0.5 \ No newline at end of file diff 
--git a/ibis/backends/tests/snapshots/test_sql/test_sample/exasol-subquery/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/exasol-subquery/row.sql new file mode 100644 index 000000000000..f885113afef8 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/exasol-subquery/row.sql @@ -0,0 +1,5 @@ +SELECT + * +FROM "test" AS "t0" +WHERE + RANDOM() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/exasol-table/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/exasol-table/block.sql new file mode 100644 index 000000000000..0307d641ffea --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/exasol-table/block.sql @@ -0,0 +1,11 @@ +SELECT + * +FROM ( + SELECT + * + FROM "test" AS "t0" + WHERE + "t0"."x" > 10 +) AS "t1" +WHERE + RANDOM() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/exasol-table/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/exasol-table/row.sql new file mode 100644 index 000000000000..0307d641ffea --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/exasol-table/row.sql @@ -0,0 +1,11 @@ +SELECT + * +FROM ( + SELECT + * + FROM "test" AS "t0" + WHERE + "t0"."x" > 10 +) AS "t1" +WHERE + RANDOM() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/flink-subquery/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/flink-subquery/block.sql new file mode 100644 index 000000000000..41fafb2da62d --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/flink-subquery/block.sql @@ -0,0 +1,5 @@ +SELECT + * +FROM `test` AS `t0` +WHERE + RAND() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/flink-subquery/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/flink-subquery/row.sql new file mode 100644 index 000000000000..41fafb2da62d --- /dev/null +++ 
b/ibis/backends/tests/snapshots/test_sql/test_sample/flink-subquery/row.sql @@ -0,0 +1,5 @@ +SELECT + * +FROM `test` AS `t0` +WHERE + RAND() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/flink-table/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/flink-table/block.sql new file mode 100644 index 000000000000..0e8e7838e323 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/flink-table/block.sql @@ -0,0 +1,11 @@ +SELECT + * +FROM ( + SELECT + * + FROM `test` AS `t0` + WHERE + `t0`.`x` > 10 +) AS `t1` +WHERE + RAND() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/flink-table/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/flink-table/row.sql new file mode 100644 index 000000000000..0e8e7838e323 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/flink-table/row.sql @@ -0,0 +1,11 @@ +SELECT + * +FROM ( + SELECT + * + FROM `test` AS `t0` + WHERE + `t0`.`x` > 10 +) AS `t1` +WHERE + RAND() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/impala-subquery/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/impala-subquery/block.sql new file mode 100644 index 000000000000..83bd5d40d36d --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/impala-subquery/block.sql @@ -0,0 +1,3 @@ +SELECT + * +FROM `test` AS `t0` TABLESAMPLE system (50) \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/impala-subquery/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/impala-subquery/row.sql new file mode 100644 index 000000000000..48e9d653b370 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/impala-subquery/row.sql @@ -0,0 +1,5 @@ +SELECT + * +FROM `test` AS `t0` +WHERE + RAND(UTC_TO_UNIX_MICROS(UTC_TIMESTAMP())) <= 0.5 \ No newline at end of file diff --git 
a/ibis/backends/tests/snapshots/test_sql/test_sample/impala-table/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/impala-table/block.sql new file mode 100644 index 000000000000..5b84da7919ec --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/impala-table/block.sql @@ -0,0 +1,11 @@ +SELECT + * +FROM ( + SELECT + * + FROM `test` AS `t0` + WHERE + `t0`.`x` > 10 +) AS `t1` +WHERE + RAND(UTC_TO_UNIX_MICROS(UTC_TIMESTAMP())) <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/impala-table/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/impala-table/row.sql new file mode 100644 index 000000000000..5b84da7919ec --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/impala-table/row.sql @@ -0,0 +1,11 @@ +SELECT + * +FROM ( + SELECT + * + FROM `test` AS `t0` + WHERE + `t0`.`x` > 10 +) AS `t1` +WHERE + RAND(UTC_TO_UNIX_MICROS(UTC_TIMESTAMP())) <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/mssql-subquery/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/mssql-subquery/block.sql new file mode 100644 index 000000000000..5427966160cd --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/mssql-subquery/block.sql @@ -0,0 +1,3 @@ +SELECT + * +FROM [test] AS [t0] TABLESAMPLE system (50.0 PERCENT) \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/mssql-subquery/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/mssql-subquery/row.sql new file mode 100644 index 000000000000..1624a907034f --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/mssql-subquery/row.sql @@ -0,0 +1,6 @@ +SELECT + [t0].[x], + [t0].[y] +FROM [test] AS [t0] +WHERE + RAND(CHECKSUM(NEWID())) <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/mssql-table/block.sql 
b/ibis/backends/tests/snapshots/test_sql/test_sample/mssql-table/block.sql new file mode 100644 index 000000000000..a8b9fc180414 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/mssql-table/block.sql @@ -0,0 +1,13 @@ +SELECT + [t1].[x], + [t1].[y] +FROM ( + SELECT + [t0].[x], + [t0].[y] + FROM [test] AS [t0] + WHERE + [t0].[x] > 10 +) AS [t1] +WHERE + RAND(CHECKSUM(NEWID())) <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/mssql-table/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/mssql-table/row.sql new file mode 100644 index 000000000000..a8b9fc180414 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/mssql-table/row.sql @@ -0,0 +1,13 @@ +SELECT + [t1].[x], + [t1].[y] +FROM ( + SELECT + [t0].[x], + [t0].[y] + FROM [test] AS [t0] + WHERE + [t0].[x] > 10 +) AS [t1] +WHERE + RAND(CHECKSUM(NEWID())) <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/mysql-subquery/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/mysql-subquery/block.sql new file mode 100644 index 000000000000..41fafb2da62d --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/mysql-subquery/block.sql @@ -0,0 +1,5 @@ +SELECT + * +FROM `test` AS `t0` +WHERE + RAND() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/mysql-subquery/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/mysql-subquery/row.sql new file mode 100644 index 000000000000..41fafb2da62d --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/mysql-subquery/row.sql @@ -0,0 +1,5 @@ +SELECT + * +FROM `test` AS `t0` +WHERE + RAND() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/mysql-table/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/mysql-table/block.sql new file mode 100644 index 000000000000..0e8e7838e323 --- 
/dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/mysql-table/block.sql @@ -0,0 +1,11 @@ +SELECT + * +FROM ( + SELECT + * + FROM `test` AS `t0` + WHERE + `t0`.`x` > 10 +) AS `t1` +WHERE + RAND() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/mysql-table/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/mysql-table/row.sql new file mode 100644 index 000000000000..0e8e7838e323 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/mysql-table/row.sql @@ -0,0 +1,11 @@ +SELECT + * +FROM ( + SELECT + * + FROM `test` AS `t0` + WHERE + `t0`.`x` > 10 +) AS `t1` +WHERE + RAND() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/oracle-subquery/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/oracle-subquery/block.sql new file mode 100644 index 000000000000..36f81d7681a5 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/oracle-subquery/block.sql @@ -0,0 +1,3 @@ +SELECT + * +FROM "test" SAMPLE system (50.0) "t0" \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/oracle-subquery/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/oracle-subquery/row.sql new file mode 100644 index 000000000000..860e3a680338 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/oracle-subquery/row.sql @@ -0,0 +1,3 @@ +SELECT + * +FROM "test" SAMPLE bernoulli (50.0) "t0" \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/oracle-table/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/oracle-table/block.sql new file mode 100644 index 000000000000..7e8543314947 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/oracle-table/block.sql @@ -0,0 +1,11 @@ +SELECT + * +FROM ( + SELECT + * + FROM "test" "t0" + WHERE + "t0"."x" > 10 +) "t1" +WHERE + DBMS_RANDOM.VALUE() <= 0.5 \ No newline at 
end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/oracle-table/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/oracle-table/row.sql new file mode 100644 index 000000000000..7e8543314947 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/oracle-table/row.sql @@ -0,0 +1,11 @@ +SELECT + * +FROM ( + SELECT + * + FROM "test" "t0" + WHERE + "t0"."x" > 10 +) "t1" +WHERE + DBMS_RANDOM.VALUE() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/postgres-subquery/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/postgres-subquery/block.sql new file mode 100644 index 000000000000..27b5ab3d5046 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/postgres-subquery/block.sql @@ -0,0 +1,3 @@ +SELECT + * +FROM "test" AS "t0" TABLESAMPLE system (50.0) \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/postgres-subquery/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/postgres-subquery/row.sql new file mode 100644 index 000000000000..72f6ef978124 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/postgres-subquery/row.sql @@ -0,0 +1,3 @@ +SELECT + * +FROM "test" AS "t0" TABLESAMPLE bernoulli (50.0) \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/postgres-table/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/postgres-table/block.sql new file mode 100644 index 000000000000..0307d641ffea --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/postgres-table/block.sql @@ -0,0 +1,11 @@ +SELECT + * +FROM ( + SELECT + * + FROM "test" AS "t0" + WHERE + "t0"."x" > 10 +) AS "t1" +WHERE + RANDOM() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/postgres-table/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/postgres-table/row.sql new file mode 100644 
index 000000000000..0307d641ffea --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/postgres-table/row.sql @@ -0,0 +1,11 @@ +SELECT + * +FROM ( + SELECT + * + FROM "test" AS "t0" + WHERE + "t0"."x" > 10 +) AS "t1" +WHERE + RANDOM() <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/pyspark-subquery/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/pyspark-subquery/block.sql new file mode 100644 index 000000000000..c9099652a914 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/pyspark-subquery/block.sql @@ -0,0 +1,3 @@ +SELECT + * +FROM `test` TABLESAMPLE (50.0 PERCENT) AS `t0` \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/pyspark-subquery/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/pyspark-subquery/row.sql new file mode 100644 index 000000000000..c9099652a914 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/pyspark-subquery/row.sql @@ -0,0 +1,3 @@ +SELECT + * +FROM `test` TABLESAMPLE (50.0 PERCENT) AS `t0` \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/pyspark-table/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/pyspark-table/block.sql new file mode 100644 index 000000000000..27955a82792f --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/pyspark-table/block.sql @@ -0,0 +1,9 @@ +SELECT + * +FROM ( + SELECT + * + FROM `test` AS `t0` + WHERE + `t0`.`x` > 10 +) TABLESAMPLE (50.0 PERCENT) AS `t1` \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/pyspark-table/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/pyspark-table/row.sql new file mode 100644 index 000000000000..27955a82792f --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/pyspark-table/row.sql @@ -0,0 +1,9 @@ +SELECT + * +FROM ( + SELECT + * + FROM `test` AS `t0` + WHERE + 
`t0`.`x` > 10 +) TABLESAMPLE (50.0 PERCENT) AS `t1` \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/snowflake-subquery/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/snowflake-subquery/block.sql new file mode 100644 index 000000000000..27b5ab3d5046 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/snowflake-subquery/block.sql @@ -0,0 +1,3 @@ +SELECT + * +FROM "test" AS "t0" TABLESAMPLE system (50.0) \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/snowflake-subquery/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/snowflake-subquery/row.sql new file mode 100644 index 000000000000..72f6ef978124 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/snowflake-subquery/row.sql @@ -0,0 +1,3 @@ +SELECT + * +FROM "test" AS "t0" TABLESAMPLE bernoulli (50.0) \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/snowflake-table/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/snowflake-table/block.sql new file mode 100644 index 000000000000..2c9987d23ddf --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/snowflake-table/block.sql @@ -0,0 +1,9 @@ +SELECT + * +FROM ( + SELECT + * + FROM "test" AS "t0" + WHERE + "t0"."x" > 10 +) AS "t1" TABLESAMPLE system (50.0) \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/snowflake-table/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/snowflake-table/row.sql new file mode 100644 index 000000000000..38eb63631277 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/snowflake-table/row.sql @@ -0,0 +1,9 @@ +SELECT + * +FROM ( + SELECT + * + FROM "test" AS "t0" + WHERE + "t0"."x" > 10 +) AS "t1" TABLESAMPLE bernoulli (50.0) \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/sqlite-subquery/block.sql 
b/ibis/backends/tests/snapshots/test_sql/test_sample/sqlite-subquery/block.sql new file mode 100644 index 000000000000..929dbdeaf464 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/sqlite-subquery/block.sql @@ -0,0 +1,9 @@ +SELECT + * +FROM "test" AS "t0" +WHERE + ( + 0.5 + ( + CAST(RANDOM() AS REAL) / -1.8446744073709552e+19 + ) + ) <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/sqlite-subquery/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/sqlite-subquery/row.sql new file mode 100644 index 000000000000..929dbdeaf464 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/sqlite-subquery/row.sql @@ -0,0 +1,9 @@ +SELECT + * +FROM "test" AS "t0" +WHERE + ( + 0.5 + ( + CAST(RANDOM() AS REAL) / -1.8446744073709552e+19 + ) + ) <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/sqlite-table/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/sqlite-table/block.sql new file mode 100644 index 000000000000..d7cac89b0e38 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/sqlite-table/block.sql @@ -0,0 +1,15 @@ +SELECT + * +FROM ( + SELECT + * + FROM "test" AS "t0" + WHERE + "t0"."x" > 10 +) AS "t1" +WHERE + ( + 0.5 + ( + CAST(RANDOM() AS REAL) / -1.8446744073709552e+19 + ) + ) <= 0.5 \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/sqlite-table/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/sqlite-table/row.sql new file mode 100644 index 000000000000..d7cac89b0e38 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/sqlite-table/row.sql @@ -0,0 +1,15 @@ +SELECT + * +FROM ( + SELECT + * + FROM "test" AS "t0" + WHERE + "t0"."x" > 10 +) AS "t1" +WHERE + ( + 0.5 + ( + CAST(RANDOM() AS REAL) / -1.8446744073709552e+19 + ) + ) <= 0.5 \ No newline at end of file diff --git 
a/ibis/backends/tests/snapshots/test_sql/test_sample/trino-subquery/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/trino-subquery/block.sql new file mode 100644 index 000000000000..27b5ab3d5046 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/trino-subquery/block.sql @@ -0,0 +1,3 @@ +SELECT + * +FROM "test" AS "t0" TABLESAMPLE system (50.0) \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/trino-subquery/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/trino-subquery/row.sql new file mode 100644 index 000000000000..72f6ef978124 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/trino-subquery/row.sql @@ -0,0 +1,3 @@ +SELECT + * +FROM "test" AS "t0" TABLESAMPLE bernoulli (50.0) \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/trino-table/block.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/trino-table/block.sql new file mode 100644 index 000000000000..2c9987d23ddf --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/trino-table/block.sql @@ -0,0 +1,9 @@ +SELECT + * +FROM ( + SELECT + * + FROM "test" AS "t0" + WHERE + "t0"."x" > 10 +) AS "t1" TABLESAMPLE system (50.0) \ No newline at end of file diff --git a/ibis/backends/tests/snapshots/test_sql/test_sample/trino-table/row.sql b/ibis/backends/tests/snapshots/test_sql/test_sample/trino-table/row.sql new file mode 100644 index 000000000000..38eb63631277 --- /dev/null +++ b/ibis/backends/tests/snapshots/test_sql/test_sample/trino-table/row.sql @@ -0,0 +1,9 @@ +SELECT + * +FROM ( + SELECT + * + FROM "test" AS "t0" + WHERE + "t0"."x" > 10 +) AS "t1" TABLESAMPLE bernoulli (50.0) \ No newline at end of file diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py index 472aec7d54c2..714f987d76cb 100644 --- a/ibis/backends/tests/test_generic.py +++ b/ibis/backends/tests/test_generic.py @@ -2141,7 +2141,7 @@ def 
test_dynamic_table_slice_with_computed_offset(backend): ), ], ) -@pytest.mark.parametrize("subquery", [True, False]) +@pytest.mark.parametrize("subquery", [True, False], ids=["subquery", "table"]) @pytest.mark.xfail_version(pyspark=["sqlglot==25.17.0"]) def test_sample(backend, method, alltypes, subquery): if subquery: @@ -2175,7 +2175,6 @@ def test_sample_memtable(con, backend): "mysql", "oracle", "polars", - "postgres", "risingwave", "sqlite", "trino", diff --git a/ibis/backends/tests/test_sql.py b/ibis/backends/tests/test_sql.py index a36ad3da3d6e..9f94744cc29d 100644 --- a/ibis/backends/tests/test_sql.py +++ b/ibis/backends/tests/test_sql.py @@ -241,3 +241,21 @@ def test_rewrite_context(snapshot, backend_name): expr = table.select(new_col=ibis.ntile(2).over(order_by=ibis.random())).limit(10) result = ibis.to_sql(expr, dialect=backend_name) snapshot.assert_match(result, "out.sql") + + +@pytest.mark.parametrize("subquery", [False, True], ids=["subquery", "table"]) +@pytest.mark.parametrize("backend_name", _get_backends_to_test()) +@pytest.mark.notimpl(["polars"], raises=ValueError, reason="not a SQL backend") +@pytest.mark.notimpl( + ["druid", "risingwave"], + raises=exc.OperationNotDefinedError, + reason="sample not supported", +) +def test_sample(backend_name, snapshot, subquery): + t = ibis.table({"x": "int64", "y": "int64"}, name="test") + if subquery: + t = t.filter(t.x > 10) + block = ibis.to_sql(t.sample(0.5, method="block"), dialect=backend_name) + row = ibis.to_sql(t.sample(0.5, method="row"), dialect=backend_name) + snapshot.assert_match(block, "block.sql") + snapshot.assert_match(row, "row.sql") diff --git a/ibis/expr/types/relations.py b/ibis/expr/types/relations.py index 7e58417609ce..00450a44e837 100644 --- a/ibis/expr/types/relations.py +++ b/ibis/expr/types/relations.py @@ -1224,12 +1224,13 @@ def sample( method The sampling method to use. The default is "row", which includes each row with a probability of `fraction`. 
If method is "block", - some backends may instead perform sampling a fraction of blocks of - rows (where "block" is a backend dependent definition). This is - identical to "row" for backends lacking a blockwise sampling - implementation. For those coming from SQL, "row" and "block" - correspond to "bernoulli" and "system" respectively in a - TABLESAMPLE clause. + some backends may instead sample a fraction of blocks of rows + (where "block" is a backend-dependent definition), which may be + significantly more efficient (at the cost of a less statistically + random sample). This is identical to "row" for backends lacking a + blockwise sampling implementation. For those coming from SQL, "row" + and "block" correspond to "bernoulli" and "system" respectively in + a TABLESAMPLE clause. seed An optional random seed to use, for repeatable sampling. The range of possible seed values is backend specific (most support at least