Skip to content

Commit

Permalink
feat(bigquery, impala, mssql, oracle, postgres): compile `Table.sample` to native `TABLESAMPLE` syntax when possible
Browse files Browse the repository at this point in the history
  • Loading branch information
jcrist committed Sep 24, 2024
1 parent b95a036 commit 321a3b5
Show file tree
Hide file tree
Showing 73 changed files with 527 additions and 14 deletions.
2 changes: 1 addition & 1 deletion ibis/backends/sql/compilers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ class SQLGlotCompiler(abc.ABC):
LOWERED_OPS: dict[type[ops.Node], pats.Replace | None] = {
ops.Bucket: lower_bucket,
ops.Capitalize: lower_capitalize,
ops.Sample: lower_sample(supports_methods=()),
ops.Sample: lower_sample(supported_methods=()),
ops.StringSlice: lower_stringslice,
}
"""A mapping from an operation class to either a rewrite rule for rewriting that
Expand Down
9 changes: 9 additions & 0 deletions ibis/backends/sql/compilers/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
exclude_unsupported_window_frame_from_ops,
exclude_unsupported_window_frame_from_rank,
exclude_unsupported_window_frame_from_row_number,
lower_sample,
split_select_distinct_with_order_by,
)
from ibis.common.temporal import DateUnit, IntervalUnit, TimestampUnit, TimeUnit
Expand Down Expand Up @@ -118,6 +119,14 @@ class BigQueryCompiler(SQLGlotCompiler):

supports_qualify = True

LOWERED_OPS = {
ops.Sample: lower_sample(
supported_methods=("block",),
supports_seed=False,
physical_tables_only=True,
),
}

UNSUPPORTED_OPS = (
ops.DateDiff,
ops.ExtractAuthority,
Expand Down
1 change: 1 addition & 0 deletions ibis/backends/sql/compilers/druid.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ class DruidCompiler(SQLGlotCompiler):
ops.TypeOf,
ops.Unnest,
ops.Variance,
ops.Sample,
)

SIMPLE_OPS = {
Expand Down
8 changes: 7 additions & 1 deletion ibis/backends/sql/compilers/impala.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from ibis.backends.sql.compilers.base import NULL, STAR, SQLGlotCompiler
from ibis.backends.sql.datatypes import ImpalaType
from ibis.backends.sql.dialects import Impala
from ibis.backends.sql.rewrites import rewrite_empty_order_by_window
from ibis.backends.sql.rewrites import lower_sample, rewrite_empty_order_by_window


class ImpalaCompiler(SQLGlotCompiler):
Expand All @@ -23,6 +23,12 @@ class ImpalaCompiler(SQLGlotCompiler):
*SQLGlotCompiler.rewrites,
)

LOWERED_OPS = {
ops.Sample: lower_sample(
supported_methods=("block",), physical_tables_only=True
),
}

UNSUPPORTED_OPS = (
ops.ArgMax,
ops.ArgMin,
Expand Down
7 changes: 7 additions & 0 deletions ibis/backends/sql/compilers/mssql.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from ibis.backends.sql.rewrites import (
exclude_unsupported_window_frame_from_ops,
exclude_unsupported_window_frame_from_row_number,
lower_sample,
p,
replace,
split_select_distinct_with_order_by,
Expand Down Expand Up @@ -73,6 +74,12 @@ class MSSQLCompiler(SQLGlotCompiler):
post_rewrites = (split_select_distinct_with_order_by,)
copy_func_args = True

LOWERED_OPS = {
ops.Sample: lower_sample(
supported_methods=("block",), physical_tables_only=True
),
}

UNSUPPORTED_OPS = (
ops.ApproxMedian,
ops.ArgMax,
Expand Down
2 changes: 2 additions & 0 deletions ibis/backends/sql/compilers/oracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
exclude_unsupported_window_frame_from_row_number,
lower_log2,
lower_log10,
lower_sample,
rewrite_empty_order_by_window,
)

Expand Down Expand Up @@ -46,6 +47,7 @@ class OracleCompiler(SQLGlotCompiler):
LOWERED_OPS = {
ops.Log2: lower_log2,
ops.Log10: lower_log10,
ops.Sample: lower_sample(physical_tables_only=True),
}

UNSUPPORTED_OPS = (
Expand Down
4 changes: 3 additions & 1 deletion ibis/backends/sql/compilers/postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from ibis.backends.sql.compilers.base import NULL, STAR, AggGen, SQLGlotCompiler
from ibis.backends.sql.datatypes import PostgresType
from ibis.backends.sql.dialects import Postgres
from ibis.backends.sql.rewrites import split_select_distinct_with_order_by
from ibis.backends.sql.rewrites import lower_sample, split_select_distinct_with_order_by
from ibis.common.exceptions import InvalidDecoratorError
from ibis.util import gen_name

Expand Down Expand Up @@ -50,6 +50,8 @@ class PostgresCompiler(SQLGlotCompiler):
POS_INF = sge.Literal.number("'Inf'::double precision")
NEG_INF = sge.Literal.number("'-Inf'::double precision")

LOWERED_OPS = {ops.Sample: lower_sample(physical_tables_only=True)}

UNSUPPORTED_OPS = (
ops.RowID,
ops.TimeDelta,
Expand Down
1 change: 1 addition & 0 deletions ibis/backends/sql/compilers/risingwave.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ class RisingWaveCompiler(PostgresCompiler):
ops.RandomUUID,
ops.MultiQuantile,
ops.ApproxMultiQuantile,
ops.Sample,
*(
op
for op in ALL_OPERATIONS
Expand Down
12 changes: 12 additions & 0 deletions ibis/backends/sql/dialects.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,18 +307,30 @@ class Tokenizer(Hive.Tokenizer):
STRING_ESCAPES = ["'"]


def tablesample_percent_to_int(self, expr):
    """Render a TABLESAMPLE node with its percentage rounded to an integer.

    Impala's TABLESAMPLE only supports integer percentages, so the
    ``percent`` argument is rounded before SQL is generated. The rounding
    is applied to a copy of ``expr``; the node passed in is not modified.
    """
    node = expr.copy()
    whole_percent = round(float(node.args["percent"].this))
    node.args["percent"] = sge.convert(whole_percent)
    return self.tablesample_sql(node)


# Impala dialect: extends the Hive dialect with Impala-specific settings,
# function renames, and TABLESAMPLE rendering.
class Impala(Hive):
NULL_ORDERING = "nulls_are_large"
REGEXP_EXTRACT_DEFAULT_GROUP = 0
# NOTE(review): the two TABLESAMPLE settings below are presumably sqlglot
# dialect flags (sample size read as a percentage; alias placed before the
# TABLESAMPLE clause rather than after it) -- confirm against sqlglot's docs.
TABLESAMPLE_SIZE_IS_PERCENT = True
ALIAS_POST_TABLESAMPLE = False

class Generator(Hive.Generator):
# NOTE(review): presumably makes the generator emit the sampling method
# name as part of the TABLESAMPLE clause -- confirm against sqlglot's docs.
TABLESAMPLE_WITH_METHOD = True

# Hive's transforms plus the Impala-specific overrides listed below.
TRANSFORMS = Hive.Generator.TRANSFORMS.copy() | {
sge.ApproxDistinct: rename_func("ndv"),
sge.IsNan: rename_func("is_nan"),
sge.IsInf: rename_func("is_inf"),
sge.DayOfWeek: rename_func("dayofweek"),
sge.Interval: lambda self, e: _interval(self, e, quote_arg=False),
sge.CurrentDate: rename_func("current_date"),
# Impala's TABLESAMPLE only supports integer percentages, so the
# percentage is rounded to an integer before the clause is rendered.
sge.TableSample: tablesample_percent_to_int,
}


Expand Down
6 changes: 3 additions & 3 deletions ibis/backends/sql/rewrites.py
Original file line number Diff line number Diff line change
Expand Up @@ -593,7 +593,7 @@ def lower_capitalize(_, **kwargs):


def lower_sample(
supports_methods=("row", "block"),
supported_methods=("row", "block"),
supports_seed=True,
physical_tables_only=False,
):
Expand All @@ -605,7 +605,7 @@ def lower_sample(
Parameters
----------
supports_methods
supported_methods
The sampling methods supported by the backend's native TABLESAMPLE operation.
supports_seed
Whether the backend's native TABLESAMPLE supports setting a `seed`.
Expand All @@ -616,7 +616,7 @@ def lower_sample(
@replace(p.Sample)
def lower(_, **kwargs):
if (
(_.method not in supports_methods)
_.method not in supported_methods
or (_.seed is not None and not supports_seed)
or (
physical_tables_only
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
SELECT
*
FROM `test` AS `t0` TABLESAMPLE system (50.0 PERCENT)
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT
*
FROM `test` AS `t0`
WHERE
RAND() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
*
FROM (
SELECT
*
FROM `test` AS `t0`
WHERE
`t0`.`x` > 10
) AS `t1`
WHERE
RAND() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
*
FROM (
SELECT
*
FROM `test` AS `t0`
WHERE
`t0`.`x` > 10
) AS `t1`
WHERE
RAND() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT
*
FROM "test" AS "t0"
WHERE
randCanonical() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT
*
FROM "test" AS "t0"
WHERE
randCanonical() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
*
FROM (
SELECT
*
FROM "test" AS "t0"
WHERE
"t0"."x" > 10
) AS "t1"
WHERE
randCanonical() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
*
FROM (
SELECT
*
FROM "test" AS "t0"
WHERE
"t0"."x" > 10
) AS "t1"
WHERE
randCanonical() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT
*
FROM "test" AS "t0"
WHERE
RANDOM() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT
*
FROM "test" AS "t0"
WHERE
RANDOM() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
*
FROM (
SELECT
*
FROM "test" AS "t0"
WHERE
"t0"."x" > 10
) AS "t1"
WHERE
RANDOM() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
*
FROM (
SELECT
*
FROM "test" AS "t0"
WHERE
"t0"."x" > 10
) AS "t1"
WHERE
RANDOM() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
SELECT
*
FROM "test" AS "t0" TABLESAMPLE system (50.0 PERCENT)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
SELECT
*
FROM "test" AS "t0" TABLESAMPLE bernoulli (50.0 PERCENT)
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
SELECT
*
FROM (
SELECT
*
FROM "test" AS "t0"
WHERE
"t0"."x" > 10
) AS "t1" TABLESAMPLE system (50.0 PERCENT)
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
SELECT
*
FROM (
SELECT
*
FROM "test" AS "t0"
WHERE
"t0"."x" > 10
) AS "t1" TABLESAMPLE bernoulli (50.0 PERCENT)
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT
*
FROM "test" AS "t0"
WHERE
RANDOM() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT
*
FROM "test" AS "t0"
WHERE
RANDOM() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
*
FROM (
SELECT
*
FROM "test" AS "t0"
WHERE
"t0"."x" > 10
) AS "t1"
WHERE
RANDOM() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
*
FROM (
SELECT
*
FROM "test" AS "t0"
WHERE
"t0"."x" > 10
) AS "t1"
WHERE
RANDOM() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT
*
FROM `test` AS `t0`
WHERE
RAND() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT
*
FROM `test` AS `t0`
WHERE
RAND() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
*
FROM (
SELECT
*
FROM `test` AS `t0`
WHERE
`t0`.`x` > 10
) AS `t1`
WHERE
RAND() <= 0.5
Loading

0 comments on commit 321a3b5

Please sign in to comment.