From 3ce261780b8ccbd50442e1e62d96790875e33f75 Mon Sep 17 00:00:00 2001 From: Jim Crist-Harif Date: Mon, 16 Oct 2023 15:05:27 -0500 Subject: [PATCH] feat(api): add `Table.sample` method for sampling rows from a table --- ibis/expr/format.py | 1 + ibis/expr/operations/relations.py | 18 ++++++- ibis/expr/types/relations.py | 74 +++++++++++++++++++++++++++++ ibis/tests/expr/test_value_exprs.py | 17 +++++++ 4 files changed, 108 insertions(+), 2 deletions(-) diff --git a/ibis/expr/format.py b/ibis/expr/format.py index 39504b818562..2c4028214b3f 100644 --- a/ibis/expr/format.py +++ b/ibis/expr/format.py @@ -291,6 +291,7 @@ def _join(op, left, right, predicates, **kwargs): @fmt.register(ops.Limit) +@fmt.register(ops.Sample) def _limit(op, table, **kwargs): params = inline_args(kwargs) return f"{op.__class__.__name__}[{table}, {params}]" diff --git a/ibis/expr/operations/relations.py b/ibis/expr/operations/relations.py index b4740d63a151..3f06fabb3dbb 100644 --- a/ibis/expr/operations/relations.py +++ b/ibis/expr/operations/relations.py @@ -3,7 +3,7 @@ import abc import itertools from abc import abstractmethod -from typing import TYPE_CHECKING, Any, Literal, Optional +from typing import TYPE_CHECKING, Annotated, Any, Literal, Optional from typing import Union as UnionType from public import public @@ -17,7 +17,7 @@ from ibis.common.collections import FrozenDict # noqa: TCH001 from ibis.common.deferred import Deferred from ibis.common.grounds import Immutable -from ibis.common.patterns import Coercible, Eq +from ibis.common.patterns import Between, Coercible, Eq from ibis.common.typing import VarTuple # noqa: TCH001 from ibis.expr.operations.core import Column, Named, Node, Scalar, Value from ibis.expr.operations.sortkeys import SortKey # noqa: TCH001 @@ -580,6 +580,20 @@ def schema(self): return self.table.schema +@public +class Sample(Relation): + """Sample performs random sampling of records in a table.""" + + table: Relation + fraction: Annotated[float, Between(0, 1)] + 
method: Literal["row", "block"] = "row" + seed: UnionType[int, None] = None + + @attribute + def schema(self): + return self.table.schema + + # TODO(kszucs): split it into two operations, one working with a single replacement # value and the other with a mapping # TODO(kszucs): the single value case was limited to numeric and string types diff --git a/ibis/expr/types/relations.py b/ibis/expr/types/relations.py index 076b4d7058b5..f183037b0ae2 100644 --- a/ibis/expr/types/relations.py +++ b/ibis/expr/types/relations.py @@ -1191,6 +1191,80 @@ def distinct( return res.select(self.columns) return res + def sample( + self, + fraction: float, + *, + method: Literal["row", "block"] = "row", + seed: int | None = None, + ) -> Table: + """Sample a fraction of rows from a table. + + Parameters + ---------- + fraction + The fraction of rows to include in the sample, expressed as a + float between 0 and 1. + method + The sampling method to use. The default is "row", which includes + each row with a probability of ``fraction``. If method is "block", + some backends may instead sample a fraction of blocks of + rows (where "block" is a backend-dependent definition). This is + identical to "row" for backends lacking a blockwise sampling + implementation. For those coming from SQL, "row" and "block" + correspond to "bernoulli" and "system" respectively in a + TABLESAMPLE clause. + seed + An optional random seed to use, for repeatable sampling. Backends + that never support specifying a seed for repeatable sampling will + error appropriately. Note that some backends (like DuckDB) do + support specifying a seed, but may still not have repeatable + results in all cases. + + Returns + ------- + Table + The input table, with `fraction` of rows selected. 
+ + Examples + -------- + >>> import ibis + >>> ibis.options.interactive = True + >>> t = ibis.memtable({"x": [1, 2, 3, 4], "y": ["a", "b", "c", "d"]}) + >>> t + ┏━━━━━━━┳━━━━━━━━┓ + ┃ x ┃ y ┃ + ┡━━━━━━━╇━━━━━━━━┩ + │ int64 │ string │ + ├───────┼────────┤ + │ 1 │ a │ + │ 2 │ b │ + │ 3 │ c │ + │ 4 │ d │ + └───────┴────────┘ + + Sample approximately half the rows, with a seed specified for + reproducibility. + + >>> t.sample(0.5, seed=1234) + ┏━━━━━━━┳━━━━━━━━┓ + ┃ x ┃ y ┃ + ┡━━━━━━━╇━━━━━━━━┩ + │ int64 │ string │ + ├───────┼────────┤ + │ 2 │ b │ + │ 3 │ c │ + └───────┴────────┘ + """ + if fraction == 1: + return self + elif fraction == 0: + return self.limit(0) + else: + return ops.Sample( + self, fraction=fraction, method=method, seed=seed + ).to_expr() + def limit(self, n: int | None, offset: int = 0) -> Table: """Select `n` rows from `self` starting at `offset`. diff --git a/ibis/tests/expr/test_value_exprs.py b/ibis/tests/expr/test_value_exprs.py index 506b9c293456..69ee3b873705 100644 --- a/ibis/tests/expr/test_value_exprs.py +++ b/ibis/tests/expr/test_value_exprs.py @@ -1666,3 +1666,20 @@ def test_quantile_shape(): (b1,) = expr.op().selections assert b1.shape.is_columnar() + + +def test_sample(): + t = ibis.table({"x": "int64", "y": "string"}) + + expr = t.sample(1) + assert expr.equals(t) + + expr = t.sample(0) + assert expr.equals(t.limit(0)) + + expr = t.sample(0.5, method="block", seed=1234) + assert expr.schema() == t.schema() + op = expr.op() + assert op.fraction == 0.5 + assert op.method == "block" + assert op.seed == 1234