Skip to content

Commit

Permalink
feat(api): add Table.sample method for sampling rows from a table
Browse files Browse the repository at this point in the history
  • Loading branch information
jcrist committed Oct 17, 2023
1 parent 8bde3e0 commit 3ce2617
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 2 deletions.
1 change: 1 addition & 0 deletions ibis/expr/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@ def _join(op, left, right, predicates, **kwargs):


@fmt.register(ops.Limit)
@fmt.register(ops.Sample)
def _limit(op, table, **kwargs):
    """Render a ``Limit`` or ``Sample`` node as ``ClassName[table, params]``.

    Parameters
    ----------
    op
        The operation being formatted; only its class name is used here.
    table
        Interpolated into the output as-is.
        NOTE(review): presumably the already-formatted parent table -- confirm
        against the dispatcher that calls this.
    **kwargs
        Remaining arguments, passed to ``inline_args`` to be rendered inline.

    Returns
    -------
    str
        The formatted representation of the node.
    """
    # A single formatter serves both node types: the class name is the only
    # part of the output that differs between them.
    rendered_args = inline_args(kwargs)
    node_name = op.__class__.__name__
    return f"{node_name}[{table}, {rendered_args}]"
Expand Down
18 changes: 16 additions & 2 deletions ibis/expr/operations/relations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import abc
import itertools
from abc import abstractmethod
from typing import TYPE_CHECKING, Any, Literal, Optional
from typing import TYPE_CHECKING, Annotated, Any, Literal, Optional
from typing import Union as UnionType

from public import public
Expand All @@ -17,7 +17,7 @@
from ibis.common.collections import FrozenDict # noqa: TCH001
from ibis.common.deferred import Deferred
from ibis.common.grounds import Immutable
from ibis.common.patterns import Coercible, Eq
from ibis.common.patterns import Between, Coercible, Eq
from ibis.common.typing import VarTuple # noqa: TCH001
from ibis.expr.operations.core import Column, Named, Node, Scalar, Value
from ibis.expr.operations.sortkeys import SortKey # noqa: TCH001
Expand Down Expand Up @@ -580,6 +580,20 @@ def schema(self):
return self.table.schema


@public
class Sample(Relation):
    """Sample performs random sampling of records in a table.

    Attributes
    ----------
    table
        The relation rows are sampled from.
    fraction
        A float validated against ``Between(0, 1)``.
    method
        Either ``"row"`` (the default) or ``"block"``.
    seed
        An optional integer seed; ``None`` (the default) when not given.
    """

    table: Relation
    fraction: Annotated[float, Between(0, 1)]
    method: Literal["row", "block"] = "row"
    seed: Optional[int] = None

    @attribute
    def schema(self):
        """Return the schema of the underlying table, unchanged."""
        return self.table.schema


# TODO(kszucs): split it into two operations, one working with a single replacement
# value and the other with a mapping
# TODO(kszucs): the single value case was limited to numeric and string types
Expand Down
74 changes: 74 additions & 0 deletions ibis/expr/types/relations.py
Original file line number Diff line number Diff line change
Expand Up @@ -1191,6 +1191,80 @@ def distinct(
return res.select(self.columns)
return res

def sample(
    self,
    fraction: float,
    *,
    method: Literal["row", "block"] = "row",
    seed: int | None = None,
) -> Table:
    """Sample a fraction of rows from a table.

    Parameters
    ----------
    fraction
        The percentage of rows to include in the sample, expressed as a
        float between 0 and 1.
    method
        The sampling method to use. The default is "row", which includes
        each row with a probability of ``fraction``. If method is "block",
        some backends may instead perform sampling a fraction of blocks of
        rows (where "block" is a backend dependent definition). This is
        identical to "row" for backends lacking a blockwise sampling
        implementation. For those coming from SQL, "row" and "block"
        correspond to "bernoulli" and "system" respectively in a
        TABLESAMPLE clause.
    seed
        An optional random seed to use, for repeatable sampling. Backends
        that never support specifying a seed for repeatable sampling will
        error appropriately. Note that some backends (like DuckDB) do
        support specifying a seed, but may still not have repeatable
        results in all cases.

    Returns
    -------
    Table
        The input table, with `fraction` of rows selected.

    Examples
    --------
    >>> import ibis
    >>> ibis.options.interactive = True
    >>> t = ibis.memtable({"x": [1, 2, 3, 4], "y": ["a", "b", "c", "d"]})
    >>> t
    ┏━━━━━━━┳━━━━━━━━┓
    ┃ x     ┃ y      ┃
    ┡━━━━━━━╇━━━━━━━━┩
    │ int64 │ string │
    ├───────┼────────┤
    │     1 │ a      │
    │     2 │ b      │
    │     3 │ c      │
    │     4 │ d      │
    └───────┴────────┘

    Sample approximately half the rows, with a seed specified for
    reproducibility.

    >>> t.sample(0.5, seed=1234)
    ┏━━━━━━━┳━━━━━━━━┓
    ┃ x     ┃ y      ┃
    ┡━━━━━━━╇━━━━━━━━┩
    │ int64 │ string │
    ├───────┼────────┤
    │     2 │ b      │
    │     3 │ c      │
    └───────┴────────┘
    """
    # Degenerate fractions need no Sample node: keeping every row is the
    # table itself, and keeping no rows is an empty limit.
    if fraction == 1:
        return self
    if fraction == 0:
        return self.limit(0)

    # Every other value is handed to the Sample operation together with the
    # method and seed.
    sample_op = ops.Sample(self, fraction=fraction, method=method, seed=seed)
    return sample_op.to_expr()

def limit(self, n: int | None, offset: int = 0) -> Table:
"""Select `n` rows from `self` starting at `offset`.
Expand Down
17 changes: 17 additions & 0 deletions ibis/tests/expr/test_value_exprs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1666,3 +1666,20 @@ def test_quantile_shape():
(b1,) = expr.op().selections

assert b1.shape.is_columnar()


def test_sample():
    """Check the short-circuit paths and the op built by ``Table.sample``."""
    t = ibis.table({"x": "int64", "y": "string"})

    # Sampling everything gives back an expression equal to the input table.
    everything = t.sample(1)
    assert everything.equals(t)

    # Sampling nothing is expressed as an empty limit.
    nothing = t.sample(0)
    assert nothing.equals(t.limit(0))

    # Any other fraction yields an op carrying the arguments through
    # unchanged, with the schema of the source table.
    sampled = t.sample(0.5, method="block", seed=1234)
    assert sampled.schema() == t.schema()

    node = sampled.op()
    assert node.fraction == 0.5
    assert node.method == "block"
    assert node.seed == 1234

0 comments on commit 3ce2617

Please sign in to comment.