Skip to content

Commit

Permalink
feat(api): add describe method to compute summary stats of table ex…
Browse files Browse the repository at this point in the history
…pressions (#8739)
  • Loading branch information
jitingxu1 authored Apr 15, 2024
1 parent 1ab7fa8 commit c8d98a1
Show file tree
Hide file tree
Showing 3 changed files with 306 additions and 2 deletions.
5 changes: 4 additions & 1 deletion ibis/backends/tests/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,11 @@
from polars import ComputeError as PolarsComputeError
from polars import PanicException as PolarsPanicException
from polars.exceptions import InvalidOperationError as PolarsInvalidOperationError
from polars.exceptions import SchemaError as PolarsSchemaError
except ImportError:
PolarsComputeError = PolarsPanicException = PolarsInvalidOperationError = None
PolarsComputeError = PolarsPanicException = PolarsInvalidOperationError = (
PolarsSchemaError
) = None

try:
from pyarrow import ArrowInvalid, ArrowNotImplementedError
Expand Down
170 changes: 169 additions & 1 deletion ibis/backends/tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,15 @@
import ibis.expr.datatypes as dt
import ibis.selectors as s
from ibis import _
from ibis.backends.conftest import is_older_than
from ibis.backends.conftest import is_newer_than, is_older_than
from ibis.backends.tests.errors import (
ClickHouseDatabaseError,
ExaQueryError,
GoogleBadRequest,
ImpalaHiveServer2Error,
MySQLProgrammingError,
OracleDatabaseError,
PolarsSchemaError,
PsycoPg2InternalError,
Py4JJavaError,
PyDruidProgrammingError,
Expand Down Expand Up @@ -586,6 +587,173 @@ def test_table_info(alltypes):
assert expr.columns == list(df.columns)


# Backends that cannot compile the aggregations `describe` relies on.
@pytest.mark.notimpl(
    [
        "datafusion",
        "impala",
        "trino",
        "mysql",
        "mssql",
        "flink",
    ],
    raises=com.OperationNotDefinedError,
    reason="quantile and mode is not supported",
)
@pytest.mark.notimpl(
    [
        "exasol",
        "druid",
    ],
    raises=com.OperationNotDefinedError,
    reason="Mode and StandardDev is not supported",
)
@pytest.mark.notimpl(
    ["polars"],
    raises=PolarsSchemaError,
    reason="cannot extend/append Float64 with Float32",
)
@pytest.mark.notyet(
    ["druid"],
    raises=PyDruidProgrammingError,
    reason="Druid only supports trivial unions",
)
@pytest.mark.parametrize(
    ("selector", "expected_columns"),
    [
        # Mixed column types: every statistic column is expected.
        param(
            s.any_of(
                s.of_type("numeric"),
                s.of_type("string"),
                s.of_type("bool"),
                s.of_type("timestamp"),
            ),
            [
                "name",
                "type",
                "count",
                "nulls",
                "unique",
                "mode",
                "mean",
                "std",
                "min",
                "p25",
                "p50",
                "p75",
                "max",
            ],
            marks=[
                pytest.mark.notimpl(
                    ["sqlite"],
                    raises=com.OperationNotDefinedError,
                    reason="quantile is not supported",
                ),
                pytest.mark.notimpl(
                    [
                        "clickhouse",
                        "pyspark",
                        "risingwave",
                        "impala",
                    ],
                    raises=com.OperationNotDefinedError,
                    reason="mode is not supported",
                ),
                pytest.mark.notimpl(
                    ["dask"],
                    raises=ValueError,
                    reason="Unable to concatenate DataFrame with unknown division specifying axis=1",
                ),
                pytest.mark.notimpl(
                    ["oracle"],
                    raises=(OracleDatabaseError, com.OperationNotDefinedError),
                    reason="Mode is not supported and ORA-02000: missing AS keyword",
                ),
                pytest.mark.broken(
                    ["pandas"],
                    condition=is_newer_than("pandas", "2.1.0"),
                    reason="FutureWarning: concat empty or all-NA entries is deprecated",
                ),
            ],
            id="all_cols",
        ),
        # Numeric columns only: no "mode" column is expected.
        param(
            s.of_type("numeric"),
            [
                "name",
                "type",
                "count",
                "nulls",
                "unique",
                "mean",
                "std",
                "min",
                "p25",
                "p50",
                "p75",
                "max",
            ],
            marks=[
                pytest.mark.notimpl(
                    ["sqlite"],
                    raises=com.OperationNotDefinedError,
                    reason="quantile is not supported",
                ),
                pytest.mark.notimpl(
                    ["oracle"],
                    raises=OracleDatabaseError,
                    reason="Mode is not supported and ORA-02000: missing AS keyword",
                ),
            ],
            id="numeric_col",
        ),
        # String columns only: no float-valued statistic columns are expected.
        param(
            s.of_type("string"),
            [
                "name",
                "type",
                "count",
                "nulls",
                "unique",
                "mode",
            ],
            marks=[
                pytest.mark.notimpl(
                    [
                        "clickhouse",
                        "pyspark",
                        "risingwave",
                        "impala",
                    ],
                    raises=com.OperationNotDefinedError,
                    reason="mode is not supported",
                ),
                pytest.mark.notimpl(
                    ["oracle"],
                    raises=com.OperationNotDefinedError,
                    reason="Mode is not supported and ORA-02000: missing AS keyword",
                ),
                pytest.mark.notimpl(
                    ["dask"],
                    raises=ValueError,
                    reason="Unable to concatenate DataFrame with unknown division specifying axis=1",
                ),
            ],
            id="string_col",
        ),
    ],
)
def test_table_describe(alltypes, selector, expected_columns):
    """Check the shape of `Table.describe()` for several column selections.

    Only the structure of the result is verified, not the statistic values:
    one output row per selected column, and exactly the expected set of
    statistic columns in both the expression and the executed frame.
    """
    sometypes = alltypes.select(selector)
    expr = sometypes.describe()
    df = expr.execute()
    # Comparisons are order-insensitive, hence the sorting on both sides.
    assert sorted(sometypes.columns) == sorted(df.name)
    assert sorted(expr.columns) == sorted(expected_columns)
    assert sorted(expr.columns) == sorted(df.columns)


@pytest.mark.parametrize(
("ibis_op", "pandas_op"),
[
Expand Down
133 changes: 133 additions & 0 deletions ibis/expr/types/relations.py
Original file line number Diff line number Diff line change
Expand Up @@ -2886,6 +2886,139 @@ def info(self) -> Table:
aggs.append(agg)
return ibis.union(*aggs).order_by(ibis.asc("pos"))

def describe(
    self, quantile: Sequence[ir.NumericValue | float] = (0.25, 0.5, 0.75)
) -> Table:
    """Return summary information about a table.

    Parameters
    ----------
    quantile
        The quantiles to compute for numerical columns. Defaults to
        (0.25, 0.5, 0.75). They are sorted before use, and each one yields
        an output column named after its percentage (``0.25`` -> ``p25``).

    Returns
    -------
    Table
        A table containing summary information about the columns of self,
        with one row per summarized column.

    Notes
    -----
    This function computes summary statistics for each column in the table. For
    numerical columns, it computes statistics such as minimum, maximum, mean,
    standard deviation, and quantiles. For string columns, it computes the mode
    and the number of unique values. For boolean columns only the mean is
    computed in addition to the common ``count``/``nulls``/``unique``
    statistics. Columns of any other type are left out of the result.

    Statistics that do not apply to a column are NULL. When the table has
    no string columns the ``mode`` column is dropped; when it has string
    columns but no numeric or boolean ones, all float-typed statistic
    columns are dropped.

    Examples
    --------
    >>> import ibis
    >>> import ibis.selectors as s
    >>> ibis.options.interactive = True
    >>> p = ibis.examples.penguins.fetch()
    >>> p.describe()
    ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━┓
    ┃ name              ┃ type    ┃ count ┃ nulls ┃ unique ┃ mode   ┃ … ┃
    ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━┩
    │ string            │ string  │ int64 │ int64 │ int64  │ string │ … │
    ├───────────────────┼─────────┼───────┼───────┼────────┼────────┼───┤
    │ species           │ string  │   344 │     0 │      3 │ Adelie │ … │
    │ island            │ string  │   344 │     0 │      3 │ Biscoe │ … │
    │ bill_length_mm    │ float64 │   344 │     2 │    164 │ NULL   │ … │
    │ bill_depth_mm     │ float64 │   344 │     2 │     80 │ NULL   │ … │
    │ flipper_length_mm │ int64   │   344 │     2 │     55 │ NULL   │ … │
    │ body_mass_g       │ int64   │   344 │     2 │     94 │ NULL   │ … │
    │ sex               │ string  │   344 │    11 │      2 │ male   │ … │
    │ year              │ int64   │   344 │     0 │      3 │ NULL   │ … │
    └───────────────────┴─────────┴───────┴───────┴────────┴────────┴───┘
    >>> p.select(s.of_type("numeric")).describe()
    ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━┳━━━┓
    ┃ name              ┃ type    ┃ count ┃ nulls ┃ unique ┃ mean        ┃ … ┃
    ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━╇━━━┩
    │ string            │ string  │ int64 │ int64 │ int64  │ float64     │ … │
    ├───────────────────┼─────────┼───────┼───────┼────────┼─────────────┼───┤
    │ bill_length_mm    │ float64 │   344 │     2 │    164 │   43.921930 │ … │
    │ bill_depth_mm     │ float64 │   344 │     2 │     80 │   17.151170 │ … │
    │ flipper_length_mm │ int64   │   344 │     2 │     55 │  200.915205 │ … │
    │ body_mass_g       │ int64   │   344 │     2 │     94 │ 4201.754386 │ … │
    │ year              │ int64   │   344 │     0 │      3 │ 2008.029070 │ … │
    └───────────────────┴─────────┴───────┴───────┴────────┴─────────────┴───┘
    >>> p.select(s.of_type("string")).describe()
    ┏━━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┓
    ┃ name    ┃ type   ┃ count ┃ nulls ┃ unique ┃ mode   ┃
    ┡━━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━┩
    │ string  │ string │ int64 │ int64 │ int64  │ string │
    ├─────────┼────────┼───────┼───────┼────────┼────────┤
    │ species │ string │   344 │     0 │      3 │ Adelie │
    │ island  │ string │   344 │     0 │      3 │ Biscoe │
    │ sex     │ string │   344 │    11 │      2 │ male   │
    └─────────┴────────┴───────┴───────┴────────┴────────┘
    """
    import ibis.selectors as s
    from ibis import literal as lit

    quantile = sorted(quantile)
    # Column label per quantile, computed once instead of twice per column:
    # the percentage with trailing zeros (and a dangling ".") stripped,
    # e.g. 0.25 -> "p25", 0.999 -> "p99.9".
    quantile_labels = [f"p{100*q:.6f}".rstrip("0").rstrip(".") for q in quantile]

    # Typed NULL placeholders for statistics that do not apply to a column.
    # Every per-column aggregate must expose the same columns so that the
    # aggregates can be unioned together below.
    null_float = lit(None).cast(float)
    null_string = lit(None).cast(str)

    aggs = []
    string_col = False
    numeric_col = False
    for colname in self.columns:
        col = self[colname]
        typ = col.type()

        # default statistics to None
        col_mean = col_std = col_min = col_max = null_float
        col_mode = null_string
        quantile_values = dict.fromkeys(quantile_labels, null_float)

        if typ.is_numeric():
            numeric_col = True
            col_mean = col.mean()
            col_std = col.std()
            col_min = col.min().cast(float)
            col_max = col.max().cast(float)
            quantile_values = {
                label: col.quantile(q).cast(float)
                for label, q in zip(quantile_labels, quantile)
            }
        elif typ.is_string():
            string_col = True
            col_mode = col.mode()
        elif typ.is_boolean():
            # Booleans count as "numeric" for the column pruning below, but
            # only their mean is computed.
            numeric_col = True
            col_mean = col.mean()
        else:
            # Will not calculate statistics for other types
            continue

        # One single-row aggregate per summarized column.
        agg = self.agg(
            name=lit(colname),
            type=lit(str(typ)),
            count=col.isnull().count(),
            nulls=col.isnull().sum(),
            unique=col.nunique(),
            mode=col_mode,
            mean=col_mean,
            std=col_std,
            min=col_min,
            **quantile_values,
            max=col_max,
        )
        aggs.append(agg)

    # NOTE(review): when no column is numeric, string or boolean, `aggs` is
    # empty and `ibis.union` is called with no tables -- confirm whether a
    # dedicated error message is wanted for that case.
    t = ibis.union(*aggs)

    # TODO(jiting): Need a better way to remove columns with all NULL
    if string_col and not numeric_col:
        t = t.select(~s.of_type("float"))
    elif numeric_col and not string_col:
        t = t.drop("mode")

    return t

def join(
left: Table,
right: Table,
Expand Down

0 comments on commit c8d98a1

Please sign in to comment.