From c8d98a16e2a18ba0964b0b9f0c0ce2ea7bb274cf Mon Sep 17 00:00:00 2001
From: Jiting Xu <126802425+jitingxu1@users.noreply.github.com>
Date: Mon, 15 Apr 2024 09:55:36 -0700
Subject: [PATCH] feat(api): add `describe` method to compute summary stats of table expressions (#8739)

---
 ibis/backends/tests/errors.py       |   5 +-
 ibis/backends/tests/test_generic.py | 167 +++++++++++++++++++++++++++-
 ibis/expr/types/relations.py        | 147 ++++++++++++++++++++++++
 3 files changed, 317 insertions(+), 2 deletions(-)

diff --git a/ibis/backends/tests/errors.py b/ibis/backends/tests/errors.py
index 3613f097ad46..29023d861795 100644
--- a/ibis/backends/tests/errors.py
+++ b/ibis/backends/tests/errors.py
@@ -62,8 +62,11 @@
     from polars import ComputeError as PolarsComputeError
     from polars import PanicException as PolarsPanicException
     from polars.exceptions import InvalidOperationError as PolarsInvalidOperationError
+    from polars.exceptions import SchemaError as PolarsSchemaError
 except ImportError:
-    PolarsComputeError = PolarsPanicException = PolarsInvalidOperationError = None
+    PolarsComputeError = PolarsPanicException = PolarsInvalidOperationError = (
+        PolarsSchemaError
+    ) = None
 
 try:
     from pyarrow import ArrowInvalid, ArrowNotImplementedError
diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py
index 6b33348dc749..631a3f163872 100644
--- a/ibis/backends/tests/test_generic.py
+++ b/ibis/backends/tests/test_generic.py
@@ -17,7 +17,7 @@
 import ibis.expr.datatypes as dt
 import ibis.selectors as s
 from ibis import _
-from ibis.backends.conftest import is_older_than
+from ibis.backends.conftest import is_newer_than, is_older_than
 from ibis.backends.tests.errors import (
     ClickHouseDatabaseError,
     ExaQueryError,
@@ -25,6 +25,7 @@
     ImpalaHiveServer2Error,
     MySQLProgrammingError,
     OracleDatabaseError,
+    PolarsSchemaError,
     PsycoPg2InternalError,
     Py4JJavaError,
     PyDruidProgrammingError,
@@ -586,6 +587,170 @@ def test_table_info(alltypes):
     assert expr.columns == list(df.columns)
 
 
+@pytest.mark.notimpl(
+    [
+        "datafusion",
+        "impala",
+        "trino",
+        "mysql",
+        "mssql",
+        "flink",
+    ],
+    raises=com.OperationNotDefinedError,
+    reason="quantile and mode is not supported",
+)
+@pytest.mark.notimpl(
+    [
+        "exasol",
+        "druid",
+    ],
+    raises=com.OperationNotDefinedError,
+    reason="Mode and StandardDev is not supported",
+)
+@pytest.mark.notimpl(
+    ["polars"],
+    raises=PolarsSchemaError,
+    reason="cannot extend/append Float64 with Float32",
+)
+@pytest.mark.notyet(
+    ["druid"],
+    raises=PyDruidProgrammingError,
+    reason="Druid only supports trivial unions",
+)
+@pytest.mark.parametrize(
+    ("selector", "expected_columns"),
+    [
+        param(
+            s.any_of(
+                s.of_type("numeric"),
+                s.of_type("string"),
+                s.of_type("bool"),
+                s.of_type("timestamp"),
+            ),
+            [
+                "name",
+                "type",
+                "count",
+                "nulls",
+                "unique",
+                "mode",
+                "mean",
+                "std",
+                "min",
+                "p25",
+                "p50",
+                "p75",
+                "max",
+            ],
+            marks=[
+                pytest.mark.notimpl(
+                    ["sqlite"],
+                    raises=com.OperationNotDefinedError,
+                    reason="quantile is not supported",
+                ),
+                pytest.mark.notimpl(
+                    [
+                        "clickhouse",
+                        "pyspark",
+                        "risingwave",
+                        "impala",
+                    ],
+                    raises=com.OperationNotDefinedError,
+                    reason="mode is not supported",
+                ),
+                pytest.mark.notimpl(
+                    ["dask"],
+                    raises=ValueError,
+                    reason="Unable to concatenate DataFrame with unknown division specifying axis=1",
+                ),
+                pytest.mark.notimpl(
+                    ["oracle"],
+                    raises=(OracleDatabaseError, com.OperationNotDefinedError),
+                    reason="Mode is not supported and ORA-02000: missing AS keyword",
+                ),
+                pytest.mark.broken(
+                    ["pandas"],
+                    condition=is_newer_than("pandas", "2.1.0"),
+                    reason="FutureWarning: concat empty or all-NA entries is deprecated",
+                ),
+            ],
+            id="all_cols",
+        ),
+        param(
+            s.of_type("numeric"),
+            [
+                "name",
+                "type",
+                "count",
+                "nulls",
+                "unique",
+                "mean",
+                "std",
+                "min",
+                "p25",
+                "p50",
+                "p75",
+                "max",
+            ],
+            marks=[
+                pytest.mark.notimpl(
+                    ["sqlite"],
+                    raises=com.OperationNotDefinedError,
+                    reason="quantile is not supported",
+                ),
+                pytest.mark.notimpl(
+                    ["oracle"],
+                    raises=OracleDatabaseError,
+                    reason="Mode is not supported and ORA-02000: missing AS keyword",
+                ),
+            ],
+            id="numeric_col",
+        ),
+        param(
+            s.of_type("string"),
+            [
+                "name",
+                "type",
+                "count",
+                "nulls",
+                "unique",
+                "mode",
+            ],
+            marks=[
+                pytest.mark.notimpl(
+                    [
+                        "clickhouse",
+                        "pyspark",
+                        "risingwave",
+                        "impala",
+                    ],
+                    raises=com.OperationNotDefinedError,
+                    reason="mode is not supported",
+                ),
+                pytest.mark.notimpl(
+                    ["oracle"],
+                    raises=com.OperationNotDefinedError,
+                    reason="Mode is not supported and ORA-02000: missing AS keyword",
+                ),
+                pytest.mark.notimpl(
+                    ["dask"],
+                    raises=ValueError,
+                    reason="Unable to concatenate DataFrame with unknown division specifying axis=1",
+                ),
+            ],
+            id="string_col",
+        ),
+    ],
+)
+def test_table_describe(alltypes, selector, expected_columns):
+    sometypes = alltypes.select(selector)
+    expr = sometypes.describe()
+    df = expr.execute()
+    assert sorted(sometypes.columns) == sorted(df.name)
+    assert sorted(expr.columns) == sorted(expected_columns)
+    assert sorted(expr.columns) == sorted(df.columns)
+
+
 @pytest.mark.parametrize(
     ("ibis_op", "pandas_op"),
     [
diff --git a/ibis/expr/types/relations.py b/ibis/expr/types/relations.py
index eb47921e8806..ee4c2003a85b 100644
--- a/ibis/expr/types/relations.py
+++ b/ibis/expr/types/relations.py
@@ -2886,6 +2886,153 @@ def info(self) -> Table:
             aggs.append(agg)
         return ibis.union(*aggs).order_by(ibis.asc("pos"))
 
+    def describe(
+        self, quantile: Sequence[ir.NumericValue | float] = (0.25, 0.5, 0.75)
+    ) -> Table:
+        """Return summary information about a table.
+
+        Parameters
+        ----------
+        quantile
+            The quantiles to compute for numerical columns. Defaults to (0.25, 0.5, 0.75).
+
+        Returns
+        -------
+        Table
+            A table containing summary information about the columns of self.
+
+        Raises
+        ------
+        IbisTypeError
+            If the table has no numeric, string, or boolean columns to describe.
+
+        Notes
+        -----
+        This function computes summary statistics for each column in the table. For
+        numerical columns, it computes statistics such as minimum, maximum, mean,
+        standard deviation, and quantiles. For string columns, it computes the mode
+        and the number of unique values. For boolean columns, it computes the mean.
+        Columns of any other type are skipped and do not appear in the result.
+
+        Examples
+        --------
+        >>> import ibis
+        >>> import ibis.selectors as s
+        >>> ibis.options.interactive = True
+        >>> p = ibis.examples.penguins.fetch()
+        >>> p.describe()
+        ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━┓
+        ┃ name              ┃ type    ┃ count ┃ nulls ┃ unique ┃ mode   ┃ … ┃
+        ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━┩
+        │ string            │ string  │ int64 │ int64 │ int64  │ string │ … │
+        ├───────────────────┼─────────┼───────┼───────┼────────┼────────┼───┤
+        │ species           │ string  │   344 │     0 │      3 │ Adelie │ … │
+        │ island            │ string  │   344 │     0 │      3 │ Biscoe │ … │
+        │ bill_length_mm    │ float64 │   344 │     2 │    164 │ NULL   │ … │
+        │ bill_depth_mm     │ float64 │   344 │     2 │     80 │ NULL   │ … │
+        │ flipper_length_mm │ int64   │   344 │     2 │     55 │ NULL   │ … │
+        │ body_mass_g       │ int64   │   344 │     2 │     94 │ NULL   │ … │
+        │ sex               │ string  │   344 │    11 │      2 │ male   │ … │
+        │ year              │ int64   │   344 │     0 │      3 │ NULL   │ … │
+        └───────────────────┴─────────┴───────┴───────┴────────┴────────┴───┘
+        >>> p.select(s.of_type("numeric")).describe()
+        ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━┳━━━┓
+        ┃ name              ┃ type    ┃ count ┃ nulls ┃ unique ┃ mean        ┃ … ┃
+        ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━╇━━━┩
+        │ string            │ string  │ int64 │ int64 │ int64  │ float64     │ … │
+        ├───────────────────┼─────────┼───────┼───────┼────────┼─────────────┼───┤
+        │ bill_length_mm    │ float64 │   344 │     2 │    164 │   43.921930 │ … │
+        │ bill_depth_mm     │ float64 │   344 │     2 │     80 │   17.151170 │ … │
+        │ flipper_length_mm │ int64   │   344 │     2 │     55 │  200.915205 │ … │
+        │ body_mass_g       │ int64   │   344 │     2 │     94 │ 4201.754386 │ … │
+        │ year              │ int64   │   344 │     0 │      3 │ 2008.029070 │ … │
+        └───────────────────┴─────────┴───────┴───────┴────────┴─────────────┴───┘
+        >>> p.select(s.of_type("string")).describe()
+        ┏━━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┓
+        ┃ name    ┃ type   ┃ count ┃ nulls ┃ unique ┃ mode   ┃
+        ┡━━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━┩
+        │ string  │ string │ int64 │ int64 │ int64  │ string │
+        ├─────────┼────────┼───────┼───────┼────────┼────────┤
+        │ species │ string │   344 │     0 │      3 │ Adelie │
+        │ island  │ string │   344 │     0 │      3 │ Biscoe │
+        │ sex     │ string │   344 │    11 │      2 │ male   │
+        └─────────┴────────┴───────┴───────┴────────┴────────┘
+        """
+        import ibis.common.exceptions as com
+        import ibis.selectors as s
+        from ibis import literal as lit
+
+        # Column labels depend only on `quantile` (0.25 -> "p25", 0.999 -> "p99.9"),
+        # so format them once instead of twice per column.
+        quantiles = [
+            (f"p{100*q:.6f}".rstrip("0").rstrip("."), q) for q in sorted(quantile)
+        ]
+        aggs = []
+        string_col = False
+        numeric_col = False
+        for colname in self.columns:
+            col = self[colname]
+            typ = col.type()
+
+            # default statistics to None
+            col_mean = lit(None).cast(float)
+            col_std = lit(None).cast(float)
+            col_min = lit(None).cast(float)
+            col_max = lit(None).cast(float)
+            col_mode = lit(None).cast(str)
+            quantile_values = {label: lit(None).cast(float) for label, _ in quantiles}
+
+            if typ.is_numeric():
+                numeric_col = True
+                col_mean = col.mean()
+                col_std = col.std()
+                col_min = col.min().cast(float)
+                col_max = col.max().cast(float)
+                quantile_values = {
+                    label: col.quantile(q).cast(float) for label, q in quantiles
+                }
+            elif typ.is_string():
+                string_col = True
+                col_mode = col.mode()
+            elif typ.is_boolean():
+                numeric_col = True
+                col_mean = col.mean()
+            else:
+                # Will not calculate statistics for other types
+                continue
+
+            agg = self.agg(
+                name=lit(colname),
+                type=lit(str(typ)),
+                count=col.isnull().count(),
+                nulls=col.isnull().sum(),
+                unique=col.nunique(),
+                mode=col_mode,
+                mean=col_mean,
+                std=col_std,
+                min=col_min,
+                **quantile_values,
+                max=col_max,
+            )
+            aggs.append(agg)
+
+        # `ibis.union` needs at least one table; fail with a clear message instead
+        # of an opaque error when every column was skipped above.
+        if not aggs:
+            raise com.IbisTypeError(
+                "describe() requires at least one numeric, string, or boolean column"
+            )
+
+        t = ibis.union(*aggs)
+
+        # TODO(jiting): Need a better way to remove columns with all NULL
+        if string_col and not numeric_col:
+            t = t.select(~s.of_type("float"))
+        elif numeric_col and not string_col:
+            t = t.drop("mode")
+
+        return t
+
     def join(
         left: Table,
         right: Table,