feat(api): add describe method to compute summary stats of table expressions (#8739)
ibis-project · Apr 15, 2024 · c8d98a1 · c8d98a1
1 parent 1ab7fa8
commit c8d98a1
Show file tree

Hide file tree

Showing 3 changed files with 306 additions and 2 deletions.
diff --git a/ibis/backends/tests/errors.py b/ibis/backends/tests/errors.py
@@ -62,8 +62,11 @@
     from polars import ComputeError as PolarsComputeError
     from polars import PanicException as PolarsPanicException
     from polars.exceptions import InvalidOperationError as PolarsInvalidOperationError
+    from polars.exceptions import SchemaError as PolarsSchemaError
 except ImportError:
-    PolarsComputeError = PolarsPanicException = PolarsInvalidOperationError = None
+    PolarsComputeError = PolarsPanicException = PolarsInvalidOperationError = (
+        PolarsSchemaError
+    ) = None
 
 try:
     from pyarrow import ArrowInvalid, ArrowNotImplementedError

diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py
@@ -17,14 +17,15 @@
 import ibis.expr.datatypes as dt
 import ibis.selectors as s
 from ibis import _
-from ibis.backends.conftest import is_older_than
+from ibis.backends.conftest import is_newer_than, is_older_than
 from ibis.backends.tests.errors import (
     ClickHouseDatabaseError,
     ExaQueryError,
     GoogleBadRequest,
     ImpalaHiveServer2Error,
     MySQLProgrammingError,
     OracleDatabaseError,
+    PolarsSchemaError,
     PsycoPg2InternalError,
     Py4JJavaError,
     PyDruidProgrammingError,
@@ -586,6 +587,173 @@ def test_table_info(alltypes):
     assert expr.columns == list(df.columns)
 
 
+@pytest.mark.notimpl(
+    [
+        "datafusion",
+        "impala",
+        "trino",
+        "mysql",
+        "mssql",
+        "flink",
+    ],
+    raises=com.OperationNotDefinedError,
+    reason="quantile and mode is not supported",
+)
+@pytest.mark.notimpl(
+    [
+        "exasol",
+        "druid",
+    ],
+    raises=com.OperationNotDefinedError,
+    reason="Mode and StandardDev is not supported",
+)
+@pytest.mark.notimpl(
+    ["polars"],
+    raises=PolarsSchemaError,
+    reason="cannot extend/append Float64 with Float32",
+)
+@pytest.mark.notyet(
+    ["druid"],
+    raises=PyDruidProgrammingError,
+    reason="Druid only supports trivial unions",
+)
+@pytest.mark.parametrize(
+    ("selector", "expected_columns"),
+    [
+        param(
+            s.any_of(
+                s.of_type("numeric"),
+                s.of_type("string"),
+                s.of_type("bool"),
+                s.of_type("timestamp"),
+            ),
+            [
+                "name",
+                "type",
+                "count",
+                "nulls",
+                "unique",
+                "mode",
+                "mean",
+                "std",
+                "min",
+                "p25",
+                "p50",
+                "p75",
+                "max",
+            ],
+            marks=[
+                pytest.mark.notimpl(
+                    ["sqlite"],
+                    raises=com.OperationNotDefinedError,
+                    reason="quantile is not supported",
+                ),
+                pytest.mark.notimpl(
+                    [
+                        "clickhouse",
+                        "pyspark",
+                        "risingwave",
+                        "impala",
+                    ],
+                    raises=com.OperationNotDefinedError,
+                    reason="mode is not supported",
+                ),
+                pytest.mark.notimpl(
+                    ["dask"],
+                    raises=ValueError,
+                    reason="Unable to concatenate DataFrame with unknown division specifying axis=1",
+                ),
+                pytest.mark.notimpl(
+                    ["oracle"],
+                    raises=(OracleDatabaseError, com.OperationNotDefinedError),
+                    reason="Mode is not supported and ORA-02000: missing AS keyword",
+                ),
+                pytest.mark.broken(
+                    ["pandas"],
+                    condition=is_newer_than("pandas", "2.1.0"),
+                    reason="FutureWarning: concat empty or all-NA entries is deprecated",
+                ),
+            ],
+            id="all_cols",
+        ),
+        param(
+            s.of_type("numeric"),
+            [
+                "name",
+                "type",
+                "count",
+                "nulls",
+                "unique",
+                "mean",
+                "std",
+                "min",
+                "p25",
+                "p50",
+                "p75",
+                "max",
+            ],
+            marks=[
+                pytest.mark.notimpl(
+                    ["sqlite"],
+                    raises=com.OperationNotDefinedError,
+                    reason="quantile is not supported",
+                ),
+                pytest.mark.notimpl(
+                    ["oracle"],
+                    raises=OracleDatabaseError,
+                    reason="Mode is not supported and ORA-02000: missing AS keyword",
+                ),
+            ],
+            id="numeric_col",
+        ),
+        param(
+            s.of_type("string"),
+            [
+                "name",
+                "type",
+                "count",
+                "nulls",
+                "unique",
+                "mode",
+            ],
+            marks=[
+                pytest.mark.notimpl(
+                    [
+                        "clickhouse",
+                        "pyspark",
+                        "risingwave",
+                        "impala",
+                    ],
+                    raises=com.OperationNotDefinedError,
+                    reason="mode is not supported",
+                ),
+                pytest.mark.notimpl(
+                    ["oracle"],
+                    raises=com.OperationNotDefinedError,
+                    reason="Mode is not supported and ORA-02000: missing AS keyword",
+                ),
+                pytest.mark.notimpl(
+                    ["dask"],
+                    raises=ValueError,
+                    reason="Unable to concatenate DataFrame with unknown division specifying axis=1",
+                ),
+            ],
+            id="string_col",
+        ),
+    ],
+)
+def test_table_describe(alltypes, selector, expected_columns):
+    """Check that `describe` reports each selected column with the expected stats."""
+    sometypes = alltypes.select(selector)
+    expr = sometypes.describe()
+    df = expr.execute()
+    # every selected input column must show up as a row in the `name` column
+    assert sorted(sometypes.columns) == sorted(df.name)
+    # the expression schema must match the expectation and the executed frame
+    assert sorted(expr.columns) == sorted(expected_columns)
+    assert sorted(expr.columns) == sorted(df.columns)
+
+
 @pytest.mark.parametrize(
     ("ibis_op", "pandas_op"),
     [

diff --git a/ibis/expr/types/relations.py b/ibis/expr/types/relations.py
@@ -2886,6 +2886,139 @@ def info(self) -> Table:
             aggs.append(agg)
         return ibis.union(*aggs).order_by(ibis.asc("pos"))
 
+    def describe(
+        self, quantile: Sequence[ir.NumericValue | float] = (0.25, 0.5, 0.75)
+    ) -> Table:
+        """Return summary information about a table.
+
+        Parameters
+        ----------
+        quantile
+            The quantiles to compute for numerical columns. Defaults to (0.25, 0.5, 0.75).
+            Each quantile `q` produces one output column named `p` followed by
+            `100 * q` with trailing zeros removed, e.g. `p25` for `0.25`.
+
+        Returns
+        -------
+        Table
+            A table containing summary information about the columns of self.
+
+        Raises
+        ------
+        IbisTypeError
+            If the table has no numeric, string, or boolean column to describe.
+
+        Notes
+        -----
+        This function computes summary statistics for each column in the table. For
+        numerical columns, it computes statistics such as minimum, maximum, mean,
+        standard deviation, and quantiles. For string columns, it computes the mode
+        and the number of unique values. For boolean columns, only the mean is
+        computed in addition to the counts. Columns of any other type are left out
+        of the result.
+
+        Statistics that do not apply to a column are NULL. When the table has no
+        string columns the `mode` column is dropped, and when it has no numeric or
+        boolean columns the float-typed statistic columns are dropped.
+
+        No explicit ordering is applied to the rows of the result.
+
+        Examples
+        --------
+        >>> import ibis
+        >>> import ibis.selectors as s
+        >>> ibis.options.interactive = True
+        >>> p = ibis.examples.penguins.fetch()
+        >>> p.describe()
+        ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━┓
+        ┃ name              ┃ type    ┃ count ┃ nulls ┃ unique ┃ mode   ┃ … ┃
+        ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━┩
+        │ string            │ string  │ int64 │ int64 │ int64  │ string │ … │
+        ├───────────────────┼─────────┼───────┼───────┼────────┼────────┼───┤
+        │ species           │ string  │   344 │     0 │      3 │ Adelie │ … │
+        │ island            │ string  │   344 │     0 │      3 │ Biscoe │ … │
+        │ bill_length_mm    │ float64 │   344 │     2 │    164 │ NULL   │ … │
+        │ bill_depth_mm     │ float64 │   344 │     2 │     80 │ NULL   │ … │
+        │ flipper_length_mm │ int64   │   344 │     2 │     55 │ NULL   │ … │
+        │ body_mass_g       │ int64   │   344 │     2 │     94 │ NULL   │ … │
+        │ sex               │ string  │   344 │    11 │      2 │ male   │ … │
+        │ year              │ int64   │   344 │     0 │      3 │ NULL   │ … │
+        └───────────────────┴─────────┴───────┴───────┴────────┴────────┴───┘
+        >>> p.select(s.of_type("numeric")).describe()
+        ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━┳━━━┓
+        ┃ name              ┃ type    ┃ count ┃ nulls ┃ unique ┃ mean        ┃ … ┃
+        ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━╇━━━┩
+        │ string            │ string  │ int64 │ int64 │ int64  │ float64     │ … │
+        ├───────────────────┼─────────┼───────┼───────┼────────┼─────────────┼───┤
+        │ bill_length_mm    │ float64 │   344 │     2 │    164 │   43.921930 │ … │
+        │ bill_depth_mm     │ float64 │   344 │     2 │     80 │   17.151170 │ … │
+        │ flipper_length_mm │ int64   │   344 │     2 │     55 │  200.915205 │ … │
+        │ body_mass_g       │ int64   │   344 │     2 │     94 │ 4201.754386 │ … │
+        │ year              │ int64   │   344 │     0 │      3 │ 2008.029070 │ … │
+        └───────────────────┴─────────┴───────┴───────┴────────┴─────────────┴───┘
+        >>> p.select(s.of_type("string")).describe()
+        ┏━━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┓
+        ┃ name    ┃ type   ┃ count ┃ nulls ┃ unique ┃ mode   ┃
+        ┡━━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━┩
+        │ string  │ string │ int64 │ int64 │ int64  │ string │
+        ├─────────┼────────┼───────┼───────┼────────┼────────┤
+        │ species │ string │   344 │     0 │      3 │ Adelie │
+        │ island  │ string │   344 │     0 │      3 │ Biscoe │
+        │ sex     │ string │   344 │    11 │      2 │ male   │
+        └─────────┴────────┴───────┴───────┴────────┴────────┘
+        """
+        import ibis.common.exceptions as com
+        import ibis.selectors as s
+        from ibis import literal as lit
+
+        quantile = sorted(quantile)
+        # Output column name for each quantile, e.g. 0.25 -> "p25", 0.125 -> "p12.5".
+        # The names do not depend on the column being described, so build them once.
+        quantile_names = [f"p{100*q:.6f}".rstrip("0").rstrip(".") for q in quantile]
+        aggs = []
+        string_col = False
+        numeric_col = False
+        for colname in self.columns:
+            col = self[colname]
+            typ = col.type()
+
+            # default statistics to None
+            col_mean = lit(None).cast(float)
+            col_std = lit(None).cast(float)
+            col_min = lit(None).cast(float)
+            col_max = lit(None).cast(float)
+            col_mode = lit(None).cast(str)
+            quantile_values = {name: lit(None).cast(float) for name in quantile_names}
+
+            if typ.is_numeric():
+                numeric_col = True
+                col_mean = col.mean()
+                col_std = col.std()
+                col_min = col.min().cast(float)
+                col_max = col.max().cast(float)
+                quantile_values = {
+                    name: col.quantile(q).cast(float)
+                    for name, q in zip(quantile_names, quantile)
+                }
+            elif typ.is_string():
+                string_col = True
+                col_mode = col.mode()
+            elif typ.is_boolean():
+                # booleans only get a mean; they still count as "numeric" below so
+                # that the float statistic columns are kept in the result
+                numeric_col = True
+                col_mean = col.mean()
+            else:
+                # Will not calculate statistics for other types
+                continue
+
+            # One single-row aggregate per described column; every aggregate has
+            # the same set of output columns so the rows can be unioned afterwards.
+            agg = self.agg(
+                name=lit(colname),
+                type=lit(str(typ)),
+                # `isnull()` is never NULL itself, so this counts every row
+                count=col.isnull().count(),
+                nulls=col.isnull().sum(),
+                unique=col.nunique(),
+                mode=col_mode,
+                mean=col_mean,
+                std=col_std,
+                min=col_min,
+                **quantile_values,
+                max=col_max,
+            )
+            aggs.append(agg)
+
+        if not aggs:
+            # `ibis.union` needs at least one table; fail with an actionable message
+            # instead of an argument-count error from the call below.
+            raise com.IbisTypeError(
+                "describe() requires at least one numeric, string, or boolean column"
+            )
+
+        t = ibis.union(*aggs)
+
+        # TODO(jiting): Need a better way to remove columns with all NULL
+        if string_col and not numeric_col:
+            t = t.select(~s.of_type("float"))
+        elif numeric_col and not string_col:
+            t = t.drop("mode")
+
+        return t
+
     def join(
         left: Table,
         right: Table,