feat(steps): implement DiscretizeKBins transform (#57)

ibis-project · Apr 24, 2024 · 54d5d6f · 54d5d6f
1 parent 986385c
commit 54d5d6f
Show file tree

Hide file tree

Showing 3 changed files with 179 additions and 0 deletions.
diff --git a/ibisml/steps/__init__.py b/ibisml/steps/__init__.py
@@ -1,4 +1,5 @@
 from ibisml.steps.common import Cast, Drop, Mutate, MutateAt
+from ibisml.steps.discretize import DiscretizeKBins
 from ibisml.steps.encode import CategoricalEncode, CountEncode, OneHotEncode
 from ibisml.steps.feature_engineering import PolynomialFeatures
 from ibisml.steps.feature_selection import ZeroVariance
@@ -10,6 +11,7 @@
     "Cast",
     "CategoricalEncode",
     "CountEncode",
+    "DiscretizeKBins",
     "Drop",
     "ExpandDate",
     "ExpandDateTime",

diff --git a/ibisml/steps/discretize.py b/ibisml/steps/discretize.py
@@ -0,0 +1,153 @@
+from __future__ import annotations
+
+from typing import Any, Iterable
+
+import ibis
+import ibis.expr.types as ir
+import numpy as np
+
+from ibisml.core import Metadata, Step
+from ibisml.select import SelectionType, selector
+
+
+class DiscretizeKBins(Step):
+    """A step for binning numeric data into intervals.
+
+    Bin edges are learned from the table passed to `fit_table`. When
+    transforming, every binned column is kept and a new integer column named
+    ``{column}_{n_bins}_bin_{strategy}`` holding the bin index is added.
+    Values at or below the first interior edge go to bin ``0`` and values
+    above the last interior edge go to the last bin, so data outside the
+    fitted range is still assigned a bin.
+
+    Parameters
+    ----------
+    inputs
+        A selection of columns to bin.
+    n_bins : int, default=5
+        Number of bins to create.
+    strategy : str, {'uniform', 'quantile'}, default='uniform'
+        Strategy used to define the bin edges.
+        - 'uniform': Evenly spaced bins between the minimum and maximum values.
+        - 'quantile': Bins are created based on data quantiles.
+
+    Raises
+    ------
+    ValueError
+        If `n_bins` is less than or equal to 1 or if an unsupported
+        `strategy` is provided.
+
+    Examples
+    --------
+    >>> import ibis
+    >>> import ibisml as ml
+    >>> from ibisml.core import Metadata
+    >>> ibis.options.interactive = True
+
+    Load penguins dataset
+
+    >>> p = ibis.examples.penguins.fetch()
+
+    Bin all numeric columns.
+
+    >>> step = ml.DiscretizeKBins(ml.numeric(), n_bins=10)
+    >>> step.fit_table(p, Metadata())
+    >>> step.transform_table(p)
+
+    Bin specific numeric columns.
+
+    >>> step = ml.DiscretizeKBins(["bill_length_mm"], strategy="quantile")
+    >>> step.fit_table(p, Metadata())
+    >>> step.transform_table(p)
+    """
+
+    def __init__(
+        self, inputs: SelectionType, *, n_bins: int = 5, strategy: str = "uniform"
+    ):
+        if n_bins <= 1:
+            raise ValueError("Number of bins must be greater than 1.")
+
+        if strategy not in ("uniform", "quantile"):
+            # The trailing space keeps the two sentences from running together.
+            raise ValueError(
+                f"Unsupported strategy {strategy!r} encountered. "
+                "Supported strategies are 'uniform' and 'quantile'."
+            )
+
+        self.inputs = selector(inputs)
+        self.n_bins = n_bins
+        self.strategy = strategy
+
+    def _repr(self) -> Iterable[tuple[str, Any]]:
+        """Yield the ``(name, value)`` pairs describing this step."""
+        yield ("", self.inputs)
+        yield ("n_bins", self.n_bins)
+        yield ("strategy", self.strategy)
+
+    def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
+        """Learn the bin edges of the selected columns.
+
+        The result is stored in ``self.bins_edge_`` as a mapping from column
+        name to its ``n_bins + 1`` edges. The mapping is empty when the
+        selection matches no column.
+        """
+        columns = self.inputs.select_columns(table, metadata)
+        bins_edge = {}
+        if columns:
+            if self.strategy == "uniform":
+                bins_edge = self._fit_uniform_strategy(table, columns)
+            elif self.strategy == "quantile":
+                bins_edge = self._fit_quantile_strategy(table, columns)
+        self.bins_edge_ = bins_edge
+
+    @staticmethod
+    def _numeric_column(table: ir.Table, col_name: str) -> ir.NumericColumn:
+        """Return ``table[col_name]``, raising ValueError if it is not numeric."""
+        col = table[col_name]
+        if not isinstance(col, ir.NumericColumn):
+            raise ValueError(
+                f"Cannot discretize {col_name!r} - this column is not numeric"
+            )
+        return col
+
+    def _fit_uniform_strategy(
+        self, table: ir.Table, columns: list[str]
+    ) -> dict[str, np.ndarray]:
+        """Compute evenly spaced edges between each column's min and max.
+
+        All minima and maxima are fetched with a single aggregation.
+
+        Raises
+        ------
+        ValueError
+            If any of `columns` is not numeric.
+        """
+        aggs = []
+        for col_name in columns:
+            col = self._numeric_column(table, col_name)
+            aggs.append(col.max().name(f"{col_name}_max"))
+            aggs.append(col.min().name(f"{col_name}_min"))
+
+        results = table.aggregate(aggs).execute().to_dict("records")[0]
+
+        return {
+            col_name: np.linspace(
+                results[f"{col_name}_min"], results[f"{col_name}_max"], self.n_bins + 1
+            )
+            for col_name in columns
+        }
+
+    def _fit_quantile_strategy(
+        self, table: ir.Table, columns: list[str]
+    ) -> dict[str, list[float]]:
+        """Compute edges at ``n_bins + 1`` evenly spaced quantiles per column.
+
+        All quantiles are fetched with a single aggregation.
+
+        Raises
+        ------
+        ValueError
+            If any of `columns` is not numeric.
+        """
+        percentiles = np.linspace(0, 1, self.n_bins + 1)
+        aggs = []
+        for col_name in columns:
+            col = self._numeric_column(table, col_name)
+            # Name the temporary aggregates by position rather than by the
+            # float quantile, so the names contain no dots or long decimals.
+            aggs.extend(
+                col.quantile(q).name(f"{col_name}_q{i}")
+                for i, q in enumerate(percentiles)
+            )
+
+        results = table.aggregate(aggs).execute().to_dict("records")[0]
+
+        return {
+            col_name: [results[f"{col_name}_q{i}"] for i in range(len(percentiles))]
+            for col_name in columns
+        }
+
+    def transform_table(self, table: ir.Table) -> ir.Table:
+        """Add one bin-index column per fitted column and return the new table.
+
+        Requires `fit_table` to have been called, since it reads
+        ``self.bins_edge_``.
+        """
+        binned = {}
+        for col_name, all_edges in self.bins_edge_.items():
+            # Drop the outermost edges: the first and last bins are open-ended.
+            edges = all_edges[1:-1]
+            col = table[col_name]
+            case_builder = ibis.case()
+            if len(edges) >= 1:
+                case_builder = case_builder.when(col <= edges[0], 0)
+                case_builder = case_builder.when(col > edges[-1], len(edges))
+            # Interior bins are half-open on the left: (lower, upper].
+            for i, (lower, upper) in enumerate(zip(edges, edges[1:]), start=1):
+                case_builder = case_builder.when((col > lower) & (col <= upper), i)
+            out_name = f"{col_name}_{self.n_bins}_bin_{self.strategy}"
+            binned[out_name] = case_builder.end()
+
+        return table.mutate(**binned)
diff --git a/tests/test_discretize.py b/tests/test_discretize.py
@@ -0,0 +1,24 @@
+import ibis
+import pandas as pd
+import pandas.testing as tm
+import pytest
+
+import ibisml as ml
+
+
+@pytest.mark.parametrize("strategy", ["uniform", "quantile"])
+def test_discretize(strategy):
+    """Fit nine bins on 1..10, then bin unseen values including +/-inf."""
+    name = "col"
+    n_bins = 9
+    out_name = f"{name}_{n_bins}_bin_{strategy}"
+    values = [float("-inf"), 1.5, 2.5, 3.5, 8.5, float("inf")]
+
+    # Fit and transform on different tables so out-of-range values are covered.
+    fitted = ml.DiscretizeKBins(name, n_bins=n_bins, strategy=strategy)
+    fitted.fit_table(ibis.memtable({name: range(1, 11)}), ml.core.Metadata())
+    actual = fitted.transform_table(ibis.memtable({name: values})).execute()
+
+    wanted = pd.DataFrame({name: values, out_name: [0, 0, 1, 2, 7, 8]})
+    tm.assert_frame_equal(actual, wanted, check_dtype=False)