Skip to content

Commit

Permalink
feat(steps): implement DiscretizeKBins transform (#57)
Browse files Browse the repository at this point in the history
  • Loading branch information
jitingxu1 authored Apr 24, 2024
1 parent 986385c commit 54d5d6f
Show file tree
Hide file tree
Showing 3 changed files with 179 additions and 0 deletions.
2 changes: 2 additions & 0 deletions ibisml/steps/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from ibisml.steps.common import Cast, Drop, Mutate, MutateAt
from ibisml.steps.discretize import DiscretizeKBins
from ibisml.steps.encode import CategoricalEncode, CountEncode, OneHotEncode
from ibisml.steps.feature_engineering import PolynomialFeatures
from ibisml.steps.feature_selection import ZeroVariance
Expand All @@ -10,6 +11,7 @@
"Cast",
"CategoricalEncode",
"CountEncode",
"DiscretizeKBins",
"Drop",
"ExpandDate",
"ExpandDateTime",
Expand Down
153 changes: 153 additions & 0 deletions ibisml/steps/discretize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
from __future__ import annotations

from typing import Any, Iterable

import ibis
import ibis.expr.types as ir
import numpy as np

from ibisml.core import Metadata, Step
from ibisml.select import SelectionType, selector


class DiscretizeKBins(Step):
    """A step for binning numeric data into intervals.

    Parameters
    ----------
    inputs
        A selection of columns to bin.
    n_bins : int, default=5
        Number of bins to create.
    strategy : str, {'uniform', 'quantile'}, default='uniform'
        Strategy used to define the bin edges.

        - 'uniform': Evenly spaced bins between the minimum and maximum values.
        - 'quantile': Bins are created based on data quantiles.

    Attributes
    ----------
    bins_edge_ : dict[str, list[float]]
        Set by `fit_table`. Maps each selected column name to its
        ``n_bins + 1`` bin edges (outermost edges included). Empty when the
        selection matched no columns.

    Raises
    ------
    ValueError
        If `n_bins` is less than or equal to 1 or if an unsupported
        `strategy` is provided. `fit_table` also raises ``ValueError`` when a
        selected column is not numeric.

    Notes
    -----
    `transform_table` adds one integer column per fitted input, named
    ``{column}_{n_bins}_bin_{strategy}``. Bins are right-closed: bin ``i``
    holds values in ``(edge[i], edge[i + 1]]``. The outermost fitted edges are
    not used as bounds, so values below the fitted minimum land in bin ``0``
    and values above the fitted maximum land in bin ``n_bins - 1``.

    Examples
    --------
    >>> import ibis
    >>> import ibisml as ml
    >>> from ibisml.core import Metadata
    >>> ibis.options.interactive = True

    Load penguins dataset

    >>> p = ibis.examples.penguins.fetch()

    Bin all numeric columns.

    >>> step = ml.DiscretizeKBins(ml.numeric(), n_bins=10)
    >>> step.fit_table(p, Metadata())
    >>> step.transform_table(p)

    Bin specific numeric columns.

    >>> step = ml.DiscretizeKBins(["bill_length_mm"], strategy="quantile")
    >>> step.fit_table(p, Metadata())
    >>> step.transform_table(p)
    """

    def __init__(
        self, inputs: SelectionType, *, n_bins: int = 5, strategy: str = "uniform"
    ):
        if n_bins <= 1:
            raise ValueError("Number of bins must be greater than 1.")

        if strategy not in ("uniform", "quantile"):
            # The trailing space keeps the two sentences from running together.
            raise ValueError(
                f"Unsupported strategy {strategy!r} encountered. "
                "Supported strategies are 'uniform' and 'quantile'."
            )

        self.inputs = selector(inputs)
        self.n_bins = n_bins
        self.strategy = strategy

    def _repr(self) -> Iterable[tuple[str, Any]]:
        """Yield the ``(name, value)`` pairs describing this step."""
        yield ("", self.inputs)
        yield ("n_bins", self.n_bins)
        yield ("strategy", self.strategy)

    def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
        """Compute the bin edges of every selected column and store them.

        The result is saved on ``self.bins_edge_``; it is an empty dict when
        the selection matches no columns.
        """
        columns = self.inputs.select_columns(table, metadata)
        bins_edge: dict[str, list[float]] = {}
        if columns:
            if self.strategy == "uniform":
                bins_edge = self._fit_uniform_strategy(table, columns)
            elif self.strategy == "quantile":
                bins_edge = self._fit_quantile_strategy(table, columns)
        self.bins_edge_ = bins_edge

    @staticmethod
    def _numeric_column(table: ir.Table, col_name: str) -> ir.NumericColumn:
        """Return ``table[col_name]``, raising ``ValueError`` if not numeric."""
        col = table[col_name]
        if not isinstance(col, ir.NumericColumn):
            raise ValueError(
                f"Cannot discretize {col_name!r} - this column is not numeric"
            )
        return col

    def _fit_uniform_strategy(
        self, table: ir.Table, columns: list[str]
    ) -> dict[str, list[float]]:
        """Return ``n_bins + 1`` evenly spaced edges between each column's min and max."""
        aggs = []
        for col_name in columns:
            col = self._numeric_column(table, col_name)
            aggs.append(col.max().name(f"{col_name}_max"))
            aggs.append(col.min().name(f"{col_name}_min"))

        # A single aggregate query fetches the extrema of every column at once.
        results = table.aggregate(aggs).execute().to_dict("records")[0]

        # ``tolist`` yields plain Python floats, matching the declared return
        # type and the shape produced by the quantile strategy.
        return {
            col_name: np.linspace(
                results[f"{col_name}_min"],
                results[f"{col_name}_max"],
                self.n_bins + 1,
            ).tolist()
            for col_name in columns
        }

    def _fit_quantile_strategy(
        self, table: ir.Table, columns: list[str]
    ) -> dict[str, list[float]]:
        """Return each column's quantiles at ``n_bins + 1`` evenly spaced levels in [0, 1]."""
        percentiles = np.linspace(0, 1, self.n_bins + 1).tolist()
        aggs = []
        for col_name in columns:
            col = self._numeric_column(table, col_name)
            # Aggregates are named by position so that no float representation
            # ends up embedded in a column name.
            aggs.extend(
                col.quantile(q).name(f"{col_name}_q{i}")
                for i, q in enumerate(percentiles)
            )

        results = table.aggregate(aggs).execute().to_dict("records")[0]

        return {
            col_name: [results[f"{col_name}_q{i}"] for i in range(len(percentiles))]
            for col_name in columns
        }

    def transform_table(self, table: ir.Table) -> ir.Table:
        """Append one bin-index column per fitted input column.

        Requires a prior call to `fit_table`, which sets ``self.bins_edge_``.
        """
        new_columns = {}
        for col_name, edges in self.bins_edge_.items():
            # Drop the outermost edges so that out-of-range values fall into
            # the first or last bin instead of matching no branch.
            inner = edges[1:-1]
            col = table[col_name]
            builder = ibis.case()
            # ``len`` rather than truthiness: edges may be any array-like.
            if len(inner) >= 1:
                builder = builder.when(col <= inner[0], 0)
                # The branches are mutually exclusive, so their order does not
                # affect the result.
                for i, (lower, upper) in enumerate(
                    zip(inner[:-1], inner[1:]), start=1
                ):
                    builder = builder.when((col > lower) & (col <= upper), i)
                builder = builder.when(col > inner[-1], len(inner))
            new_columns[f"{col_name}_{self.n_bins}_bin_{self.strategy}"] = (
                builder.end()
            )

        return table.mutate(**new_columns)
24 changes: 24 additions & 0 deletions tests/test_discretize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import ibis
import pandas as pd
import pandas.testing as tm
import pytest

import ibisml as ml


@pytest.mark.parametrize("strategy", ["uniform", "quantile"])
def test_discretize(strategy):
    """Fit on the integers 1..10 with 9 bins and check the bins assigned to probe values."""
    column = "col"
    n_bins = 9
    # Includes both infinities to exercise the open-ended first and last bins.
    probe_values = [float("-inf"), 1.5, 2.5, 3.5, 8.5, float("inf")]
    expected_bins = [0, 0, 1, 2, 7, 8]

    fitted = ml.DiscretizeKBins(column, n_bins=n_bins, strategy=strategy)
    fitted.fit_table(ibis.memtable({column: range(1, 11)}), ml.core.Metadata())
    actual = fitted.transform_table(ibis.memtable({column: probe_values})).execute()

    expected = pd.DataFrame(
        {
            column: probe_values,
            f"{column}_{n_bins}_bin_{strategy}": expected_bins,
        }
    )
    tm.assert_frame_equal(actual, expected, check_dtype=False)

0 comments on commit 54d5d6f

Please sign in to comment.