Skip to content

Commit

Permalink
feat(steps): implement zero-variance filter (#48)
Browse files Browse the repository at this point in the history
  • Loading branch information
jitingxu1 authored Apr 5, 2024
1 parent 913a543 commit 3e64906
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 0 deletions.
2 changes: 2 additions & 0 deletions ibisml/steps/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ibisml.steps.common import Cast, Drop, Mutate, MutateAt
from ibisml.steps.encode import CategoricalEncode, CountEncode, OneHotEncode
from ibisml.steps.feature_selection import ZeroVariance
from ibisml.steps.impute import FillNA, ImputeMean, ImputeMedian, ImputeMode
from ibisml.steps.standardize import ScaleMinMax, ScaleStandard
from ibisml.steps.temporal import ExpandDate, ExpandDateTime, ExpandTime
Expand All @@ -21,4 +22,5 @@
"OneHotEncode",
"ScaleMinMax",
"ScaleStandard",
"ZeroVariance",
)
74 changes: 74 additions & 0 deletions ibisml/steps/feature_selection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from __future__ import annotations

from typing import Any, Iterable

import ibis.expr.types as ir

from ibisml.core import Metadata, Step
from ibisml.select import SelectionType, selector


class ZeroVariance(Step):
    """A step that drops columns whose values do not vary.

    Numeric columns are judged by their variance; every other column is
    judged by its number of distinct values.

    Parameters
    ----------
    inputs : SelectionType
        A selection of columns to analyze for zero variance.
    tolerance : int | float, optional
        Threshold applied to numeric columns only. A numeric column whose
        variance is strictly less than this value is removed.
        Default is 1e-4.

    Examples
    --------
    >>> import ibisml as ml

    To remove columns with zero variance:

    >>> step = ml.ZeroVariance(ml.everything())

    To remove all numeric columns with zero variance:

    >>> step = ml.ZeroVariance(ml.numeric())

    To remove all string or categorical columns with only one unique value:

    >>> step = ml.ZeroVariance(ml.nominal())
    """

    def __init__(self, inputs: SelectionType, *, tolerance: int | float = 1e-4):
        self.inputs = selector(inputs)
        self.tolerance = tolerance

    def _repr(self) -> Iterable[tuple[str, Any]]:
        """Yield the ``(label, value)`` pairs that describe this step."""
        yield "", self.inputs
        yield "tolerance", self.tolerance

    def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
        """Decide which of the selected columns to drop and store them in ``cols_``.

        All statistics are gathered in a single aggregation that is executed
        once against ``table``.
        """
        selected = self.inputs.select_columns(table, metadata)
        if not selected:
            # Nothing selected: skip the query entirely.
            self.cols_ = []
            return

        # Classify each column once; the answer picks both the statistic to
        # compute and the threshold it is later compared against.
        is_numeric = {
            name: isinstance(table[name], ir.NumericColumn) for name in selected
        }

        # Numeric columns -> variance. Other columns -> distinct count
        # (NULL values are not counted by nunique()).
        stats_exprs = [
            (table[name].var() if is_numeric[name] else table[name].nunique()).name(
                f"{name}_var"
            )
            for name in selected
        ]
        stats = table.aggregate(stats_exprs).execute().to_dict("records")[0]

        # NOTE(review): a variance that comes back as NaN would compare False
        # here and the column would be kept -- confirm this is intended.
        self.cols_ = [
            name
            for name in selected
            # Fewer than 2 distinct values means a non-numeric column is constant.
            if stats[f"{name}_var"] < (self.tolerance if is_numeric[name] else 2)
        ]

    def transform_table(self, table: ir.Table) -> ir.Table:
        """Return ``table`` without the columns chosen during fitting."""
        return table.drop(self.cols_)
49 changes: 49 additions & 0 deletions tests/test_feature_selection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import ibis
import pandas as pd

import ibisml as ml


def test_zero_variance():
    """ZeroVariance drops constant numeric, string and timestamp columns."""
    n_rows = 10
    base_ts = pd.Timestamp("2000-01-01 00:00:00.000")

    # One constant and one varying column per dtype family; key order matters
    # because it becomes the column order of both tables.
    train_data = {
        "zero_variance_numeric_col": [1.0] * n_rows,
        "non_zero_variance_numeric_col": list(range(n_rows)),
        "zero_variance_string_col": ["String"] * n_rows,
        "non_zero_variance_string_col": [f"String_{i}" for i in range(n_rows)],
        "zero_variance_timestamp_col": [base_ts] * n_rows,
        "non_zero_variance_timestamp_col": [
            base_ts + pd.Timedelta(minutes=i) for i in range(n_rows)
        ],
    }
    expected_dropped = {
        name for name in train_data if name.startswith("zero_variance")
    }

    t_train = ibis.memtable(train_data)
    # The table being transformed only needs matching column names, not rows.
    t_test = ibis.memtable({name: [] for name in train_data})

    step = ml.ZeroVariance(ml.everything())
    step.fit_table(t_train, ml.core.Metadata())

    res = step.transform_table(t_test)
    sol = t_test.drop(expected_dropped)
    assert sol.equals(res)

0 comments on commit 3e64906

Please sign in to comment.