From 3e649063a313356666b21908c78cb1d708dd44b2 Mon Sep 17 00:00:00 2001
From: Jiting Xu <126802425+jitingxu1@users.noreply.github.com>
Date: Fri, 5 Apr 2024 11:44:28 -0700
Subject: [PATCH] feat(steps): implement zero-variance filter (#48)

---
 ibisml/steps/__init__.py          |  2 +
 ibisml/steps/feature_selection.py | 96 +++++++++++++++++++++++++++++++
 tests/test_feature_selection.py   | 49 ++++++++++++++++
 3 files changed, 147 insertions(+)
 create mode 100644 ibisml/steps/feature_selection.py
 create mode 100644 tests/test_feature_selection.py

diff --git a/ibisml/steps/__init__.py b/ibisml/steps/__init__.py
index 8b6d7c7..bdb71d2 100644
--- a/ibisml/steps/__init__.py
+++ b/ibisml/steps/__init__.py
@@ -1,5 +1,6 @@
 from ibisml.steps.common import Cast, Drop, Mutate, MutateAt
 from ibisml.steps.encode import CategoricalEncode, CountEncode, OneHotEncode
+from ibisml.steps.feature_selection import ZeroVariance
 from ibisml.steps.impute import FillNA, ImputeMean, ImputeMedian, ImputeMode
 from ibisml.steps.standardize import ScaleMinMax, ScaleStandard
 from ibisml.steps.temporal import ExpandDate, ExpandDateTime, ExpandTime
@@ -21,4 +22,5 @@
     "OneHotEncode",
     "ScaleMinMax",
     "ScaleStandard",
+    "ZeroVariance",
 )
diff --git a/ibisml/steps/feature_selection.py b/ibisml/steps/feature_selection.py
new file mode 100644
index 0000000..fb99648
--- /dev/null
+++ b/ibisml/steps/feature_selection.py
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+from typing import Any, Iterable
+
+import ibis.expr.types as ir
+
+from ibisml.core import Metadata, Step
+from ibisml.select import SelectionType, selector
+
+
+def _is_null(value: Any) -> bool:
+    """Return True if an aggregate result is NULL (``None`` or NaN)."""
+    # NaN is the only value that compares unequal to itself.
+    return value is None or bool(value != value)
+
+
+class ZeroVariance(Step):
+    """A step for removing columns with zero variance.
+
+    Numeric columns are removed when their variance is below `tolerance`
+    or is NULL (for example, when the column holds no non-NULL values).
+    All other columns are removed when they hold fewer than two distinct
+    non-NULL values.
+
+    Parameters
+    ----------
+    inputs : SelectionType
+        A selection of columns to analyze for zero variance.
+    tolerance : int | float, optional
+        Tolerance level for considering variance as zero.
+        Columns with variance less than this tolerance will be removed.
+        Default is 1e-4.
+
+    Examples
+    --------
+    >>> import ibisml as ml
+
+    To remove columns with zero variance:
+
+    >>> step = ml.ZeroVariance(ml.everything())
+
+    To remove all numeric columns with zero variance:
+
+    >>> step = ml.ZeroVariance(ml.numeric())
+
+    To remove all string or categorical columns with only one unique value:
+
+    >>> step = ml.ZeroVariance(ml.nominal())
+    """
+
+    def __init__(self, inputs: SelectionType, *, tolerance: int | float = 1e-4):
+        self.inputs = selector(inputs)
+        self.tolerance = tolerance
+
+    def _repr(self) -> Iterable[tuple[str, Any]]:
+        """Yield ``(label, value)`` pairs describing this step's parameters."""
+        yield ("", self.inputs)
+        yield ("tolerance", self.tolerance)
+
+    def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
+        """Find the selected columns to drop and store them in ``cols_``.
+
+        One aggregation computes the variance of each numeric column and
+        the number of distinct non-NULL values of every other column.
+        """
+        columns = self.inputs.select_columns(table, metadata)
+        cols = []
+        if columns:
+            numeric = set()
+            aggs = []
+            for name in columns:
+                c = table[name]
+                if isinstance(c, ir.NumericColumn):
+                    numeric.add(name)
+                    aggs.append(c.var().name(f"{name}_var"))
+                else:
+                    # NULL value is not counted in nunique()
+                    aggs.append(c.nunique().name(f"{name}_var"))
+
+            results = table.aggregate(aggs).execute().to_dict("records")[0]
+            for name in columns:
+                stat = results[f"{name}_var"]
+                if name in numeric:
+                    # A NULL variance would otherwise compare as "not
+                    # below tolerance" (NaN) or raise (None); treat it
+                    # as zero variance, like an all-NULL nominal column.
+                    if _is_null(stat) or stat < self.tolerance:
+                        cols.append(name)
+                elif stat < 2:
+                    cols.append(name)
+
+        self.cols_ = cols
+
+    def transform_table(self, table: ir.Table) -> ir.Table:
+        """Return `table` without the columns found by `fit_table`."""
+        return table.drop(self.cols_)
diff --git a/tests/test_feature_selection.py b/tests/test_feature_selection.py
new file mode 100644
index 0000000..be7191f
--- /dev/null
+++ b/tests/test_feature_selection.py
@@ -0,0 +1,49 @@
+import ibis
+import pandas as pd
+
+import ibisml as ml
+
+
+def test_zero_variance():
+    zv_numeric_col = [1.0] * 10
+    non_zv_numeric_col = list(range(10))
+    zv_string_col = ["String"] * 10
+    non_zv_string_col = [f"String_{i}" for i in range(10)]
+    start_timestamp = pd.Timestamp("2000-01-01 00:00:00.000")
+    zv_timestamp_col = [start_timestamp] * 10
+    non_zv_timestamp_col = [
+        start_timestamp + pd.Timedelta(minutes=i) for i in range(10)
+    ]
+
+    zv_cols = {
+        "zero_variance_numeric_col",
+        "zero_variance_string_col",
+        "zero_variance_timestamp_col",
+    }
+
+    t_train = ibis.memtable(
+        {
+            "zero_variance_numeric_col": zv_numeric_col,
+            "non_zero_variance_numeric_col": non_zv_numeric_col,
+            "zero_variance_string_col": zv_string_col,
+            "non_zero_variance_string_col": non_zv_string_col,
+            "zero_variance_timestamp_col": zv_timestamp_col,
+            "non_zero_variance_timestamp_col": non_zv_timestamp_col,
+        }
+    )
+    t_test = ibis.memtable(
+        {
+            "zero_variance_numeric_col": [],
+            "non_zero_variance_numeric_col": [],
+            "zero_variance_string_col": [],
+            "non_zero_variance_string_col": [],
+            "zero_variance_timestamp_col": [],
+            "non_zero_variance_timestamp_col": [],
+        }
+    )
+
+    step = ml.ZeroVariance(ml.everything())
+    step.fit_table(t_train, ml.core.Metadata())
+    res = step.transform_table(t_test)
+    sol = t_test.drop(zv_cols)
+    assert sol.equals(res)