feat(steps): implement zero-variance filter (#48)

ibis-project · Apr 5, 2024 · 3e64906 · 3e64906
1 parent 913a543
commit 3e64906
Show file tree

Hide file tree

Showing 3 changed files with 125 additions and 0 deletions.
diff --git a/ibisml/steps/__init__.py b/ibisml/steps/__init__.py
@@ -1,5 +1,6 @@
 from ibisml.steps.common import Cast, Drop, Mutate, MutateAt
 from ibisml.steps.encode import CategoricalEncode, CountEncode, OneHotEncode
+from ibisml.steps.feature_selection import ZeroVariance
 from ibisml.steps.impute import FillNA, ImputeMean, ImputeMedian, ImputeMode
 from ibisml.steps.standardize import ScaleMinMax, ScaleStandard
 from ibisml.steps.temporal import ExpandDate, ExpandDateTime, ExpandTime
@@ -21,4 +22,5 @@
     "OneHotEncode",
     "ScaleMinMax",
     "ScaleStandard",
+    "ZeroVariance",
 )
diff --git a/ibisml/steps/feature_selection.py b/ibisml/steps/feature_selection.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+from typing import Any, Iterable
+
+import ibis.expr.types as ir
+
+from ibisml.core import Metadata, Step
+from ibisml.select import SelectionType, selector
+
+
+class ZeroVariance(Step):
+    """A step for removing columns with zero variance.
+
+    Numeric columns are removed when their variance is below `tolerance` or
+    undefined (NULL/NaN, as can happen for an all-NULL column); other columns
+    are removed when they have fewer than two distinct non-NULL values.
+
+    Parameters
+    ----------
+    inputs : SelectionType
+        A selection of columns to analyze for zero variance.
+    tolerance : int | float, optional
+        Tolerance level for considering variance as zero.
+        Numeric columns with variance less than this tolerance will be
+        removed. Default is 1e-4.
+
+    Examples
+    --------
+    >>> import ibisml as ml
+
+    To remove columns with zero variance:
+    >>> step = ml.ZeroVariance(ml.everything())
+
+    To remove all numeric columns with zero variance:
+    >>> step = ml.ZeroVariance(ml.numeric())
+
+    To remove all string or categorical columns with only one unique value:
+    >>> step = ml.ZeroVariance(ml.nominal())
+    """
+
+    def __init__(self, inputs: SelectionType, *, tolerance: int | float = 1e-4):
+        self.inputs = selector(inputs)
+        self.tolerance = tolerance
+
+    def _repr(self) -> Iterable[tuple[str, Any]]:
+        yield ("", self.inputs)
+        yield ("tolerance", self.tolerance)
+
+    def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
+        """Find the selected columns to remove and store them in `cols_`."""
+        columns = self.inputs.select_columns(table, metadata)
+        cols = []
+        if columns:
+            numeric = {n for n in columns if isinstance(table[n], ir.NumericColumn)}
+            # Variance for numeric columns, distinct non-NULL count for the rest
+            aggs = []
+            for name in columns:
+                c = table[name]
+                stat = c.var() if name in numeric else c.nunique()
+                aggs.append(stat.name(f"{name}_var"))
+
+            results = table.aggregate(aggs).execute().to_dict("records")[0]
+            for name in columns:
+                value = results[f"{name}_var"]
+                limit = self.tolerance if name in numeric else 2
+                # Negated ">=" so an undefined (NULL/NaN) variance is removed too
+                if value is None or not (value >= limit):
+                    cols.append(name)
+
+        self.cols_ = cols
+
+    def transform_table(self, table: ir.Table) -> ir.Table:
+        """Return `table` without the columns recorded by `fit_table`."""
+        return table.drop(self.cols_)
diff --git a/tests/test_feature_selection.py b/tests/test_feature_selection.py
@@ -0,0 +1,49 @@
+import ibis
+import pandas as pd
+
+import ibisml as ml
+
+
+def test_zero_variance():
+    """Check that a fitted ZeroVariance drops only the constant columns.
+
+    The training table pairs a constant and a varying column for each of the
+    numeric, string and timestamp types. The fitted step is then applied to
+    an empty table with the same column names, and the result is compared
+    with dropping the constant columns by hand.
+    """
+    n_rows = 10
+    t0 = pd.Timestamp("2000-01-01 00:00:00.000")
+
+    # Constant columns repeat one value; varying columns hold n_rows
+    # different values
+    train_data = {
+        "zero_variance_numeric_col": [1.0] * n_rows,
+        "non_zero_variance_numeric_col": list(range(n_rows)),
+        "zero_variance_string_col": ["String"] * n_rows,
+        "non_zero_variance_string_col": [f"String_{i}" for i in range(n_rows)],
+        "zero_variance_timestamp_col": [t0] * n_rows,
+        "non_zero_variance_timestamp_col": [
+            t0 + pd.Timedelta(minutes=i) for i in range(n_rows)
+        ],
+    }
+
+    # Only the names starting with "zero_variance_" belong to constant columns
+    expected_dropped = {
+        name for name in train_data if name.startswith("zero_variance_")
+    }
+
+    train = ibis.memtable(train_data)
+    # Same column names as the training table, but no rows
+    empty = ibis.memtable({name: [] for name in train_data})
+
+    # Fit on the populated table, then transform the empty one
+    metadata = ml.core.Metadata()
+    step = ml.ZeroVariance(ml.everything())
+    step.fit_table(train, metadata)
+    actual = step.transform_table(empty)
+
+    # Reference result: drop the constant columns directly
+    expected = empty.drop(expected_dropped)
+    assert expected.equals(actual)