ibis-project · deepyaman · Apr 23, 2024 · Apr 18, 2024 · Apr 22, 2024 · Apr 23, 2024
diff --git a/ibisml/steps/encode.py b/ibisml/steps/encode.py
@@ -72,7 +72,9 @@ class OneHotEncode(Step):
     """A step for one-hot encoding select columns.
 
     The original input column is dropped, and N-category new columns are
-    created with names like ``{input_column}_{category}``.
+    created with names like ``{input_column}_{category}``. Unknown categories
+    are ignored during transformation: for a row with an unknown category,
+    the resulting one-hot encoded columns will be all zeros.
 
     Parameters
     ----------
@@ -152,7 +154,7 @@ def transform_table(self, table: ir.Table) -> ir.Table:
 
         return table.mutate(
             [
-                (table[col] == cat).cast("int8").name(f"{col}_{cat}")
+                ibis.ifelse((table[col] == cat), 1, 0).cast("int8").name(f"{col}_{cat}")
                 for col, cats in self.categories_.items()
                 for cat in cats
             ]

diff --git a/tests/test_encode.py b/tests/test_encode.py
@@ -1,11 +1,15 @@
+from functools import reduce
+
 import ibis
 import pandas as pd
+import pytest
 
 import ibisml as ml
 
 
-def test_count_encode():
-    t_train = ibis.memtable(
+@pytest.fixture()
+def t_train():
+    return ibis.memtable(
         {
             "time": [
                 pd.Timestamp("2016-05-25 13:30:00.023"),
@@ -20,7 +24,11 @@ def test_count_encode():
             "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", None, "AAPL", "GOOG", "MSFT"],
         }
     )
-    t_test = ibis.memtable(
+
+
+@pytest.fixture()
+def t_test():
+    return ibis.memtable(
         {
             "time": [
                 pd.Timestamp("2016-05-25 13:30:00.023"),
@@ -30,11 +38,57 @@ def test_count_encode():
                 pd.Timestamp("2016-05-25 13:30:00.050"),
                 pd.Timestamp("2016-05-25 13:30:00.051"),
             ],
+            # AMZN is an unknown category: it is absent from the training dataset
             "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AMZN", None],
         }
     )
 
+
+def test_count_encode(t_train, t_test):
     step = ml.CountEncode("ticker")
     step.fit_table(t_train, ml.core.Metadata())
     res = step.transform_table(t_test)
     assert res.to_pandas().sort_values(by="time").ticker.to_list() == [4, 4, 2, 2, 0, 0]
+
+
+def test_onehotencode(t_train, t_test):
+    col = "ticker"
+    step = ml.OneHotEncode(col)
+    step.fit_table(t_train, ml.core.Metadata())
+    result = step.transform_table(t_test)
+
+    encoded_cols = [
+        f"{col}_{v!s}"
+        for v in t_train.select("ticker").distinct().ticker.to_pyarrow().to_pylist()
+    ]
+
+    # Check the number of columns
+    assert (
+        len(result.columns) == len(t_test.columns) + len(encoded_cols) - 1
+    ), "Incorrect number of encoded columns"
+
+    # Ensure all encoded columns are present
+    assert set(result.columns).issuperset(
+        set(encoded_cols)
+    ), "Not all encoded columns are present"
+
+    # Verify that each encoded value is either 0 or 1
+    assert all(
+        ((result[col_name] == 0) | (result[col_name] == 1)).all().execute()
+        for col_name in encoded_cols
+    ), "Encoded values are not all 0 or 1"
+
+    # Check the sum of all encoded columns for each row
+    result = result.mutate(
+        sum_encode_per_row=reduce(
+            lambda acc, col_name: acc + result[col_name], encoded_cols, 0
+        )
+    )
+    assert result.to_pandas().sum_encode_per_row.to_list() == [
+        1,
+        1,
+        1,
+        1,
+        0,  # The 5th row's ticker "AMZN" is unknown for the training data
+        1,
+    ], "Incorrect sum of encoded columns per row per feature"