[Feature] add categorical encoder for dataset (#32)

RektPunk · Sep 11, 2024 · 41670da · 41670da · github-actions · Sep 11, 2024
1 parent 6156fda
commit 41670da
Show file tree

Hide file tree

Showing 7 changed files with 122 additions and 14 deletions.
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -27,7 +27,7 @@ jobs:
       run: poetry install --no-interaction --with dev
 
     - name: Run tests
-      run: poetry run pytest --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=tests/ > pytest-coverage.txt
+      run: poetry run pytest --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=app tests/ | tee pytest-coverage.txt
 
     - name: Pytest coverage comment
       uses: MishaKav/pytest-coverage-comment@main

diff --git a/mqboost/dataset.py b/mqboost/dataset.py
@@ -12,7 +12,8 @@
     XdataLike,
     YdataLike,
 )
-from mqboost.utils import alpha_validate, prepare_x, prepare_y
+from mqboost.encoder import MQLabelEncoder
+from mqboost.utils import alpha_validate, prepare_x, prepare_y, to_dataframe
 
 
 class MQDataset:
@@ -56,9 +57,16 @@ def __init__(
         self._train_dtype: Callable = _funcs.get(TypeName.train_dtype)
         self._predict_dtype: Callable = _funcs.get(TypeName.predict_dtype)
 
-        self._data = prepare_x(x=data, alphas=self._alphas)
-        self._columns = self._data.columns
+        _data = to_dataframe(data)
+        self.encoders: dict[str, MQLabelEncoder] = {}
+        for col in _data.columns:
+            if _data[col].dtype == "object":
+                _encoder = MQLabelEncoder()
+                _data[col] = _encoder.fit_transform(_data[col])
+                self.encoders.update({col: _encoder})
 
+        self._data = prepare_x(x=_data, alphas=self._alphas)
+        self._columns = self._data.columns
         if label is not None:
             self._label = prepare_y(y=label, alphas=self._alphas)
             self._is_none_label = False

diff --git a/mqboost/encoder.py b/mqboost/encoder.py
@@ -0,0 +1,25 @@
+import numpy as np
+from sklearn.preprocessing import LabelEncoder
+
+from mqboost.base import XdataLike
+
+
+class MQLabelEncoder:
+    def __init__(self) -> None:
+        self.label_encoder = LabelEncoder()
+
+    def fit(self, series: XdataLike) -> None:
+        self.label_encoder.fit(list(series[~series.isna()]) + ["Unseen", "NaN"])
+
+    def transform(self, series: XdataLike) -> XdataLike:
+        return self.label_encoder.transform(
+            np.select(
+                [series.isna(), ~series.isin(self.label_encoder.classes_)],
+                ["NaN", "Unseen"],
+                series,
+            )
+        )
+
+    def fit_transform(self, series: XdataLike) -> XdataLike:
+        self.fit(series=series)
+        return self.transform(series=series)
diff --git a/mqboost/utils.py b/mqboost/utils.py
@@ -39,17 +39,20 @@ def alpha_validate(
     return alphas
 
 
+def to_dataframe(x: XdataLike) -> pd.DataFrame:
+    if isinstance(x, np.ndarray) or isinstance(x, pd.Series):
+        x = pd.DataFrame(x)
+    return x
+
+
 def prepare_x(
-    x: XdataLike,
+    x: pd.DataFrame,
     alphas: list[float],
 ) -> pd.DataFrame:
     """Prepares and returns a stacked DataFrame of features repeated for each alpha, with an additional column indicating the alpha value.
     Raises:
         ValidationException: If the input data contains a column named '_tau'.
     """
-    if isinstance(x, np.ndarray) or isinstance(x, pd.Series):
-        x = pd.DataFrame(x)
-
     if "_tau" in x.columns:
         raise ValidationException("Column name '_tau' is not allowed.")
 

diff --git a/tests/test_encoder.py b/tests/test_encoder.py
@@ -0,0 +1,42 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from mqboost.encoder import MQLabelEncoder
+
+
+# Test data for categorical variables
+@pytest.fixture
+def sample_data():
+    return pd.Series(["apple", "banana", "orange", None, "kiwi", np.nan])
+
+
+# Test data for label encoding
+@pytest.fixture
+def sample_label_data():
+    return np.array([2, 3, 5, 0, 4, 0])
+
+
+def test_fit_transform(sample_data):
+    encoder = MQLabelEncoder()
+    transformed = encoder.fit_transform(sample_data)
+
+    # Check that the transformed result is numeric
+    assert transformed is not None
+    assert transformed.dtype == int
+    assert len(transformed) == len(sample_data)
+
+
+def test_unseen_and_nan_values(sample_data):
+    encoder = MQLabelEncoder()
+    encoder.fit(sample_data)
+
+    # Include new unseen value and check behavior
+    test_data = pd.Series(["apple", "unknown", None, "melon", np.nan])
+    transformed = encoder.transform(test_data)
+
+    # Check for correct handling of unseen and NaN values
+    assert (
+        transformed
+        == encoder.label_encoder.transform(["apple", "Unseen", "NaN", "Unseen", "NaN"])
+    ).all()
diff --git a/tests/test_objective.py b/tests/test_objective.py
@@ -49,7 +49,7 @@ def test_mqobjective_check_loss_initialization():
 
 def test_mqobjective_huber_loss_initialization():
     """Test MQObjective initialization with huber loss."""
-    delta = 0.1
+    delta = 0.05
     mq_objective = MQObjective(
         alphas=alphas,
         objective=ObjectiveName.huber,

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -9,6 +9,7 @@
     params_validate,
     prepare_x,
     prepare_y,
+    to_dataframe,
 )
 
 
@@ -47,6 +48,38 @@ def test_alpha_validate_raises_on_empty_alphas():
         alpha_validate([])
 
 
+# Test for to_dataframe
+def test_to_dataframe_with_dataframe():
+    x = pd.DataFrame(
+        {
+            "feature_1": [1, 2],
+            "feature_2": [3, 4],
+        }
+    )
+    pd.testing.assert_frame_equal(x, to_dataframe(x))
+
+
+def test_to_dataframe_with_series():
+    x = pd.Series([1, 2, 3])
+    expected = pd.DataFrame(
+        {
+            0: [1, 2, 3],
+        }
+    )
+    pd.testing.assert_frame_equal(expected, to_dataframe(x))
+
+
+def test_to_dataframe_with_array():
+    x = np.array([[1, 2], [3, 4]])
+    expected = pd.DataFrame(
+        {
+            0: [1, 3],
+            1: [2, 4],
+        }
+    )
+    pd.testing.assert_frame_equal(expected, to_dataframe(x))
+
+
 # Test for prepare_x
 def test_prepare_x_with_dataframe():
     x = pd.DataFrame(
@@ -57,7 +90,6 @@ def test_prepare_x_with_dataframe():
     )
     alphas = [0.1, 0.2]
     result = prepare_x(x, alphas)
-
     expected = pd.DataFrame(
         {
             "feature_1": [1, 2, 1, 2],
@@ -72,8 +104,7 @@ def test_prepare_x_with_dataframe():
 def test_prepare_x_with_series():
     x = pd.Series([1, 2, 3])
     alphas = [0.1, 0.2]
-    result = prepare_x(x, alphas)
-
+    result = prepare_x(to_dataframe(x), alphas)
     expected = pd.DataFrame(
         {
             0: [1, 2, 3, 1, 2, 3],
@@ -87,8 +118,7 @@ def test_prepare_x_with_series():
 def test_prepare_x_with_array():
     x = np.array([[1, 2], [3, 4]])
     alphas = [0.1, 0.2]
-    result = prepare_x(x, alphas)
-
+    result = prepare_x(to_dataframe(x), alphas)
     expected = pd.DataFrame(
         {
             0: [1, 3, 1, 3],