Skip to content

Commit

Permalink
[Feature] add categorical encoder for dataset (#32)
Browse files Browse the repository at this point in the history
  • Loading branch information
RektPunk authored Sep 11, 2024
1 parent 6156fda commit 41670da
Show file tree
Hide file tree
Showing 7 changed files with 122 additions and 14 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
run: poetry install --no-interaction --with dev

- name: Run tests
run: poetry run pytest --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=tests/ > pytest-coverage.txt
run: poetry run pytest --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=app tests/ | tee pytest-coverage.txt

- name: Pytest coverage comment
uses: MishaKav/pytest-coverage-comment@main
Expand Down
14 changes: 11 additions & 3 deletions mqboost/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
XdataLike,
YdataLike,
)
from mqboost.utils import alpha_validate, prepare_x, prepare_y
from mqboost.encoder import MQLabelEncoder
from mqboost.utils import alpha_validate, prepare_x, prepare_y, to_dataframe


class MQDataset:
Expand Down Expand Up @@ -56,9 +57,16 @@ def __init__(
self._train_dtype: Callable = _funcs.get(TypeName.train_dtype)
self._predict_dtype: Callable = _funcs.get(TypeName.predict_dtype)

self._data = prepare_x(x=data, alphas=self._alphas)
self._columns = self._data.columns
_data = to_dataframe(data)
self.encoders: dict[str, MQLabelEncoder] = {}
for col in _data.columns:
if _data[col].dtype == "object":
_encoder = MQLabelEncoder()
_data[col] = _encoder.fit_transform(_data[col])
self.encoders.update({col: _encoder})

self._data = prepare_x(x=_data, alphas=self._alphas)
self._columns = self._data.columns
if label is not None:
self._label = prepare_y(y=label, alphas=self._alphas)
self._is_none_label = False
Expand Down
25 changes: 25 additions & 0 deletions mqboost/encoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import numpy as np
from sklearn.preprocessing import LabelEncoder

from mqboost.base import XdataLike


class MQLabelEncoder:
    """Label encoder that reserves classes for missing and unknown values.

    Wraps ``sklearn.preprocessing.LabelEncoder`` and always registers the two
    extra labels ``"Unseen"`` and ``"NaN"`` when fitting, so that ``transform``
    can map missing entries and values that were not fitted onto them.
    """

    def __init__(self) -> None:
        self.label_encoder = LabelEncoder()

    def fit(self, series: XdataLike) -> None:
        """Fit the wrapped encoder on the non-missing values of ``series``.

        ``series`` is masked with ``.notna()``, so a pandas Series is expected
        in practice. The placeholder labels ``"Unseen"`` and ``"NaN"`` are
        appended to the fitted values.
        """
        observed = list(series[series.notna()])
        self.label_encoder.fit(observed + ["Unseen", "NaN"])

    def transform(self, series: XdataLike) -> XdataLike:
        """Encode ``series`` using the fitted classes.

        Missing entries are relabelled ``"NaN"`` and entries not among the
        fitted classes are relabelled ``"Unseen"`` before the wrapped encoder
        is applied. Returns whatever ``LabelEncoder.transform`` returns.
        """
        is_missing = series.isna()
        is_unknown = ~series.isin(self.label_encoder.classes_)
        # np.select uses the first condition that matches, so a missing entry
        # is labelled "NaN" rather than "Unseen".
        relabelled = np.select(
            [is_missing, is_unknown],
            ["NaN", "Unseen"],
            series,
        )
        return self.label_encoder.transform(relabelled)

    def fit_transform(self, series: XdataLike) -> XdataLike:
        """Fit on ``series`` and return its encoding in one call."""
        self.fit(series=series)
        return self.transform(series=series)
11 changes: 7 additions & 4 deletions mqboost/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,20 @@ def alpha_validate(
return alphas


def to_dataframe(x: XdataLike) -> pd.DataFrame:
    """Return ``x`` as a DataFrame, wrapping NumPy arrays and pandas Series.

    Inputs of any other type (for example an existing DataFrame) are returned
    unchanged, as the same object.
    """
    needs_wrapping = isinstance(x, (np.ndarray, pd.Series))
    return pd.DataFrame(x) if needs_wrapping else x


def prepare_x(
x: XdataLike,
x: pd.DataFrame,
alphas: list[float],
) -> pd.DataFrame:
"""Prepares and returns a stacked DataFrame of features repeated for each alpha, with an additional column indicating the alpha value.
Raises:
ValidationException: If the input data contains a column named '_tau'.
"""
if isinstance(x, np.ndarray) or isinstance(x, pd.Series):
x = pd.DataFrame(x)

if "_tau" in x.columns:
raise ValidationException("Column name '_tau' is not allowed.")

Expand Down
42 changes: 42 additions & 0 deletions tests/test_encoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import numpy as np
import pandas as pd
import pytest

from mqboost.encoder import MQLabelEncoder


# Test data for categorical variables
@pytest.fixture
def sample_data():
    """Series of four fruit names mixed with a ``None`` and an ``np.nan`` entry."""
    values = ["apple", "banana", "orange", None, "kiwi", np.nan]
    return pd.Series(values)


# Test data for label encoding
@pytest.fixture
def sample_label_data():
    """Integer label array ``[2, 3, 5, 0, 4, 0]``.

    NOTE(review): these look like the codes expected for ``sample_data`` when
    the encoder classes are sorted as NaN, Unseen, apple, banana, kiwi,
    orange -- confirm.
    """
    codes = [2, 3, 5, 0, 4, 0]
    return np.array(codes)


def test_fit_transform(sample_data, sample_label_data):
    """fit_transform returns one integer code per element, with the expected values."""
    encoder = MQLabelEncoder()
    transformed = encoder.fit_transform(sample_data)

    # Check that the transformed result is numeric
    assert transformed is not None
    # Accept any integer dtype rather than only the platform default `int`,
    # so the check does not depend on how the encoder builds its output array.
    assert np.issubdtype(transformed.dtype, np.integer)
    assert len(transformed) == len(sample_data)

    # Check the codes themselves: with classes sorted as
    # NaN, Unseen, apple, banana, kiwi, orange, both None and np.nan map to
    # the "NaN" class (code 0).
    assert (transformed == sample_label_data).all()


def test_unseen_and_nan_values(sample_data):
    """Unfitted values encode as "Unseen" and missing values encode as "NaN"."""
    encoder = MQLabelEncoder()
    encoder.fit(sample_data)

    # "unknown" and "melon" were not in the fitted data; None and np.nan are missing.
    new_values = pd.Series(["apple", "unknown", None, "melon", np.nan])
    actual = encoder.transform(new_values)

    # Build the expectation from the wrapped encoder's own placeholder labels.
    expected = encoder.label_encoder.transform(
        ["apple", "Unseen", "NaN", "Unseen", "NaN"]
    )
    assert (actual == expected).all()
2 changes: 1 addition & 1 deletion tests/test_objective.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def test_mqobjective_check_loss_initialization():

def test_mqobjective_huber_loss_initialization():
"""Test MQObjective initialization with huber loss."""
delta = 0.1
delta = 0.05
mq_objective = MQObjective(
alphas=alphas,
objective=ObjectiveName.huber,
Expand Down
40 changes: 35 additions & 5 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
params_validate,
prepare_x,
prepare_y,
to_dataframe,
)


Expand Down Expand Up @@ -47,6 +48,38 @@ def test_alpha_validate_raises_on_empty_alphas():
alpha_validate([])


# Test for to_dataframe
def test_to_dataframe_with_dataframe():
    """A DataFrame input comes back equal to the original frame."""
    frame = pd.DataFrame({"feature_1": [1, 2], "feature_2": [3, 4]})
    converted = to_dataframe(frame)
    pd.testing.assert_frame_equal(frame, converted)


def test_to_dataframe_with_series():
    """A Series input becomes a single-column DataFrame whose column is 0."""
    series = pd.Series([1, 2, 3])
    expected = pd.DataFrame({0: [1, 2, 3]})
    converted = to_dataframe(series)
    pd.testing.assert_frame_equal(expected, converted)


def test_to_dataframe_with_array():
    """A 2-D array input becomes a DataFrame with integer column labels 0 and 1."""
    array = np.array([[1, 2], [3, 4]])
    expected = pd.DataFrame({0: [1, 3], 1: [2, 4]})
    converted = to_dataframe(array)
    pd.testing.assert_frame_equal(expected, converted)


# Test for prepare_x
def test_prepare_x_with_dataframe():
x = pd.DataFrame(
Expand All @@ -57,7 +90,6 @@ def test_prepare_x_with_dataframe():
)
alphas = [0.1, 0.2]
result = prepare_x(x, alphas)

expected = pd.DataFrame(
{
"feature_1": [1, 2, 1, 2],
Expand All @@ -72,8 +104,7 @@ def test_prepare_x_with_dataframe():
def test_prepare_x_with_series():
x = pd.Series([1, 2, 3])
alphas = [0.1, 0.2]
result = prepare_x(x, alphas)

result = prepare_x(to_dataframe(x), alphas)
expected = pd.DataFrame(
{
0: [1, 2, 3, 1, 2, 3],
Expand All @@ -87,8 +118,7 @@ def test_prepare_x_with_series():
def test_prepare_x_with_array():
x = np.array([[1, 2], [3, 4]])
alphas = [0.1, 0.2]
result = prepare_x(x, alphas)

result = prepare_x(to_dataframe(x), alphas)
expected = pd.DataFrame(
{
0: [1, 3, 1, 3],
Expand Down

1 comment on commit 41670da

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests Skipped Failures Errors Time
86 0 💤 0 ❌ 0 🔥 6.992s ⏱️

Please sign in to comment.