Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] add categorical encoder for dataset #32

Merged
merged 7 commits into from
Sep 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
run: poetry install --no-interaction --with dev

- name: Run tests
run: poetry run pytest --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=tests/ > pytest-coverage.txt
run: poetry run pytest --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=app tests/ | tee pytest-coverage.txt

- name: Pytest coverage comment
uses: MishaKav/pytest-coverage-comment@main
Expand Down
14 changes: 11 additions & 3 deletions mqboost/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
XdataLike,
YdataLike,
)
from mqboost.utils import alpha_validate, prepare_x, prepare_y
from mqboost.encoder import MQLabelEncoder
from mqboost.utils import alpha_validate, prepare_x, prepare_y, to_dataframe


class MQDataset:
Expand Down Expand Up @@ -56,9 +57,16 @@ def __init__(
self._train_dtype: Callable = _funcs.get(TypeName.train_dtype)
self._predict_dtype: Callable = _funcs.get(TypeName.predict_dtype)

self._data = prepare_x(x=data, alphas=self._alphas)
self._columns = self._data.columns
_data = to_dataframe(data)
self.encoders: dict[str, MQLabelEncoder] = {}
for col in _data.columns:
if _data[col].dtype == "object":
_encoder = MQLabelEncoder()
_data[col] = _encoder.fit_transform(_data[col])
self.encoders.update({col: _encoder})

self._data = prepare_x(x=_data, alphas=self._alphas)
self._columns = self._data.columns
if label is not None:
self._label = prepare_y(y=label, alphas=self._alphas)
self._is_none_label = False
Expand Down
25 changes: 25 additions & 0 deletions mqboost/encoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import numpy as np
from sklearn.preprocessing import LabelEncoder

from mqboost.base import XdataLike


class MQLabelEncoder:
    """Label encoder with reserved classes for missing and unseen values.

    Wraps scikit-learn's ``LabelEncoder``. The sentinel classes ``"Unseen"``
    and ``"NaN"`` are always registered when fitting, so ``transform`` can
    map missing entries and categories outside the fitted classes onto them.

    NOTE: a real category literally named ``"Unseen"`` or ``"NaN"`` is
    indistinguishable from the corresponding sentinel.
    """

    def __init__(self) -> None:
        self.label_encoder = LabelEncoder()

    def fit(self, series: XdataLike) -> None:
        """Fit the wrapped encoder on the non-missing values plus sentinels.

        Args:
            series: Values to learn the categories from. It must support
                ``isna()`` and boolean-mask indexing (e.g. a pandas Series).
        """
        observed = list(series[~series.isna()])
        # Appending the sentinels guarantees they are always valid classes.
        self.label_encoder.fit(observed + ["Unseen", "NaN"])

    def transform(self, series: XdataLike) -> XdataLike:
        """Encode ``series`` using the fitted classes.

        Missing entries are replaced by ``"NaN"`` and values that are not
        among the fitted classes by ``"Unseen"`` before encoding.

        Args:
            series: Values to encode. It must support ``isna()`` and
                ``isin()`` (e.g. a pandas Series).

        Returns:
            The encoded labels as produced by ``LabelEncoder.transform``.
        """
        is_missing = series.isna()
        is_unknown = ~series.isin(self.label_encoder.classes_)
        # np.select picks the first matching condition, so a missing value
        # (which is also "unknown") is labelled "NaN", not "Unseen".
        normalised = np.select(
            [is_missing, is_unknown],
            ["NaN", "Unseen"],
            series,
        )
        return self.label_encoder.transform(normalised)

    def fit_transform(self, series: XdataLike) -> XdataLike:
        """Fit on ``series`` and return its encoding in one call."""
        self.fit(series=series)
        return self.transform(series=series)
11 changes: 7 additions & 4 deletions mqboost/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,20 @@ def alpha_validate(
return alphas


def to_dataframe(x: XdataLike) -> pd.DataFrame:
    """Return ``x`` as a pandas DataFrame.

    NumPy arrays and pandas Series are wrapped in a new ``pd.DataFrame``.
    Any other input is returned unchanged: it is the same object, not a
    copy, so callers that modify the result in place also modify ``x``.

    Args:
        x: Input data (array, Series, or an object that is already a
            DataFrame).

    Returns:
        A DataFrame view of the input.
    """
    if isinstance(x, (np.ndarray, pd.Series)):
        return pd.DataFrame(x)
    return x


def prepare_x(
x: XdataLike,
x: pd.DataFrame,
alphas: list[float],
) -> pd.DataFrame:
"""Prepares and returns a stacked DataFrame of features repeated for each alpha, with an additional column indicating the alpha value.
Raises:
ValidationException: If the input data contains a column named '_tau'.
"""
if isinstance(x, np.ndarray) or isinstance(x, pd.Series):
x = pd.DataFrame(x)

if "_tau" in x.columns:
raise ValidationException("Column name '_tau' is not allowed.")

Expand Down
42 changes: 42 additions & 0 deletions tests/test_encoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import numpy as np
import pandas as pd
import pytest

from mqboost.encoder import MQLabelEncoder


# Test data for categorical variables
@pytest.fixture
def sample_data():
    """Categorical series with both ``None`` and ``np.nan`` as missing entries."""
    values = ["apple", "banana", "orange", None, "kiwi", np.nan]
    return pd.Series(values)


# Test data for label encoding
@pytest.fixture
def sample_label_data():
    """Integer label array ``[2, 3, 5, 0, 4, 0]``.

    NOTE(review): presumably the expected encoding of ``sample_data``
    (classes sorted as NaN, Unseen, apple, banana, kiwi, orange) — confirm.
    """
    codes = [2, 3, 5, 0, 4, 0]
    return np.array(codes)


def test_fit_transform(sample_data, sample_label_data):
    """fit_transform returns one integer code per element, with the expected values."""
    encoder = MQLabelEncoder()
    transformed = encoder.fit_transform(sample_data)

    assert transformed is not None
    # Check for "any integer dtype" rather than ``== int``: the width of
    # NumPy's default integer is platform-dependent, so the strict equality
    # can fail even though the result is a valid integer array.
    assert np.issubdtype(transformed.dtype, np.integer)
    assert len(transformed) == len(sample_data)

    # Classes sort as NaN, Unseen, apple, banana, kiwi, orange, so the
    # fixture values are the codes for the sample series.
    np.testing.assert_array_equal(transformed, sample_label_data)


def test_unseen_and_nan_values(sample_data):
    """Unknown categories and missing values are mapped to the sentinel codes."""
    encoder = MQLabelEncoder()
    encoder.fit(sample_data)

    # "unknown" and "melon" are absent from the fitted data; None and NaN
    # are missing values.
    probe = pd.Series(["apple", "unknown", None, "melon", np.nan])
    actual = encoder.transform(probe)

    expected = encoder.label_encoder.transform(
        ["apple", "Unseen", "NaN", "Unseen", "NaN"]
    )
    assert (actual == expected).all()
2 changes: 1 addition & 1 deletion tests/test_objective.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def test_mqobjective_check_loss_initialization():

def test_mqobjective_huber_loss_initialization():
"""Test MQObjective initialization with huber loss."""
delta = 0.1
delta = 0.05
mq_objective = MQObjective(
alphas=alphas,
objective=ObjectiveName.huber,
Expand Down
40 changes: 35 additions & 5 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
params_validate,
prepare_x,
prepare_y,
to_dataframe,
)


Expand Down Expand Up @@ -47,6 +48,38 @@ def test_alpha_validate_raises_on_empty_alphas():
alpha_validate([])


# Test for to_dataframe
def test_to_dataframe_with_dataframe():
    """A DataFrame input comes back equal to itself."""
    frame = pd.DataFrame({"feature_1": [1, 2], "feature_2": [3, 4]})
    result = to_dataframe(frame)
    pd.testing.assert_frame_equal(frame, result)


def test_to_dataframe_with_series():
    """A Series becomes a single-column DataFrame named ``0``."""
    series = pd.Series([1, 2, 3])
    expected = pd.DataFrame({0: [1, 2, 3]})
    result = to_dataframe(series)
    pd.testing.assert_frame_equal(expected, result)


def test_to_dataframe_with_array():
    """A 2-D array becomes a DataFrame with integer column labels."""
    array = np.array([[1, 2], [3, 4]])
    expected = pd.DataFrame({0: [1, 3], 1: [2, 4]})
    result = to_dataframe(array)
    pd.testing.assert_frame_equal(expected, result)


# Test for prepare_x
def test_prepare_x_with_dataframe():
x = pd.DataFrame(
Expand All @@ -57,7 +90,6 @@ def test_prepare_x_with_dataframe():
)
alphas = [0.1, 0.2]
result = prepare_x(x, alphas)

expected = pd.DataFrame(
{
"feature_1": [1, 2, 1, 2],
Expand All @@ -72,8 +104,7 @@ def test_prepare_x_with_dataframe():
def test_prepare_x_with_series():
x = pd.Series([1, 2, 3])
alphas = [0.1, 0.2]
result = prepare_x(x, alphas)

result = prepare_x(to_dataframe(x), alphas)
expected = pd.DataFrame(
{
0: [1, 2, 3, 1, 2, 3],
Expand All @@ -87,8 +118,7 @@ def test_prepare_x_with_series():
def test_prepare_x_with_array():
x = np.array([[1, 2], [3, 4]])
alphas = [0.1, 0.2]
result = prepare_x(x, alphas)

result = prepare_x(to_dataframe(x), alphas)
expected = pd.DataFrame(
{
0: [1, 3, 1, 3],
Expand Down
Loading