Skip to content

Commit

Permalink
Add unit tests for prepare_categorical_data (#5)
Browse files Browse the repository at this point in the history
* Add unit tests for prepare_categorical_data

* Fix for high-cardinality nominal feature

* Add support for discrete numerical features

* Remove handling of numerical features

* Move test values to conftest.py

* Add check_x to transformers

* Improve project structure

* Fix workflows

* Codecov fix

* Add CodeCov

* Add files via upload

* Add project files

* Exchange links in readme file

* Exchange images

* Fix failed tests due to changes made in the PR

* Add MathExpressionTransformer and ValueIndicatorTransformer

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Setup dependabot

* Fix EOF errors

* Fix isort error

* Fix black errors

* Fix all precommit errors

* Bump version number

Co-authored-by: Prem Srinivasan <prem.srinivasan@invia.de>
Co-authored-by: Chris Lemke <11752694+chrislemke@users.noreply.github.com>
  • Loading branch information
3 people authored Sep 25, 2022
1 parent ed98f9a commit 4950acf
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 20 deletions.
2 changes: 1 addition & 1 deletion feature_reviser/feature_selection/reviser.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def revise_classifier(
X = prepare_categorical_data(X, cat_features)

cat_df = X.select_dtypes(include=["category"])
num_df = X.select_dtypes(include=[np.float32])
num_df = X.select_dtypes(include=[np.float64, np.int64])

if len(X.select_dtypes(include=["category"]).columns) == 0:
raise ValueError(
Expand Down
4 changes: 2 additions & 2 deletions feature_reviser/feature_selection/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def select_with_classifier(

if cat_features:
# pylint: disable=consider-using-set-comprehension
if not set([f[0] for f in cat_features]).issubset(set(X.columns)):
if not {f[0] for f in cat_features}.issubset(set(X.columns)):
raise ValueError("cat_features must be in the dataframe!")
X = prepare_categorical_data(X, cat_features)

Expand All @@ -77,7 +77,7 @@ def select_with_classifier(
)

cat_df = X.select_dtypes(include=["category"])
num_df = X.select_dtypes(include=[np.float32])
num_df = X.select_dtypes(include=[np.float64, np.int64])

print("Selecting categorical features...")
cat_transformer = SelectKBest(chi2, k=min(cat_k_best, cat_df.shape[1] - 1)).fit(
Expand Down
14 changes: 4 additions & 10 deletions feature_reviser/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

from typing import List, Tuple

import numpy as np
import pandas as pd


Expand Down Expand Up @@ -42,12 +41,11 @@ def prepare_categorical_data(
Args:
X (pandas.DataFrame): The dataframe containing the categorical features.
categories (List[Tuple[str, int]]): The list of categorical features and their thresholds.
If the number of unique values is greater than the threshold, the feature is considered numerical and not categorical.
If the feature is not of `object` dtype or its number of unique values is greater than the threshold, the feature is not considered categorical.
Raises:
TypeError: If the features are not a `pandas.DataFrame` or the categorical features are not a `List[Tuple[str, int]]`.
ValueError: If the categorical features are not in the dataframe.
ValueError: If the dataframe does not contain any categorical features.
Returns:
pandas.DataFrame: The original dataframe with the categorical features converted to `category` dtype.
Expand All @@ -59,21 +57,17 @@ def prepare_categorical_data(
if not set(set(cat_features)).issubset(set(X.columns)):
raise ValueError("cat_features must be in the dataframe!")

cont_features = []
for feature, threshold in categories:
if X[feature].nunique() > threshold:
cont_features.append(feature)
if (str(X[feature].dtype) != "object") or (X[feature].nunique() > threshold):
cat_features.remove(feature)
print(
f"""{feature} has less unique vlaues that {threshold}.
So it is not a categorical feature and will be handled as a numerical feature."""
f"""{feature} has fewer unique values than {threshold}.
So it will not be converted to Category dtype."""
)

pd.options.mode.chained_assignment = None
for column in X.columns:
if column in cat_features:
X[column] = X[column].astype("category").copy()
else:
X[column] = X[column].astype(np.float32).copy()

return X
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "feature-reviser"
version = "0.3.6"
version = "0.3.7"
description = "Construct and find the right features of your dataset for the right model."
authors = ["Christopher Lemke <chris@syhbl.mozmail.com>"]
license = "MIT"
Expand Down
19 changes: 16 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ def X() -> pd.DataFrame:
{
"a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
"b": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1],
"c": [1, 1, 1, 1, 2, 2, 2, 3, 3, 4],
"d": [5, 5, 5, 6, 6, 6, 6, 7, 7, 7],
"e": [5, 5, 5, 5, 5, 5, 5, 5, 7, 7],
"c": ["1", "1", "1", "1", "2", "2", "2", "3", "3", "4"],
"d": ["5", "5", "5", "6", "6", "6", "6", "7", "7", "7"],
"e": ["5", "5", "5", "5", "5", "5", "5", "5", "7", "7"],
}
)

Expand Down Expand Up @@ -162,3 +162,16 @@ def X_strings() -> pd.DataFrame:
@pytest.fixture()
def y() -> pd.Series:
    """Provide a ten-element binary target: four ones followed by six zeros."""
    labels = [1] * 4 + [0] * 6
    return pd.Series(labels)


@pytest.fixture()
def X_categorical() -> pd.DataFrame:
    """Provide an eight-row frame mixing string and numeric columns.

    Columns "a" and "e" hold strings (2 and 8 distinct values respectively),
    "b" and "c" hold integers (8 and 3 distinct values), and "d" is a numeric
    column whose first entry is a float.
    """
    columns = {
        "a": ["A1", "A2", "A2", "A1", "A1", "A2", "A1", "A1"],
        "b": [1, 2, 3, 4, 5, 6, 7, 8],
        "c": [1, 2, 3, 1, 2, 3, 1, 3],
        "d": [1.1, 2, 3, 4, 5, 6, 7, 8],
        "e": ["A", "B", "C", "D", "E", "F", "G", "H"],
    }
    return pd.DataFrame(columns)
4 changes: 2 additions & 2 deletions tests/test_feature_selection/test_reviser.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,5 +51,5 @@ def test_revise_classifier_chi2(clf, X, y) -> None:

def test_revise_classifier_f_statistic(clf, X, y) -> None:
result = revise_classifier(clf, X, y, [("c", 10), ("d", 10), ("e", 10)])
assert result[1].iloc[0][0] == 21.33333396911621
assert result[1].iloc[0][1] == 23.945640563964844
assert result[1].iloc[0][0] == 21.333333333333332
assert result[1].iloc[0][1] == 23.9456209150327
32 changes: 31 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pandas as pd
import pytest

from feature_reviser.utils import check_data
from feature_reviser.utils import check_data, prepare_categorical_data

# pylint: disable=missing-function-docstring

Expand Down Expand Up @@ -34,3 +34,33 @@ def test_check_data_y_nan() -> None:
check_data(pd.DataFrame([1, 2, 3]), pd.Series([1, None, 3]))

assert "y must not contain NaN values!" == str(error.value)


def test_prepare_categorical_data_x_type() -> None:
    """Passing a string instead of a dataframe must raise a TypeError."""
    not_a_frame = "wrong_type"
    thresholds = [("a", 1), ("b", 2)]

    with pytest.raises(TypeError) as excinfo:
        prepare_categorical_data(not_a_frame, thresholds)

    # The message is checked for exact equality, not just a substring match.
    assert str(excinfo.value) == "features must be a pandas.DataFrame!"


def test_prepare_categorical_data_x_value(X_categorical) -> None:
    """Requesting the feature "f" on the X_categorical fixture must raise a ValueError."""
    unknown_feature = [("f", 1)]

    with pytest.raises(ValueError) as excinfo:
        prepare_categorical_data(X_categorical, unknown_feature)

    # The message is checked for exact equality, not just a substring match.
    assert str(excinfo.value) == "cat_features must be in the dataframe!"


def test_prepare_categorical_data(X_categorical) -> None:
    """Check the resulting column dtypes after preparing the fixture frame.

    Only column "a" is expected to end up as `category`; the remaining
    columns are expected to keep their `int64`, `float64` and `object` dtypes.
    """
    thresholds = [("a", 2), ("b", 3), ("c", 3), ("d", 3), ("e", 3)]
    expected_dtypes = pd.Series(
        ["category", "int64", "int64", "float64", "object"],
        index=X_categorical.columns,
    )

    actual_dtypes = prepare_categorical_data(X_categorical, thresholds).dtypes

    assert actual_dtypes.equals(expected_dtypes)

0 comments on commit 4950acf

Please sign in to comment.