Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add unit tests for prepare_categorical_data #5

Merged
merged 27 commits into from
Sep 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
6741255
Add unit tests for prepare_categorical_data
premsrii Sep 23, 2022
11c9075
Fix for high cardinal nominal feature
premsrii Sep 23, 2022
110c53f
Add support for discrete numerical features
premsrii Sep 23, 2022
f46344f
Remove handling of numerical features
premsrii Sep 24, 2022
aa6e224
Move test values to conftest.py
premsrii Sep 24, 2022
48e85bb
Add check_x to transformers
chrislemke Sep 23, 2022
36e4c4b
Improve project structure
chrislemke Sep 24, 2022
fd5a659
Fix workflows
chrislemke Sep 24, 2022
8101344
Codecov fix
chrislemke Sep 24, 2022
e2e8fac
Add CodeCov
chrislemke Sep 24, 2022
46549a0
Add files via upload
chrislemke Sep 24, 2022
1a1c722
Add project files
chrislemke Sep 24, 2022
bd5573e
Exchange links in readme file
chrislemke Sep 24, 2022
3f78546
Exchange images
chrislemke Sep 24, 2022
cc2f68a
Fix failed tests due to changes made in the PR
premsrii Sep 25, 2022
6fb7740
Add MathExpressionTransformer and ValueIndicatorTransformer
chrislemke Sep 25, 2022
bc83f38
Update README.md
chrislemke Sep 25, 2022
2e498e1
Update README.md
chrislemke Sep 25, 2022
5d36161
Update README.md
chrislemke Sep 25, 2022
f189a7b
Update README.md
chrislemke Sep 25, 2022
7fd9127
Setup dependabot
chrislemke Sep 25, 2022
5360477
Merge branch 'main' into prem
premsrii Sep 25, 2022
16b7328
Fix EOF errors
premsrii Sep 25, 2022
ae0bf21
Fix isort error
premsrii Sep 25, 2022
6d1bfa9
Fix black errors
premsrii Sep 25, 2022
6edfb74
Fix all precommit errors
premsrii Sep 25, 2022
646570d
Bump version number
premsrii Sep 25, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion feature_reviser/feature_selection/reviser.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def revise_classifier(
X = prepare_categorical_data(X, cat_features)

cat_df = X.select_dtypes(include=["category"])
num_df = X.select_dtypes(include=[np.float32])
num_df = X.select_dtypes(include=[np.float64, np.int64])

if len(X.select_dtypes(include=["category"]).columns) == 0:
raise ValueError(
Expand Down
4 changes: 2 additions & 2 deletions feature_reviser/feature_selection/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def select_with_classifier(

if cat_features:
# pylint: disable=consider-using-set-comprehension
if not set([f[0] for f in cat_features]).issubset(set(X.columns)):
if not {f[0] for f in cat_features}.issubset(set(X.columns)):
raise ValueError("cat_features must be in the dataframe!")
X = prepare_categorical_data(X, cat_features)

Expand All @@ -77,7 +77,7 @@ def select_with_classifier(
)

cat_df = X.select_dtypes(include=["category"])
num_df = X.select_dtypes(include=[np.float32])
num_df = X.select_dtypes(include=[np.float64, np.int64])

print("Selecting categorical features...")
cat_transformer = SelectKBest(chi2, k=min(cat_k_best, cat_df.shape[1] - 1)).fit(
Expand Down
14 changes: 4 additions & 10 deletions feature_reviser/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

from typing import List, Tuple

import numpy as np
import pandas as pd


Expand Down Expand Up @@ -42,12 +41,11 @@ def prepare_categorical_data(
Args:
X (pandas.DataFrame): The dataframe containing the categorical features.
categories (List[Tuple[str, int]]): The list of categorical features and their thresholds.
If the number of unique values is greater than the threshold, the feature is considered numerical and not categorical.
If the number of unique values is greater than the threshold, the feature is not considered categorical.

Raises:
TypeError: If the features are not a `pandas.DataFrame` or the categorical features are not a `List[str]`.
ValueError: If the categorical features are not in the dataframe.
ValueError: If the dataframe does not contain any categorical features.

Returns:
pandas.DataFrame: The original dataframe with the categorical features converted to `category` dtype.
Expand All @@ -59,21 +57,17 @@ def prepare_categorical_data(
if not set(set(cat_features)).issubset(set(X.columns)):
raise ValueError("cat_features must be in the dataframe!")

cont_features = []
for feature, threshold in categories:
if X[feature].nunique() > threshold:
cont_features.append(feature)
if (str(X[feature].dtype) != "object") or (X[feature].nunique() > threshold):
cat_features.remove(feature)
print(
f"""{feature} has less unique vlaues that {threshold}.
So it is not a categorical feature and will be handled as a numerical feature."""
f"""{feature} has fewer unique values than {threshold}.
chrislemke marked this conversation as resolved.
Show resolved Hide resolved
So it will not be converted to Category dtype."""
)

pd.options.mode.chained_assignment = None
for column in X.columns:
if column in cat_features:
X[column] = X[column].astype("category").copy()
else:
X[column] = X[column].astype(np.float32).copy()
chrislemke marked this conversation as resolved.
Show resolved Hide resolved

return X
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "feature-reviser"
version = "0.3.6"
version = "0.3.7"
description = "Construct and find the right features of your dataset for the right model."
authors = ["Christopher Lemke <chris@syhbl.mozmail.com>"]
license = "MIT"
Expand Down
19 changes: 16 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ def X() -> pd.DataFrame:
{
"a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
"b": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1],
"c": [1, 1, 1, 1, 2, 2, 2, 3, 3, 4],
"d": [5, 5, 5, 6, 6, 6, 6, 7, 7, 7],
"e": [5, 5, 5, 5, 5, 5, 5, 5, 7, 7],
"c": ["1", "1", "1", "1", "2", "2", "2", "3", "3", "4"],
"d": ["5", "5", "5", "6", "6", "6", "6", "7", "7", "7"],
"e": ["5", "5", "5", "5", "5", "5", "5", "5", "7", "7"],
}
)

Expand Down Expand Up @@ -162,3 +162,16 @@ def X_strings() -> pd.DataFrame:
@pytest.fixture()
def y() -> pd.Series:
    """Return a ten-element binary target: four 1s followed by six 0s."""
    labels = [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
    return pd.Series(labels)


@pytest.fixture()
def X_categorical() -> pd.DataFrame:
    """Return an eight-row frame mixing string, integer and float columns.

    Columns:
        a: strings with two distinct values.
        b: integers, all distinct.
        c: integers with three distinct values.
        d: floats, all distinct.
        e: strings, all distinct.
    """
    columns = {
        "a": ["A1", "A2", "A2", "A1", "A1", "A2", "A1", "A1"],
        "b": [1, 2, 3, 4, 5, 6, 7, 8],
        "c": [1, 2, 3, 1, 2, 3, 1, 3],
        "d": [1.1, 2, 3, 4, 5, 6, 7, 8],
        "e": ["A", "B", "C", "D", "E", "F", "G", "H"],
    }
    return pd.DataFrame(columns)
4 changes: 2 additions & 2 deletions tests/test_feature_selection/test_reviser.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,5 +51,5 @@ def test_revise_classifier_chi2(clf, X, y) -> None:

def test_revise_classifier_f_statistic(clf, X, y) -> None:
result = revise_classifier(clf, X, y, [("c", 10), ("d", 10), ("e", 10)])
assert result[1].iloc[0][0] == 21.33333396911621
assert result[1].iloc[0][1] == 23.945640563964844
assert result[1].iloc[0][0] == 21.333333333333332
assert result[1].iloc[0][1] == 23.9456209150327
32 changes: 31 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pandas as pd
import pytest

from feature_reviser.utils import check_data
from feature_reviser.utils import check_data, prepare_categorical_data

# pylint: disable=missing-function-docstring

Expand Down Expand Up @@ -34,3 +34,33 @@ def test_check_data_y_nan() -> None:
check_data(pd.DataFrame([1, 2, 3]), pd.Series([1, None, 3]))

assert "y must not contain NaN values!" == str(error.value)


def test_prepare_categorical_data_x_type() -> None:
    """Passing something other than a DataFrame as `X` raises a TypeError."""
    not_a_dataframe = "wrong_type"
    thresholds = [("a", 1), ("b", 2)]

    with pytest.raises(TypeError) as exc_info:
        prepare_categorical_data(not_a_dataframe, thresholds)

    # Checked after the `with` block so the assertion actually executes.
    assert str(exc_info.value) == "features must be a pandas.DataFrame!"


def test_prepare_categorical_data_x_value(X_categorical) -> None:
    """Naming a column that is absent from the frame raises a ValueError."""
    unknown_column = [("f", 1)]  # the fixture only has columns "a".."e"

    with pytest.raises(ValueError) as exc_info:
        prepare_categorical_data(X_categorical, unknown_column)

    # Checked after the `with` block so the assertion actually executes.
    assert str(exc_info.value) == "cat_features must be in the dataframe!"


def test_prepare_categorical_data(X_categorical) -> None:
    """Check the resulting dtype of every column of the fixture frame.

    Only "a" is expected to become `category`; the numeric columns and the
    all-distinct string column "e" are expected to keep their dtypes.
    """
    thresholds = [("a", 2), ("b", 3), ("c", 3), ("d", 3), ("e", 3)]
    expected_names = ["category", "int64", "int64", "float64", "object"]

    observed = prepare_categorical_data(X_categorical, thresholds).dtypes
    expected = pd.Series(expected_names, index=X_categorical.columns)

    assert observed.equals(expected)