Add unit tests for prepare_categorical_data (#5)

* Add unit tests for prepare_categorical_data
* Fix for high cardinal nominal feature
* Add support for discrete numerical features
* Remove handling of numerical features
* Move test values to conftest.py
* Add check_x to transformers
* Improve project structure
* Fix workflows
* Codecov fix
* Add CodeCov
* Add files via upload
* Add project files
* Exchange links in readme file
* Exchange images
* Fix failed tests due to changes made in the PR
* Add MathExpressionTransformer and ValueIndicatorTransformer
* Update README.md
* Update README.md
* Update README.md
* Update README.md
* Setup dependabot
* Fix EOF errors
* Fix isort error
* Fix black errors
* Fix all precommit errors
* Bump version number

Co-authored-by: Prem Srinivasan <prem.srinivasan@invia.de>
Co-authored-by: Chris Lemke <11752694+chrislemke@users.noreply.github.com>
chrislemke · Sep 25, 2022 · 4950acf · 4950acf
1 parent ed98f9a
commit 4950acf
Show file tree

Hide file tree

Showing 7 changed files with 57 additions and 20 deletions.
diff --git a/feature_reviser/feature_selection/reviser.py b/feature_reviser/feature_selection/reviser.py
@@ -38,7 +38,7 @@ def revise_classifier(
     X = prepare_categorical_data(X, cat_features)
 
     cat_df = X.select_dtypes(include=["category"])
-    num_df = X.select_dtypes(include=[np.float32])
+    num_df = X.select_dtypes(include=[np.float64, np.int64])
 
     if len(X.select_dtypes(include=["category"]).columns) == 0:
         raise ValueError(

diff --git a/feature_reviser/feature_selection/selector.py b/feature_reviser/feature_selection/selector.py
@@ -58,7 +58,7 @@ def select_with_classifier(
 
     if cat_features:
         # pylint: disable=consider-using-set-comprehension
-        if not set([f[0] for f in cat_features]).issubset(set(X.columns)):
+        if not {f[0] for f in cat_features}.issubset(set(X.columns)):
             raise ValueError("cat_features must be in the dataframe!")
         X = prepare_categorical_data(X, cat_features)
 
@@ -77,7 +77,7 @@ def select_with_classifier(
             )
 
         cat_df = X.select_dtypes(include=["category"])
-        num_df = X.select_dtypes(include=[np.float32])
+        num_df = X.select_dtypes(include=[np.float64, np.int64])
 
         print("Selecting categorical features...")
         cat_transformer = SelectKBest(chi2, k=min(cat_k_best, cat_df.shape[1] - 1)).fit(

diff --git a/feature_reviser/utils.py b/feature_reviser/utils.py
@@ -2,7 +2,6 @@
 
 from typing import List, Tuple
 
-import numpy as np
 import pandas as pd
 
 
@@ -42,12 +41,11 @@ def prepare_categorical_data(
     Args:
         X (pandas.DataFrame): The dataframe containing the categorical features.
         categories (List[Tuple[str, int]]): The list of categorical features and their thresholds.
-            If the number of unique values is greater than the threshold, the feature is considered numerical and not categorical.
+            If the number of unique values is greater than the threshold, the feature is not considered categorical.
 
     Raises:
         TypeError: If the features are not a `pandas.DataFrame` or the categorical features are not a `List[str]`.
         ValueError: If the categorical features are not in the dataframe.
-        ValueError: If the dataframe does not contain any categorical features.
 
     Returns:
         pandas.DataFrame: The original dataframe with the categorical features converted to `category` dtype.
@@ -59,21 +57,17 @@ def prepare_categorical_data(
     if not set(set(cat_features)).issubset(set(X.columns)):
         raise ValueError("cat_features must be in the dataframe!")
 
-    cont_features = []
     for feature, threshold in categories:
-        if X[feature].nunique() > threshold:
-            cont_features.append(feature)
+        if (str(X[feature].dtype) != "object") or (X[feature].nunique() > threshold):
             cat_features.remove(feature)
             print(
-                f"""{feature} has less unique vlaues that {threshold}.
-            So it is not a categorical feature and will be handled as a numerical feature."""
+                f"""{feature} has fewer unique values than {threshold}.
+                So it will not be converted to Category dtype."""
             )
 
     pd.options.mode.chained_assignment = None
     for column in X.columns:
         if column in cat_features:
             X[column] = X[column].astype("category").copy()
-        else:
-            X[column] = X[column].astype(np.float32).copy()
 
     return X
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "feature-reviser"
-version = "0.3.6"
+version = "0.3.7"
 description = "Construct and find the right features of your dataset for the right model."
 authors = ["Christopher Lemke <chris@syhbl.mozmail.com>"]
 license = "MIT"

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -29,9 +29,9 @@ def X() -> pd.DataFrame:
         {
             "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
             "b": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1],
-            "c": [1, 1, 1, 1, 2, 2, 2, 3, 3, 4],
-            "d": [5, 5, 5, 6, 6, 6, 6, 7, 7, 7],
-            "e": [5, 5, 5, 5, 5, 5, 5, 5, 7, 7],
+            "c": ["1", "1", "1", "1", "2", "2", "2", "3", "3", "4"],
+            "d": ["5", "5", "5", "6", "6", "6", "6", "7", "7", "7"],
+            "e": ["5", "5", "5", "5", "5", "5", "5", "5", "7", "7"],
         }
     )
 
@@ -162,3 +162,16 @@ def X_strings() -> pd.DataFrame:
 @pytest.fixture()
 def y() -> pd.Series:
     return pd.Series([1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
+
+
+@pytest.fixture()
+def X_categorical() -> pd.DataFrame:
+    return pd.DataFrame(
+        {
+            "a": ["A1", "A2", "A2", "A1", "A1", "A2", "A1", "A1"],
+            "b": [1, 2, 3, 4, 5, 6, 7, 8],
+            "c": [1, 2, 3, 1, 2, 3, 1, 3],
+            "d": [1.1, 2, 3, 4, 5, 6, 7, 8],
+            "e": ["A", "B", "C", "D", "E", "F", "G", "H"],
+        }
+    )
diff --git a/tests/test_feature_selection/test_reviser.py b/tests/test_feature_selection/test_reviser.py
@@ -51,5 +51,5 @@ def test_revise_classifier_chi2(clf, X, y) -> None:
 
 def test_revise_classifier_f_statistic(clf, X, y) -> None:
     result = revise_classifier(clf, X, y, [("c", 10), ("d", 10), ("e", 10)])
-    assert result[1].iloc[0][0] == 21.33333396911621
-    assert result[1].iloc[0][1] == 23.945640563964844
+    assert result[1].iloc[0][0] == 21.333333333333332
+    assert result[1].iloc[0][1] == 23.9456209150327
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -3,7 +3,7 @@
 import pandas as pd
 import pytest
 
-from feature_reviser.utils import check_data
+from feature_reviser.utils import check_data, prepare_categorical_data
 
 # pylint: disable=missing-function-docstring
 
@@ -34,3 +34,33 @@ def test_check_data_y_nan() -> None:
         check_data(pd.DataFrame([1, 2, 3]), pd.Series([1, None, 3]))
 
     assert "y must not contain NaN values!" == str(error.value)
+
+
+def test_prepare_categorical_data_x_type() -> None:
+    with pytest.raises(TypeError) as error:
+        prepare_categorical_data("wrong_type", [("a", 1), ("b", 2)])
+
+    assert "features must be a pandas.DataFrame!" == str(error.value)
+
+
+def test_prepare_categorical_data_x_value(X_categorical) -> None:
+    with pytest.raises(ValueError) as error:
+        prepare_categorical_data(X_categorical, [("f", 1)])
+
+    assert "cat_features must be in the dataframe!" == str(error.value)
+
+
+def test_prepare_categorical_data(X_categorical) -> None:
+    categories = [("a", 2), ("b", 3), ("c", 3), ("d", 3), ("e", 3)]
+    result = prepare_categorical_data(X_categorical, categories).dtypes
+    expected = pd.Series(
+        [
+            "category",
+            "int64",
+            "int64",
+            "float64",
+            "object",
+        ],
+        index=X_categorical.columns,
+    )
+    assert result.equals(expected)