diff --git a/.gitignore b/.gitignore
index 8da68c282..76de2cc8d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -98,6 +98,7 @@ venv/
 ENV/
 env.bak/
 venv.bak/
+venv*/

 # Spyder project settings
 .spyderproject
@@ -120,4 +121,4 @@ dmypy.json
 .DS_Store

 # Local Netlify folder
-.netlify
\ No newline at end of file
+.netlify
diff --git a/docs/_scripts/feature-selection.py b/docs/_scripts/feature-selection.py
new file mode 100644
index 000000000..cf1a3d409
--- /dev/null
+++ b/docs/_scripts/feature-selection.py
@@ -0,0 +1,122 @@
+from pathlib import Path
+
+_file = Path(__file__)
+print(f"Executing {_file}")
+
+
+_static_path = Path("_static") / _file.stem
+_static_path.mkdir(parents=True, exist_ok=True)
+
+# --8<-- [start:mrmr-commonimports]
+from sklearn.datasets import fetch_openml
+from sklearn.ensemble import HistGradientBoostingClassifier
+from sklearn.feature_selection import f_classif, mutual_info_classif
+from sklearn.metrics import f1_score
+from sklearn.model_selection import train_test_split
+from sklego.feature_selection import MaximumRelevanceMinimumRedundancy
+import matplotlib.pyplot as plt
+import numpy as np
+import seaborn as sns
+# --8<-- [end:mrmr-commonimports]
+
+# --8<-- [start:mrmr-intro]
+
+# Download MNIST dataset using scikit-learn
+mnist = fetch_openml("mnist_784", cache=True)
+
+# Assign features and labels
+X_pd, y_pd = mnist["data"], mnist["target"].astype(int)
+
+X, y = X_pd.to_numpy(), y_pd.to_numpy()
+t_t_s_params = {'test_size': 10000, 'random_state': 42}
+X_train, X_test, y_train, y_test = train_test_split(X, y, **t_t_s_params)
+X_train = X_train.reshape(60000, 28 * 28)
+X_test = X_test.reshape(10000, 28 * 28)
+# --8<-- [end:mrmr-intro]
+
+# --8<-- [start:mrmr-smile]
+def smile_relevance(X, y):
+    rows = 28
+    cols = 28
+    smiling_face = np.zeros((rows, cols), dtype=int)
+
+    # Set the values for the eyes, nose,
+    # and mouth with adjusted positions and sizes
+    # Left eye
+    smiling_face[10:13, 8:10] = 1
+    # Right eye
+    smiling_face[10:13, 18:20] = 1
+    # Upper part of the mouth
+    smiling_face[18:20, 10:18] = 1
+    # Left edge of the open mouth
+    smiling_face[16:18, 8:10] = 1
+    # Right edge of the open mouth
+    smiling_face[16:18, 18:20] = 1
+
+    # Add the nose as two pixels, one pixel higher
+    smiling_face[14, 13:15] = 1
+    # Mark the bottom row as relevant as well
+    smiling_face[27, :] = 1
+    return smiling_face.reshape(rows * cols,)
+
+
+def smile_redundancy(X, selected, left):
+    return np.ones(len(left))
+# --8<-- [end:mrmr-smile]
+
+# --8<-- [start:mrmr-core]
+K = 38
+mrmr = MaximumRelevanceMinimumRedundancy(k=K,
+                                         kind="auto",
+                                         redundancy_func="p",
+                                         relevance_func="f")
+mrmr_s = MaximumRelevanceMinimumRedundancy(k=K,
+                                           redundancy_func=smile_redundancy,
+                                           relevance_func=smile_relevance)
+
+f = f_classif(X_train, y_train.reshape(60000,))[0]
+f_features = np.argsort(np.nan_to_num(f, nan=np.finfo(float).eps))[-K:]
+mi = mutual_info_classif(X_train, y_train.reshape(60000,))
+mi_features = np.argsort(np.nan_to_num(mi, nan=np.finfo(float).eps))[-K:]
+mrmr_features = mrmr.fit(X_train, y_train).selected_features_
+mrmr_smile_features = mrmr_s.fit(X_train, y_train).selected_features_
+
+# --8<-- [end:mrmr-core]
+# --8<-- [start:mrmr-selected-features]
+# Define features dictionary
+features = {
+    "f_classif": f_features,
+    "mutual_info": mi_features,
+    "mrmr": mrmr_features,
+    "mrmr_smile": mrmr_smile_features,
+}
+for name, s_f in features.items():
+    model = HistGradientBoostingClassifier(random_state=42)
+    model.fit(X_train[:, s_f], y_train.squeeze())
+    y_pred = model.predict(X_test[:, s_f])
+    print(f"Feature selection method: {name}")
+    print(f"F1 score: {round(f1_score(y_test, y_pred, average='weighted'), 3)}")
+
+# --8<-- [end:mrmr-selected-features]
+
+# --8<-- [start:mrmr-plots]
+# Create figure and axes for the plots
+fig, axes = plt.subplots(2, 2, figsize=(12, 8))
+
+# Iterate through the features dictionary and plot the images
+for idx, (name, s_f) in enumerate(features.items()):
+    row = idx // 2
+    col = idx % 2
+
+    # Mark the selected pixels and draw them as a 28x28 image
+    a = np.zeros(28 * 28)
+    a[s_f] = 1
+    ax = axes[row, col]
+    sns.heatmap(a.reshape(28, 28), cmap="binary", ax=ax, cbar=False)
+    ax.set_title(name)
+
+# --8<-- [end:mrmr-plots]
+plt.tight_layout()
+plt.savefig(_static_path / "mrmr-feature-selection-mnist.png")
+plt.clf()
diff --git a/docs/_static/feature-selection/mrmr-feature-selection-mnist.png b/docs/_static/feature-selection/mrmr-feature-selection-mnist.png
new file mode 100644
index 000000000..3b72db8f7
Binary files /dev/null and b/docs/_static/feature-selection/mrmr-feature-selection-mnist.png differ
diff --git a/docs/api/feature-selection.md b/docs/api/feature-selection.md
new file mode 100644
index 000000000..6c7b01b6e
--- /dev/null
+++ b/docs/api/feature-selection.md
@@ -0,0 +1,6 @@
+# Feature Selection
+
+:::sklego.feature_selection.mrmr.MaximumRelevanceMinimumRedundancy
+    options:
+        show_root_full_path: true
+        show_root_heading: true
diff --git a/docs/user-guide/feature-selection.md b/docs/user-guide/feature-selection.md
new file mode 100644
index 000000000..8d10611a7
--- /dev/null
+++ b/docs/user-guide/feature-selection.md
@@ -0,0 +1,71 @@
+# Feature Selection
+
+## Maximum Relevance Minimum Redundancy
+
+[`Maximum Relevance Minimum Redundancy`][MaximumRelevanceMinimumRedundancy-api] (MRMR) is an iterative feature selection method commonly used in data science to select a subset of features from a larger feature set. The goal of MRMR is to choose features that have high *relevance* to the target variable while minimizing *redundancy* among the already selected features.
+
+MRMR depends heavily on the two functions used to determine relevance and redundancy. However, the paper [Maximum Relevance and Minimum Redundancy Feature Selection Methods for a Marketing Machine Learning Platform](https://arxiv.org/pdf/1908.05376.pdf) shows that using [f_classif](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_classif.html) or [f_regression](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html) as the relevance function and Pearson correlation as the redundancy function performs well on a variety of problems, and is in general a good choice.
+
+Inspired by the Medium article [Feature Selection: How To Throw Away 95% of Your Data and Get 95% Accuracy](https://towardsdatascience.com/feature-selection-how-to-throw-away-95-of-your-data-and-get-95-accuracy-ad41ca016877), we showcase a practical application using the well-known MNIST dataset.
+
+Note that although the default scikit-lego MRMR implementation uses redundancy and relevance as defined in [Maximum Relevance and Minimum Redundancy Feature Selection Methods for a Marketing Machine Learning Platform](https://arxiv.org/pdf/1908.05376.pdf), our implementation offers the possibility of defining custom functions, which may be necessary in different scenarios depending on the data.
+
+We will compare the following well-known filter methods:
+
+- F statistical test ([ANOVA F-test](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_classif.html)).
+- Mutual information approximation, based on the scikit-learn implementation.
+
+against the default scikit-lego MRMR implementation, and against a custom MRMR implementation designed to select features that draw a smiling face on the plot of the MNIST digits.
+
+??? example "MRMR imports"
+    ```py
+    --8<-- "docs/_scripts/feature-selection.py:mrmr-commonimports"
+    ```
+
+```py title="MRMR mnist"
+--8<-- "docs/_scripts/feature-selection.py:mrmr-intro"
+```
+
+As custom functions, we implement the smile relevance and smile redundancy shown below.
+
+```py title="MRMR smile functions"
+--8<-- "docs/_scripts/feature-selection.py:mrmr-smile"
+```
+
+Then we execute the core part:
+
+```py title="MRMR core"
+--8<-- "docs/_scripts/feature-selection.py:mrmr-core"
+```
+
+After the execution, it is possible to inspect the F1 score obtained with each set of selected features:
+
+```py title="MRMR mnist selected features"
+--8<-- "docs/_scripts/feature-selection.py:mrmr-selected-features"
+```
+
+```console hl_lines="5-6"
+Feature selection method: f_classif
+F1 score: 0.854
+Feature selection method: mutual_info
+F1 score: 0.879
+Feature selection method: mrmr
+F1 score: 0.925
+Feature selection method: mrmr_smile
+F1 score: 0.849
+```
+
+The MRMR feature selection provides better results than the other methods, although the smile technique performs rather well too.
+
+Finally, we can take a look at the selected features.
+
+??? 
example "MRMR generate plots"
+    ```py
+    --8<-- "docs/_scripts/feature-selection.py:mrmr-plots"
+    ```
+
+![selected-features-mrmr](../_static/feature-selection/mrmr-feature-selection-mnist.png)
+
+[MaximumRelevanceMinimumRedundancy-api]: ../../api/feature-selection#sklego.feature_selection.mrmr.MaximumRelevanceMinimumRedundancy
diff --git a/mkdocs.yaml b/mkdocs.yaml
index 2fd963954..ad4f51ef2 100644
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -127,6 +127,7 @@ nav:
     - Datasets: user-guide/datasets.md
     - Linear Models: user-guide/linear-models.md
     - Mixture Methods: user-guide/mixture-methods.md
+    - Feature Selection: user-guide/feature-selection.md
     - Naive Bayes: user-guide/naive-bayes.md
     - Meta Models: user-guide/meta-models.md
     - Fairness: user-guide/fairness.md
@@ -147,6 +148,7 @@
     - Meta: api/meta.md
     - Metrics: api/metrics.md
     - Mixture: api/mixture.md
+    - Feature Selection: api/feature-selection.md
     - Model Selection: api/model-selection.md
     - Naive Bayes: api/naive-bayes.md
     - Neighbors: api/neighbors.md
diff --git a/sklego/feature_selection/__init__.py b/sklego/feature_selection/__init__.py
new file mode 100644
index 000000000..48cb4b535
--- /dev/null
+++ b/sklego/feature_selection/__init__.py
@@ -0,0 +1,5 @@
+__all__ = [
+    "MaximumRelevanceMinimumRedundancy",
+]
+
+from sklego.feature_selection.mrmr import MaximumRelevanceMinimumRedundancy
diff --git a/sklego/feature_selection/mrmr.py b/sklego/feature_selection/mrmr.py
new file mode 100644
index 000000000..5cb1c842e
--- /dev/null
+++ b/sklego/feature_selection/mrmr.py
@@ -0,0 +1,233 @@
+import warnings
+
+import numpy as np
+from sklearn.base import BaseEstimator
+from sklearn.feature_selection import f_classif, f_regression
+from sklearn.feature_selection._base import SelectorMixin
+from sklearn.utils.validation import check_is_fitted, check_X_y
+
+
+def _redundancy_pearson(X, selected, left):
+    """Redundancy function for the MRMR feature selector algorithm.
+
+    Parameters
+    ----------
+    X : array-like, shape=(n_samples, n_features,)
+        Training data. Used to compute redundancy of the training features.
+    selected : array-like.
+        List of indexes of the selected features at the i-th iteration.
+    left : array-like.
+        List of indexes of the features left at the i-th iteration. MRMR will select a feature
+        from this list.
+
+    Returns
+    -------
+    np.ndarray, shape = (len(left), )
+        The array containing the redundancy score using Pearson correlation.
+    """
+    X_norm = X - np.mean(X, axis=0, keepdims=True)
+    Xs = X_norm[:, selected]
+    Xl = X_norm[:, left]
+
+    # Pairwise Pearson correlation between each left feature and each selected feature
+    num = (Xl[:, None, :] * Xs[:, :, None]).sum(axis=0)
+    den = np.sqrt((Xl[:, None, :] ** 2).sum(axis=0)) * np.sqrt((Xs[:, :, None] ** 2).sum(axis=0))
+
+    # Sum of absolute correlations with the selected features, per left feature
+    return np.sum(np.abs(np.nan_to_num(num / den, nan=np.finfo(float).eps)), axis=0)
+
+
+class MaximumRelevanceMinimumRedundancy(SelectorMixin, BaseEstimator):
+    r"""Maximum Relevance Minimum Redundancy (MRMR) is an iterative feature selection method commonly used in data
+    science to select a subset of features from a larger feature set. The goal of MRMR is to choose features that
+    have high relevance to the target variable while minimizing redundancy among the already selected features.
+
+    How MRMR works:
+
+    1. Compute the relevance of each feature to the target variable: the relevance of a feature is typically
+    measured using a metric such as mutual information, correlation coefficient, or another appropriate measure of
+    dependence between the feature and the target variable.
+
+    2. 
Compute the redundancy between each pair of features: redundancy is the degree of similarity or overlap between
+    features. It can be measured using metrics such as mutual information, correlation coefficient, or other
+    similarity measures.
+
+    3. Select features based on the maximum relevance and minimum redundancy criteria: MRMR aims to maximize the
+    relevance of the selected features to the target variable while minimizing redundancy among them.
+
+    4. Construct the final subset of features: MRMR iteratively adds features to the selected subset until a
+    predefined number of features is reached.
+
+    The implemented formula is:
+
+    $$\text{score}_{i}(f) = \frac{\text{relevance}(f | y)}{\text{redundancy}(f | \text{selected}_{i-1})}$$
+
+    !!! warning
+        If a custom relevance_func is provided, it must have this signature:
+        `Callable[[np.ndarray, np.ndarray], np.ndarray]`.
+        It should accept X, y as arguments, compute the score for each feature of X,
+        and return an array of shape (n_features_in_,).
+
+    !!! warning
+        If a custom redundancy_func is provided, it must have the same signature as the method `_redundancy_pearson`,
+        hence the function must have three parameters:
+
+        - X : array-like, shape=(n_samples, n_features,). Training data used to compute redundancy of the
+        training features.
+        - selected : array-like. List of indexes of the selected features at the i-th iteration.
+        - left : array-like. List of indexes of the features left at the i-th iteration. MRMR will select a feature
+        from this list.
+
+        and it must return:
+
+        - np.ndarray, shape = (len(left), ), the array containing the redundancy scores computed by the
+        custom function.
+
+    Parameters
+    ----------
+    k : int
+        Number of features the model should select.
+    relevance_func : str | Callable, default="f"
+        The relevance function to use. The default maps to the scikit-learn [f_classif](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_classif.html) or [f_regression](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html) for classification or regression, respectively.
+    redundancy_func : str | Callable, default="p"
+        The redundancy function to use. The default maps to the Pearson correlation, computed for each
+        remaining feature.
+    kind : Literal["auto", "classification", "regression"], default="auto"
+        Whether the problem is a classification or a regression; with "auto" the model
+        infers the problem type from the dtype of y.
+
+    Attributes
+    ----------
+    _y_dtype : np.dtype
+        Data type of y.
+    selected_features_ : array-like of shape (k,)
+        Indexes of the selected features.
+    scores_ : array-like of shape (k,)
+        Scores of the selected features.
+
+    Examples
+    --------
+    ```py
+    from sklego.feature_selection import MaximumRelevanceMinimumRedundancy
+    from sklearn.datasets import make_classification
+
+    mrmr = MaximumRelevanceMinimumRedundancy(k=4,
+                                             kind='auto',
+                                             redundancy_func='p',
+                                             relevance_func='f')
+
+    X, y = make_classification(n_features=4)
+
+    # Fit mrmr model
+    mrmr = mrmr.fit(X, y)
+
+    # Selected features
+    selected_features = mrmr.selected_features_
+
+    # Get the scores of the selected features
+    feature_scores = mrmr.scores_
+    ```
+    """
+
+    def __init__(self, k, *, relevance_func="f", redundancy_func="p", kind="auto"):
+        self.k = k
+        self.relevance_func = relevance_func
+        self.redundancy_func = redundancy_func
+        self.kind = kind
+
+    def _get_support_mask(self):
+        """SelectorMixin base function to get the selected features mask.
+
+        Returns
+        -------
+        np.ndarray
+            Boolean mask indicating whether each feature was selected by MRMR.
+        """
+        check_is_fitted(self, ["selected_features_"])
+        all_features = np.arange(0, self.n_features_in_)
+        return np.isin(all_features, self.selected_features_)
+
+    @property
+    def _get_relevance(self):
+        """Get the relevance function from the init values."""
+        if self.relevance_func == "f":
+            if (self.kind == "auto" and np.issubdtype(self._y_dtype, np.integer)) or (self.kind == "classification"):
+                return lambda X, y: np.nan_to_num(f_classif(X, y)[0])
+            elif (self.kind == "auto" and np.issubdtype(self._y_dtype, np.floating)) or (self.kind == "regression"):
+                return lambda X, y: np.nan_to_num(f_regression(X, y)[0])
+            else:
+                raise ValueError(
+                    "`kind` parameter must be 'auto', 'classification' or 'regression' and y dtype must be numeric"
+                )
+        elif callable(self.relevance_func):
+            return self.relevance_func
+        else:
+            raise ValueError(f"Supported relevance functions are 'f' or a Callable, got {self.relevance_func}")
+
+    @property
+    def _get_redundancy(self):
+        """Get the redundancy function from the init values."""
+        if self.redundancy_func == "p":
+            return _redundancy_pearson
+        elif callable(self.redundancy_func):
+            return self.redundancy_func
+        else:
+            raise ValueError(f"Supported redundancy functions are 'p' or a Callable, got {self.redundancy_func}")
+
+    def fit(self, X, y):
+        """Fit the underlying feature selection algorithm on the training data `X` and `y`
+        using the provided redundancy and relevance functions.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data.
+        y : array-like of shape (n_samples,)
+            Target values.
+
+        Returns
+        -------
+        self : MaximumRelevanceMinimumRedundancy
+            The fitted estimator.
+
+        Raises
+        ------
+        ValueError
+            If the k parameter is not an integer, is greater than n_features_in_ (X.shape[1]), or is less than 1.
+        """
+        X, y = check_X_y(X, y)
+        self._y_dtype = y.dtype
+
+        relevance = self._get_relevance
+        redundancy = self._get_redundancy
+
+        self.n_features_in_ = X.shape[1]
+        left_features = list(range(self.n_features_in_))
+        selected_features = []
+        selected_scores = []
+
+        if not isinstance(self.k, int):
+            raise ValueError("k parameter must be integer type")
+        if self.k > self.n_features_in_:
+            raise ValueError(f"k ({self.k}) parameter must be less than or equal to n_features_in_ ({self.n_features_in_})")
+        elif self.k == self.n_features_in_:
+            warnings.warn("k parameter is equal to n_features_in, no feature selection is applied")
+            # No selection applied: keep all features; scores default to the plain relevance
+            self.selected_features_ = np.asarray(left_features)
+            self.scores_ = relevance(X, y)
+            return self
+        elif self.k < 1:
+            raise ValueError(f"k ({self.k}) parameter must be greater than or equal to 1")
+
+        # Relevance is computed once for all features
+        rel_score = relevance(X, y)
+
+        for i in range(self.k):
+            # Average redundancy of each remaining feature w.r.t. the already selected ones
+            red_i = redundancy(X, selected_features, left_features) / i if i > 0 else 1
+            mrmr_score_i = rel_score[left_features] / red_i
+            selected_index = np.argmax(mrmr_score_i)
+            selected_features += [left_features.pop(selected_index)]
+            selected_scores += [mrmr_score_i[selected_index]]
+        self.selected_features_ = np.asarray(selected_features)
+        self.scores_ = np.asarray(selected_scores)
+        return self
diff --git a/tests/test_feature_selection/__init__.py b/tests/test_feature_selection/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/test_feature_selection/test_mrmr.py b/tests/test_feature_selection/test_mrmr.py
new file mode 100644
index 000000000..9bf1a8a4f
--- /dev/null
+++ b/tests/test_feature_selection/test_mrmr.py
@@ -0,0 +1,106 @@
+from unittest.mock import MagicMock
+
+import numpy as np
+import pytest
+from sklearn.datasets import make_classification, make_regression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.pipeline import Pipeline
+
+from sklego.common import flatten
+from sklego.feature_selection.mrmr import MaximumRelevanceMinimumRedundancy, _redundancy_pearson
+from tests.conftest import general_checks, transformer_checks
+
+
+@pytest.mark.parametrize("test_fn", flatten([general_checks, transformer_checks]))
+def test_transformer_checks(test_fn):
+    mrmr = MaximumRelevanceMinimumRedundancy(k=1)
+    test_fn(MaximumRelevanceMinimumRedundancy.__name__, mrmr)
+
+
+def test_redundancy_pearson():
+    """Test the Pearson correlation for the selected and left features:
+    the 3rd column (index 2) mirrors the values in the 1st and 2nd columns (indexes 0, 1),
+    resulting in a correlation of 1 between the 3rd column and both the 1st and 2nd columns.
+    For the 4th column, the correlation with the 1st and 2nd columns is |-1|, which equals 1.
+    Each left feature therefore gets a redundancy score of 2, the sum of its absolute
+    correlations with the two selected features.
+ """ + X = np.array([[0.0, 0.0, 0.0, 1.0], [1.0, 1.0, 1.0, 0.0], [1.0, 1.0, 1.0, 0.0], [1.0, 1.0, 1.0, 0.0]]) + + selected = [0, 1] + left = [2, 3] + redundancy_scores = _redundancy_pearson(X, selected, left) + redundancy_scores_expected = np.array([2 * 1.0, 2 * 1.0]) + assert np.allclose(redundancy_scores, redundancy_scores_expected, atol=0.00001) + + +@pytest.mark.parametrize("k", [1.2, 5, -1]) +def test_mrmr_raises(k): + X, y = make_classification(n_features=4) + with pytest.raises(ValueError): + _ = MaximumRelevanceMinimumRedundancy(k=k).fit(X, y) + + +@pytest.mark.parametrize( + "dataset", + [make_classification(n_features=4), make_regression(n_features=4), make_classification(n_features=10)], +) +@pytest.mark.parametrize("k", [1, 2, 3]) +def test_mrmr_fit(dataset, k): + X, y = dataset + mrmr = MaximumRelevanceMinimumRedundancy(k=k).fit(X, y) + mask = mrmr._get_support_mask() + + assert np.sum(mask) == k + assert mask.size == mrmr.n_features_in_ == X.shape[1] + + +@pytest.mark.parametrize( + "dataset", + [make_classification(n_features=4), make_classification(n_features=10)], +) +@pytest.mark.parametrize("k", [1, 3]) +def test_mrmr_sklearn_compatible(dataset, k): + X_c, y_c = dataset + pipeline = Pipeline( + [ + ("mrmr", MaximumRelevanceMinimumRedundancy(k=k, kind="auto", redundancy_func="p", relevance_func="f")), + ("rf", RandomForestClassifier(n_estimators=100, random_state=42)), + ] + ) + pipeline = pipeline.fit(X_c, y_c) + mrmr = pipeline.named_steps["mrmr"] + rf = pipeline.named_steps["rf"] + + assert len(mrmr.selected_features_) == k == rf.n_features_in_ + + +@pytest.mark.parametrize("relevance, redundancy", [("f", "c"), ("a", "p")]) +def test_raises_wrong_relevance_redundancy_parameters(relevance, redundancy): + X, y = make_classification(n_features=4) + with pytest.raises(ValueError): + _ = MaximumRelevanceMinimumRedundancy(k=2, redundancy_func=redundancy, relevance_func=relevance).fit(X, y) + + +# Forcing to have boolean types on the y -> Fails the kind auto determination +@pytest.mark.parametrize( + "kind, dataset", + [("auto", [i.astype(bool) if n == 1 else i for n, i in enumerate(make_classification(n_features=4))])], +) +def test_raises_wrong_kind_parameters(kind, dataset): + X, y = dataset + with pytest.raises(ValueError): + _ = MaximumRelevanceMinimumRedundancy(k=2, kind=kind).fit(X, y) + + +@pytest.mark.parametrize("k", [1, 2, 3, 4, 5]) +def test_mrmr_count_n_of_calls(k): + X, y = make_classification(n_features=10) + relevance_mock, redundancy_mock = MagicMock(), MagicMock() + + # MagicMocking the functions to only match the signature of the original functions + relevance_mock.side_effect = lambda x, y: np.ones(len(x)) + redundancy_mock.side_effect = lambda p1, p2, p3: np.ones(len(p3)) + + _ = MaximumRelevanceMinimumRedundancy(k=k, redundancy_func=redundancy_mock, relevance_func=relevance_mock).fit(X, y) + relevance_mock.assert_called_once() + assert redundancy_mock.call_count == k - 1