Skip to content

Commit

Permalink
Prepare unawareness for more strategies (#150)
Browse files Browse the repository at this point in the history
* Added strategy option to use mdi as measure

* Removed top_k as parameter

* Added dataset validation for all preprocessing classes

* Removed mdi strategy

* Moved not implemented error to init
  • Loading branch information
reluzita authored Jan 30, 2024
1 parent 3f94bdf commit 34af12c
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 34 deletions.
4 changes: 4 additions & 0 deletions src/aequitas/flow/methods/preprocessing/data_repairer.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ def fit(self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series] = None) -> N
s : pd.Series, optional
The sensitive attribute.
"""
super().fit(X, y, s)

if self.columns is None:
self.columns = X.columns.tolist()
if s is None:
Expand Down Expand Up @@ -138,6 +140,8 @@ def transform(
pd.DataFrame, pd.Series, pd.Series
Transformed features, labels, and sensitive attribute.
"""
super().transform(X, y, s)

if s is None:
raise ValueError("s must be passed.")

Expand Down
3 changes: 2 additions & 1 deletion src/aequitas/flow/methods/preprocessing/identity.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@ def __init__(self):
pass

def fit(self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series] = None) -> None:
pass
super().fit(X, y, s)

def transform(
self,
X: pd.DataFrame,
y: pd.Series,
s: Optional[pd.Series] = None,
) -> tuple[pd.DataFrame, pd.Series, Optional[pd.Series]]:
super().transform(X, y, s)
return X, y, s
4 changes: 4 additions & 0 deletions src/aequitas/flow/methods/preprocessing/label_flipping.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ def fit(self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series]) -> None:
s : pandas.Series
Protected attribute vector.
"""
super().fit(X, y, s)

self.logger.info("Fitting LabelFlipping.")

X_transformed = X.copy()
Expand Down Expand Up @@ -314,6 +316,8 @@ def transform(
tuple[pd.DataFrame, pd.Series, pd.Series]
The transformed input, X, y, and s.
"""
super().transform(X, y, s)

self.logger.info("Transforming data with LabelFlipping.")

if s is None and self.fair_ordering:
Expand Down
11 changes: 9 additions & 2 deletions src/aequitas/flow/methods/preprocessing/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series] = None) -> N
s : pd.Series, optional
The protected attribute.
"""
pass
self._validate_dataset(X, y, s)

@abstractmethod
def transform(
Expand All @@ -42,4 +42,11 @@ def transform(
pd.DataFrame
The transformed data.
"""
pass
self._validate_dataset(X, y, s)

def _validate_dataset(self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series] = None) -> None:
if s.dtype.name != "category":
raise ValueError("The sensitive attribute must be categorical.")

if s.name in X.columns:
raise ValueError("The sensitive attribute must not be in the feature matrix.")
3 changes: 3 additions & 0 deletions src/aequitas/flow/methods/preprocessing/prevalence_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series]) -> None:
s : pandas.Series
Protected attribute vector.
"""
super().fit(X, y, s)
self.logger.info("Fitting sampling method.")

if s is None:
Expand Down Expand Up @@ -117,6 +118,8 @@ def transform(
tuple[pd.DataFrame, pd.Series, pd.Series]
The transformed input, X, y, and s.
"""
super().transform(X, y, s)

self.logger.info("Transforming data.")
if s is None:
raise ValueError("Sensitive Attribute `s` not passed.")
Expand Down
65 changes: 34 additions & 31 deletions src/aequitas/flow/methods/preprocessing/unawareness.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Literal

import pandas as pd
import numpy as np
Expand All @@ -10,14 +10,15 @@

class Unawareness(PreProcessing):
def __init__(
self, top_k: Optional[int] = 1, correlation_threshold: Optional[float] = None
self,
correlation_threshold: Optional[float] = 0.5,
strategy: Literal["correlation", "featureselection"] = "correlation",
seed: int = 0,
):
"""Removes features that are highly correlated with the sensitive attribute.
Note: For this method, the vector s (protected attribute) is assumed to be
categorical.
Parameters
----------
top_k : int, optional
Expand All @@ -27,18 +28,24 @@ def __init__(
Features with a correlation value higher than this threshold are
removed. Defaults to 0.5.
strategy : {"correlation", "featureselection"}, optional
Strategy to use to calculate how much each feature is related to the
sensitive attribute. If "correlation", correlation between features
is used. "featureselection" is not implemented yet. Defaults to
"correlation".
"""
self.logger = create_logger("methods.preprocessing.Unawareness")
self.logger.info("Instantiating an Unawareness preprocessing method.")
self.used_in_inference = True

if top_k is None and correlation_threshold is None:
raise ValueError(
"Since top_k is set as None, the correlation_threshold must be "
"passed by the user."
)
self.top_k = top_k
self.correlation_threshold = correlation_threshold
if strategy == "featureselection":
raise NotImplementedError(
"The feature selection strategy is not implemented yet."
)
self.strategy = strategy
self.seed = seed

def _correlation_ratio(
self, categorical_feature: np.ndarray, numeric_feature: np.ndarray
Expand Down Expand Up @@ -109,8 +116,7 @@ def _cramerv(self, a: np.ndarray, b: np.ndarray):
return statistic

def fit(self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series]) -> None:
"""Calculates the correlations between the features and the sensitive
attribute.
"""Calculates how related each feature is to the sensitive attribute.
Parameters
----------
Expand All @@ -121,19 +127,19 @@ def fit(self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series]) -> None:
s : pandas.Series
Protected attribute vector.
"""
self.correlations = pd.Series(index=X.columns)
super().fit(X, y, s)

self.logger.info("Calculating feature correlation with sensitive attribute.")

for col in X.columns:
if X[col].dtype.name == "category":
self.correlations[col] = self._cramerv(
s.astype("category").values, X[col].values
)
else:
self.correlations[col] = self._correlation_ratio(
s.astype("category").values, X[col].values
)
if self.strategy == "correlation":
self.scores = pd.Series(index=X.columns)
for col in X.columns:
if X[col].dtype.name == "category":
self.scores[col] = self._cramerv(s.values, X[col].values)
else:
self.scores[col] = self._correlation_ratio(s.values, X[col].values)

self.correlations = self.correlations.sort_values(ascending=False)
self.scores = self.scores.sort_values(ascending=False)

def transform(
self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series] = None
Expand All @@ -154,14 +160,11 @@ def transform(
tuple[pd.DataFrame, pd.Series, pd.Series]
The transformed input, X, y, and s.
"""
remove_features = self.correlations.copy()
if self.top_k is not None:
remove_features = remove_features[: self.top_k]
if self.correlation_threshold is not None:
remove_features = remove_features.loc[
remove_features >= self.correlation_threshold
]
remove_features = list(remove_features.index)
super().transform(X, y, s)

remove_features = list(
self.scores.loc[self.scores >= self.correlation_threshold].index
)

self.logger.info(
f"Removing most correlated features with sensitive attribute: "
Expand Down

0 comments on commit 34af12c

Please sign in to comment.