diff --git a/ethicml/algorithms/algorithm_base.py b/ethicml/algorithms/algorithm_base.py
index f7404f12..6ff04d47 100644
--- a/ethicml/algorithms/algorithm_base.py
+++ b/ethicml/algorithms/algorithm_base.py
@@ -11,23 +11,32 @@ class Algorithm(ABC):
     """Base class for Algorithms."""
 
-    def __init__(self, name: str):
+    def __init__(self, name: str, seed: int):
        """Base constructor for the Algorithm class.
 
        Args:
            name: name of the algorithm
+           seed: seed for the random number generator
        """
        self.__name = name
+       self.__seed = seed
 
     @property
     def name(self) -> str:
         """Name of the algorithm."""
         return self.__name
 
+    @property
+    def seed(self) -> int:
+        """Seed for the random number generator."""
+        return self.__seed
+
 
 class AlgorithmAsync(metaclass=ABCMeta):  # pylint: disable=too-few-public-methods
     """Base class of async methods; meant to be used in conjunction with :class:`Algorithm`."""
 
+    model_dir: Path
+
     @property
     def _executable(self) -> str:
         """Path to a (Python) executable.
diff --git a/ethicml/algorithms/inprocess/agarwal_reductions.py b/ethicml/algorithms/inprocess/agarwal_reductions.py
index 52338732..f26d8312 100644
--- a/ethicml/algorithms/inprocess/agarwal_reductions.py
+++ b/ethicml/algorithms/inprocess/agarwal_reductions.py
@@ -2,6 +2,8 @@
 from pathlib import Path
 from typing import Dict, List, Optional, Set, Union
 
+from ranzen import implements
+
 from ethicml.utility import ClassifierType, FairnessType
 
 from .in_algorithm import InAlgorithmAsync
@@ -19,6 +21,7 @@ class Agarwal(InAlgorithmAsync):
 
     def __init__(
         self,
+        dir: Union[str, Path],
         fairness: FairnessType = "DP",
         classifier: ClassifierType = "LR",
         eps: float = 0.1,
@@ -31,7 +34,8 @@ def __init__(
             raise ValueError(f"results: fairness must be one of {VALID_FAIRNESS!r}.")
         if classifier not in VALID_MODELS:
             raise ValueError(f"results: classifier must be one of {VALID_MODELS!r}.")
-        super().__init__(name=f"Agarwal, {classifier}, {fairness}")
+        super().__init__(name=f"Agarwal, {classifier}, {fairness}", seed=seed)
+        self.model_dir = dir if isinstance(dir, Path) else Path(dir)
         chosen_c, chosen_kernel = settings_for_svm_lr(classifier, C, kernel)
         self.flags: Dict[str, Union[str, float, int]] = {
             "classifier": classifier,
@@ -43,6 +47,23 @@ def __init__(
             "seed": seed,
         }
 
-    def _script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
-        args = flag_interface(train_path, test_path, pred_path, self.flags)
+    @implements(InAlgorithmAsync)
+    def _run_script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
+        args = flag_interface(
+            train_path=train_path, test_path=test_path, pred_path=pred_path, flags=self.flags
+        )
+        return ["-m", "ethicml.implementations.agarwal"] + args
+
+    @implements(InAlgorithmAsync)
+    def _fit_script_command(self, train_path: Path, model_path: Path) -> List[str]:
+        args = flag_interface(train_path=train_path, model_path=model_path, flags=self.flags)
+        return ["-m", "ethicml.implementations.agarwal"] + args
+
+    @implements(InAlgorithmAsync)
+    def _predict_script_command(
+        self, model_path: Path, test_path: Path, pred_path: Path
+    ) -> List[str]:
+        args = flag_interface(
+            model_path=model_path, test_path=test_path, pred_path=pred_path, flags=self.flags
+        )
         return ["-m", "ethicml.implementations.agarwal"] + args
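
Taken together, the `algorithm_base.py` and `agarwal_reductions.py` changes give every async in-processing algorithm a persistent model directory plus a split fit/predict API. A minimal usage sketch (assumes a `DataTuple` named `train` and a `TestTuple` named `test` already exist; the temporary directory stands in for any writable `dir`):

```python
# Sketch only: `train` (DataTuple) and `test` (TestTuple) are assumed to exist.
from tempfile import TemporaryDirectory

from ethicml.algorithms.inprocess.agarwal_reductions import Agarwal

with TemporaryDirectory() as tmpdir:
    model = Agarwal(dir=tmpdir, fairness="DP", classifier="LR", eps=0.1, seed=888)
    model = model.fit(train)     # runs the script with --mode fit; persists the model in `dir`
    preds = model.predict(test)  # runs the script with --mode predict; loads the persisted model
    # the old one-shot interface still works and never touches `dir`:
    preds_again = model.run(train, test)
```
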
"""Returns a random label.""" def __init__(self, seed: int = 888) -> None: - super().__init__(name="Blind", is_fairness_algo=False) - self.seed = seed + super().__init__(name="Blind", is_fairness_algo=False, seed=seed) + + @implements(InAlgorithm) + def fit(self, train: DataTuple) -> InAlgorithm: + self.vals = train.y.drop_duplicates() + return self + + @implements(InAlgorithm) + def predict(self, test: TestTuple) -> Prediction: + random = np.random.RandomState(self.seed) + + return Prediction(hard=pd.Series(random.choice(self.vals.T.to_numpy()[0], test.x.shape[0]))) @implements(InAlgorithm) def run(self, train: DataTuple, test: TestTuple) -> Prediction: diff --git a/ethicml/algorithms/inprocess/fairness_wo_demographics.py b/ethicml/algorithms/inprocess/fairness_wo_demographics.py index f1f1349b..ed98ce43 100644 --- a/ethicml/algorithms/inprocess/fairness_wo_demographics.py +++ b/ethicml/algorithms/inprocess/fairness_wo_demographics.py @@ -16,15 +16,17 @@ class DRO(InAlgorithmAsync): def __init__( self, + dir: Union[str, Path], eta: float = 0.5, epochs: int = 10, batch_size: int = 32, network_size: Optional[List[int]] = None, seed: int = 888, ): - super().__init__(name="Dist Robust Optim") + super().__init__(name="Dist Robust Optim", seed=seed) if network_size is None: network_size = [50] + self.model_dir = dir if isinstance(dir, Path) else Path(dir) self.flags: Dict[str, Union[float, int, str, List[int]]] = { "eta": eta, "batch_size": batch_size, @@ -34,6 +36,22 @@ def __init__( } @implements(InAlgorithmAsync) - def _script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]: - args = flag_interface(train_path, test_path, pred_path, self.flags) + def _run_script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]: + args = flag_interface( + train_path=train_path, test_path=test_path, pred_path=pred_path, flags=self.flags + ) + return ["-m", "ethicml.implementations.dro_tabular"] + args + + @implements(InAlgorithmAsync) + def _fit_script_command(self, train_path: Path, model_path: Path) -> List[str]: + args = flag_interface(train_path=train_path, model_path=model_path, flags=self.flags) + return ["-m", "ethicml.implementations.dro_tabular"] + args + + @implements(InAlgorithmAsync) + def _predict_script_command( + self, model_path: Path, test_path: Path, pred_path: Path + ) -> List[str]: + args = flag_interface( + model_path=model_path, test_path=test_path, pred_path=pred_path, flags=self.flags + ) return ["-m", "ethicml.implementations.dro_tabular"] + args diff --git a/ethicml/algorithms/inprocess/in_algorithm.py b/ethicml/algorithms/inprocess/in_algorithm.py index 6587a0bc..28aef247 100644 --- a/ethicml/algorithms/inprocess/in_algorithm.py +++ b/ethicml/algorithms/inprocess/in_algorithm.py @@ -1,4 +1,6 @@ """Abstract Base Class of all algorithms in the framework.""" +from __future__ import annotations + from abc import abstractmethod from pathlib import Path from tempfile import TemporaryDirectory @@ -15,10 +17,32 @@ class InAlgorithm(Algorithm): """Abstract Base Class for algorithms that run in the middle of the pipeline.""" - def __init__(self, name: str, is_fairness_algo: bool = True): - super().__init__(name=name) + def __init__(self, name: str, seed: int, is_fairness_algo: bool = True): + super().__init__(name=name, seed=seed) self.__is_fairness_algo = is_fairness_algo + @abstractmethod + def fit(self, train: DataTuple) -> InAlgorithm: + """Run Algorithm on the given data. 
+
+        Args:
+            train: training data
+
+        Returns:
+            self, but trained.
+        """
+
+    @abstractmethod
+    def predict(self, test: TestTuple) -> Prediction:
+        """Make predictions on the given data.
+
+        Args:
+            test: data to evaluate on
+
+        Returns:
+            predictions
+        """
+
     @abstractmethod
     def run(self, train: DataTuple, test: TestTuple) -> Prediction:
         """Run Algorithm on the given data.
@@ -45,6 +69,15 @@ def is_fairness_algo(self) -> bool:
 
 class InAlgorithmAsync(InAlgorithm, AlgorithmAsync):
     """In-Algorithm that can be run blocking and asynchronously."""
 
+    @implements(InAlgorithm)
+    def fit(self, train: DataTuple) -> InAlgorithm:
+        run_blocking(self.fit_async(train))
+        return self
+
+    @implements(InAlgorithm)
+    def predict(self, test: TestTuple) -> Prediction:
+        return run_blocking(self.predict_async(test))
+
     @implements(InAlgorithm)
     def run(self, train: DataTuple, test: TestTuple) -> Prediction:
         """Run this asynchronous Algorithm as blocking on the given data.
@@ -58,6 +91,44 @@ def run(self, train: DataTuple, test: TestTuple) -> Prediction:
         """
         return run_blocking(self.run_async(train, test))
 
+    async def fit_async(self, train: DataTuple) -> InAlgorithmAsync:
+        """Fit Algorithm on the given data asynchronously.
+
+        Args:
+            train: training data
+
+        Returns:
+            self, but trained.
+        """
+        self.model_path = self.model_dir / f"model_{self.name}.joblib"
+        with TemporaryDirectory() as tmpdir:
+            tmp_path = Path(tmpdir)
+            train_path = tmp_path / "train.npz"
+            train.to_npz(train_path)
+            cmd = self._fit_script_command(train_path, self.model_path)
+            await self._call_script(cmd + ["--mode", "fit"])  # wait for script to run
+            return self
+
+    async def predict_async(self, test: TestTuple) -> Prediction:
+        """Make predictions on the given data asynchronously.
+
+        Args:
+            test: data to evaluate on
+
+        Returns:
+            predictions
+        """
+        with TemporaryDirectory() as tmpdir:
+            tmp_path = Path(tmpdir)
+            test_path = tmp_path / "test.npz"
+            pred_path = tmp_path / "predictions.npz"
+            test.to_npz(test_path)
+            cmd = self._predict_script_command(self.model_path, test_path, pred_path)
+            await self._call_script(cmd + ["--mode", "predict"])  # wait for script to run
+            return Prediction.from_npz(pred_path)
+
     async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
         """Run Algorithm on the given data asynchronously.
@@ -75,10 +146,20 @@ async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
             pred_path = tmp_path / "predictions.npz"
             train.to_npz(train_path)
             test.to_npz(test_path)
-            cmd = self._script_command(train_path, test_path, pred_path)
-            await self._call_script(cmd)  # wait for script to run
+            cmd = self._run_script_command(train_path, test_path, pred_path)
+            await self._call_script(cmd + ["--mode", "run"])  # wait for script to run
             return Prediction.from_npz(pred_path)
 
     @abstractmethod
-    def _script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
+    def _run_script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
+        """The command that will run the script."""
+
+    @abstractmethod
+    def _fit_script_command(self, train_path: Path, model_path: Path) -> List[str]:
+        """The command that will run the script."""
+
+    @abstractmethod
+    def _predict_script_command(
+        self, model_path: Path, test_path: Path, pred_path: Path
+    ) -> List[str]:
         """The command that will run the script."""
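
For reference, the new blocking methods are thin wrappers over the async ones; the two call sequences below are equivalent, differing only in which `--mode` the spawned subprocess receives (sketch; `algo`, `train` and `test` are assumed to exist):

```python
# One subprocess invocation with "--mode run":
preds = algo.run(train, test)

# Two invocations ("--mode fit", then "--mode predict") that communicate
# through the joblib file at algo.model_path:
algo = algo.fit(train)
preds = algo.predict(test)
```
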
diff --git a/ethicml/algorithms/inprocess/installed_model.py b/ethicml/algorithms/inprocess/installed_model.py
index a6b7145b..cb9bc720 100644
--- a/ethicml/algorithms/inprocess/installed_model.py
+++ b/ethicml/algorithms/inprocess/installed_model.py
@@ -13,6 +13,7 @@
 from typing import List, Optional
 
 import git
+from ranzen import implements
 
 from .in_algorithm import InAlgorithmAsync
 
@@ -29,6 +30,7 @@ def __init__(
         top_dir: str,
         url: Optional[str] = None,
         executable: Optional[str] = None,
+        seed: int = 888,
     ):
         """Download code from given URL and create Pip environment with Pipfile found in the code.
 
@@ -39,6 +41,7 @@ def __init__(
                 simply the last part of the repository URL)
             url: (optional) URL of the repository
             executable: (optional) path to a Python executable
+            seed: Random seed to use for reproducibility
         """
         # QUESTION: do we really need `store_dir`? we could also just clone the code into "."
         self._store_dir: Path = Path(".") / dir_name  # directory where code and venv are stored
@@ -54,7 +57,7 @@ def __init__(
             self.__executable = str(self._code_path.resolve() / ".venv" / "bin" / "python")
         else:
             self.__executable = executable
-        super().__init__(name=name)
+        super().__init__(name=name, seed=seed)
 
     @property
     def _code_path(self) -> Path:
@@ -90,5 +93,16 @@ def remove(self) -> None:
         except OSError as excep:
             print(f"Error: {excep.filename} - {excep.strerror}.")
 
-    def _script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
+    @implements(InAlgorithmAsync)
+    def _run_script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
         return []  # pylint was complaining when I didn't return anything here...
+
+    @implements(InAlgorithmAsync)
+    def _fit_script_command(self, train_path: Path, model_path: Path) -> List[str]:
+        return []
+
+    @implements(InAlgorithmAsync)
+    def _predict_script_command(
+        self, model_path: Path, test_path: Path, pred_path: Path
+    ) -> List[str]:
+        return []
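
To make the new contract concrete, here is a hypothetical minimal `InAlgorithmAsync` subclass wiring up all three command hooks. `my_package.my_script` is a placeholder module name; the real wrappers in this diff (`Agarwal`, `DRO`, `SVMAsync`) follow exactly this pattern:

```python
from pathlib import Path
from typing import List, Union

from ethicml.algorithms.inprocess.in_algorithm import InAlgorithmAsync
from ethicml.algorithms.inprocess.shared import flag_interface


class MyModel(InAlgorithmAsync):
    """Hypothetical async wrapper around an external training script."""

    def __init__(self, dir: Union[str, Path], seed: int = 888):
        super().__init__(name="MyModel", seed=seed)
        self.model_dir = dir if isinstance(dir, Path) else Path(dir)
        self.flags = {"seed": seed}

    def _run_script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
        args = flag_interface(
            train_path=train_path, test_path=test_path, pred_path=pred_path, flags=self.flags
        )
        return ["-m", "my_package.my_script"] + args  # placeholder module

    def _fit_script_command(self, train_path: Path, model_path: Path) -> List[str]:
        args = flag_interface(train_path=train_path, model_path=model_path, flags=self.flags)
        return ["-m", "my_package.my_script"] + args

    def _predict_script_command(
        self, model_path: Path, test_path: Path, pred_path: Path
    ) -> List[str]:
        args = flag_interface(
            model_path=model_path, test_path=test_path, pred_path=pred_path, flags=self.flags
        )
        return ["-m", "my_package.my_script"] + args
```
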
diff --git a/ethicml/algorithms/inprocess/kamiran.py b/ethicml/algorithms/inprocess/kamiran.py
index 76352b51..f5e491aa 100644
--- a/ethicml/algorithms/inprocess/kamiran.py
+++ b/ethicml/algorithms/inprocess/kamiran.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 import pandas as pd
+from sklearn.base import ClassifierMixin
 from ranzen import implements
 from sklearn.linear_model import LogisticRegression
 
@@ -28,12 +29,22 @@ def __init__(
         kernel: Optional[str] = None,
         seed: int = 888,
     ):
-        super().__init__(name=f"Kamiran & Calders {classifier}")
+        super().__init__(name=f"Kamiran & Calders {classifier}", seed=seed)
         if classifier not in VALID_MODELS:
             raise ValueError(f"results: classifier must be one of {VALID_MODELS!r}.")
         self.classifier = classifier
         self.C, self.kernel = settings_for_svm_lr(classifier, C, kernel)
-        self.seed = seed
+
+    @implements(InAlgorithm)
+    def fit(self, train: DataTuple) -> InAlgorithm:
+        self.clf = _train(
+            train, classifier=self.classifier, C=self.C, kernel=self.kernel, seed=self.seed
+        )
+        return self
+
+    @implements(InAlgorithm)
+    def predict(self, test: TestTuple) -> Prediction:
+        return _predict(model=self.clf, test=test)
 
     @implements(InAlgorithm)
     def run(self, train: DataTuple, test: TestTuple) -> Prediction:
@@ -69,6 +80,28 @@ def compute_instance_weights(
     return pd.DataFrame(group_weights[inv_indexes_gi], columns=["instance weights"])
 
 
+def _train(
+    train: DataTuple, classifier: ClassifierType, C: float, kernel: str, seed: int
+) -> ClassifierMixin:
+    if classifier == "SVM":
+        model = select_svm(C=C, kernel=kernel, seed=seed)
+    else:
+        random_state = np.random.RandomState(seed=seed)
+        model = LogisticRegression(
+            solver="liblinear", random_state=random_state, max_iter=5000, C=C
+        )
+    model.fit(
+        train.x,
+        train.y.to_numpy().ravel(),
+        sample_weight=compute_instance_weights(train)["instance weights"],
+    )
+    return model
+
+
+def _predict(model: ClassifierMixin, test: TestTuple) -> Prediction:
+    return Prediction(hard=pd.Series(model.predict(test.x)))
+
+
 def _train_and_predict(
     train: DataTuple, test: TestTuple, classifier: ClassifierType, C: float, kernel: str, seed: int
 ) -> Prediction:
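
The split into `_train`/`_predict` leaves `compute_instance_weights` reusable on its own: it computes the Kamiran & Calders reweighing weights, which weight each `(s, y)` group so that the reweighed data looks statistically independent of the sensitive attribute. A sketch of feeding those weights to a plain sklearn model (`train` is an assumed `DataTuple`):

```python
# Sketch: `train` is an assumed DataTuple.
from sklearn.linear_model import LogisticRegression

from ethicml.algorithms.inprocess.kamiran import compute_instance_weights

weights = compute_instance_weights(train)["instance weights"]
clf = LogisticRegression(solver="liblinear")
clf.fit(train.x, train.y.to_numpy().ravel(), sample_weight=weights)
```
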
diff --git a/ethicml/algorithms/inprocess/logistic_regression.py b/ethicml/algorithms/inprocess/logistic_regression.py
index 8f1a3b47..cb4cfe73 100644
--- a/ethicml/algorithms/inprocess/logistic_regression.py
+++ b/ethicml/algorithms/inprocess/logistic_regression.py
@@ -19,8 +19,22 @@ class LR(InAlgorithm):
 
     def __init__(self, C: Optional[float] = None, seed: int = 888):
         self.C = LogisticRegression().C if C is None else C
-        super().__init__(name=f"Logistic Regression (C={self.C})", is_fairness_algo=False)
-        self.seed = seed
+        super().__init__(
+            name=f"Logistic Regression (C={self.C})", is_fairness_algo=False, seed=seed
+        )
+
+    @implements(InAlgorithm)
+    def fit(self, train: DataTuple) -> InAlgorithm:
+        random_state = np.random.RandomState(seed=self.seed)
+        self.clf = LogisticRegression(
+            solver="liblinear", random_state=random_state, C=self.C, multi_class="auto"
+        )
+        self.clf.fit(train.x, train.y.to_numpy().ravel())
+        return self
+
+    @implements(InAlgorithm)
+    def predict(self, test: TestTuple) -> Prediction:
+        return Prediction(hard=pd.Series(self.clf.predict(test.x)))
 
     @implements(InAlgorithm)
     def run(self, train: DataTuple, test: TestTuple) -> Prediction:
@@ -37,8 +51,22 @@ class LRProb(InAlgorithm):
 
     def __init__(self, C: Optional[int] = None, seed: int = 888):
         self.C = LogisticRegression().C if C is None else C
-        super().__init__(name=f"Logistic Regression Prob (C={self.C})", is_fairness_algo=False)
-        self.seed = seed
+        super().__init__(
+            name=f"Logistic Regression Prob (C={self.C})", is_fairness_algo=False, seed=seed
+        )
+
+    @implements(InAlgorithm)
+    def fit(self, train: DataTuple) -> InAlgorithm:
+        random_state = np.random.RandomState(seed=self.seed)
+        self.clf = LogisticRegression(
+            solver="liblinear", random_state=random_state, C=self.C, multi_class="auto"
+        )
+        self.clf.fit(train.x, train.y.to_numpy().ravel())
+        return self
+
+    @implements(InAlgorithm)
+    def predict(self, test: TestTuple) -> Prediction:
+        return SoftPrediction(soft=pd.Series(self.clf.predict_proba(test.x)[:, 1]))
 
     @implements(InAlgorithm)
     def run(self, train: DataTuple, test: TestTuple) -> SoftPrediction:
@@ -54,9 +82,22 @@ class LRCV(InAlgorithm):
     """Kind of a cheap hack for now, but gives a proper cross-validated LR."""
 
     def __init__(self, n_splits: int = 3, seed: int = 888) -> None:
-        super().__init__(name="LRCV", is_fairness_algo=False)
+        super().__init__(name="LRCV", is_fairness_algo=False, seed=seed)
         self.n_splits = n_splits
-        self.seed = seed
+
+    @implements(InAlgorithm)
+    def fit(self, train: DataTuple) -> InAlgorithm:
+        random_state = np.random.RandomState(seed=self.seed)
+        folder = KFold(n_splits=self.n_splits, shuffle=True, random_state=random_state)
+        self.clf = LogisticRegressionCV(
+            cv=folder, n_jobs=-1, random_state=random_state, solver="liblinear", multi_class="auto"
+        )
+        self.clf.fit(train.x, train.y.to_numpy().ravel())
+        return self
+
+    @implements(InAlgorithm)
+    def predict(self, test: TestTuple) -> Prediction:
+        return Prediction(hard=pd.Series(self.clf.predict(test.x)), info=dict(C=self.clf.C_[0]))
 
     @implements(InAlgorithm)
     def run(self, train: DataTuple, test: TestTuple) -> Prediction:
diff --git a/ethicml/algorithms/inprocess/majority.py b/ethicml/algorithms/inprocess/majority.py
index 8211815b..eed6d568 100644
--- a/ethicml/algorithms/inprocess/majority.py
+++ b/ethicml/algorithms/inprocess/majority.py
@@ -13,8 +13,17 @@ class Majority(InAlgorithm):
     """Simply returns the majority label from the train set."""
 
-    def __init__(self) -> None:
-        super().__init__(name="Majority", is_fairness_algo=False)
+    def __init__(self, seed: int = 888) -> None:
+        super().__init__(name="Majority", is_fairness_algo=False, seed=seed)
+
+    @implements(InAlgorithm)
+    def fit(self, train: DataTuple) -> InAlgorithm:
+        self.maj = train.y.mode().iloc[0].to_numpy()  # type: ignore[attr-defined]
+        return self
+
+    @implements(InAlgorithm)
+    def predict(self, test: TestTuple) -> Prediction:
+        return Prediction(hard=pd.Series(self.maj.repeat(len(test.x))))
 
     @implements(InAlgorithm)
     def run(self, train: DataTuple, test: TestTuple) -> Prediction:
diff --git a/ethicml/algorithms/inprocess/manual.py b/ethicml/algorithms/inprocess/manual.py
index 2198272e..306abd04 100644
--- a/ethicml/algorithms/inprocess/manual.py
+++ b/ethicml/algorithms/inprocess/manual.py
@@ -18,9 +18,27 @@ class Corels(InAlgorithm):
     From this paper: https://arxiv.org/abs/1704.01701
     """
 
-    def __init__(self) -> None:
+    def __init__(self, seed: int = 888) -> None:
         """Constructor of the class."""
-        super().__init__(name="CORELS")
+        super().__init__(name="CORELS", seed=seed)
+ + @implements(InAlgorithm) + def fit(self, train: DataTuple) -> InAlgorithm: + return self + + @implements(InAlgorithm) + def predict(self, test: TestTuple) -> Prediction: + if test.name is None or "Compas" not in test.name or "sex" not in test.s.columns: + raise RuntimeError("The Corels algorithm only works on the COMPAS dataset") + age = test.x["age-num"].to_numpy() + priors = test.x["priors-count"].to_numpy() + sex = test.s["sex"].to_numpy() + male = 1 + condition1 = (age >= 18) & (age <= 20) & (sex == male) + condition2 = (age >= 21) & (age <= 23) & (priors >= 2) & (priors <= 3) + condition3: np.ndarray = priors > 3 + pred = np.where(condition1 | condition2 | condition3, np.ones_like(age), np.zeros_like(age)) + return Prediction(hard=pd.Series(pred)) @implements(InAlgorithm) def run(self, _: DataTuple, test: TestTuple) -> Prediction: diff --git a/ethicml/algorithms/inprocess/mlp.py b/ethicml/algorithms/inprocess/mlp.py index 57a2aefc..bd27a752 100644 --- a/ethicml/algorithms/inprocess/mlp.py +++ b/ethicml/algorithms/inprocess/mlp.py @@ -30,7 +30,7 @@ def __init__( activation: Optional[ActivationType] = None, seed: int = 888, ): - super().__init__(name="MLP", is_fairness_algo=False) + super().__init__(name="MLP", is_fairness_algo=False, seed=seed) if hidden_layer_sizes is None: self.hidden_layer_sizes = MLPClassifier().hidden_layer_sizes else: @@ -38,7 +38,16 @@ def __init__( self.activation: ActivationType = ( MLPClassifier().activation if activation is None else activation ) - self.seed = seed + + @implements(InAlgorithm) + def fit(self, train: DataTuple) -> InAlgorithm: + self.clf = select_mlp(self.hidden_layer_sizes, self.activation, seed=self.seed) + self.clf.fit(train.x, train.y.to_numpy().ravel()) + return self + + @implements(InAlgorithm) + def predict(self, test: TestTuple) -> Prediction: + return Prediction(hard=pd.Series(self.clf.predict(test.x))) @implements(InAlgorithm) def run(self, train: DataTuple, test: TestTuple) -> Prediction: diff --git a/ethicml/algorithms/inprocess/oracle.py b/ethicml/algorithms/inprocess/oracle.py index 85374b9e..66e2e6de 100644 --- a/ethicml/algorithms/inprocess/oracle.py +++ b/ethicml/algorithms/inprocess/oracle.py @@ -17,8 +17,17 @@ class Oracle(InAlgorithm): but can be useful if you want to either do a sanity check, or report potential values. """ - def __init__(self) -> None: - super().__init__(name="Oracle", is_fairness_algo=False) + def __init__(self, seed: int = 888) -> None: + super().__init__(name="Oracle", is_fairness_algo=False, seed=seed) + + @implements(InAlgorithm) + def fit(self, train: DataTuple) -> InAlgorithm: + return self + + @implements(InAlgorithm) + def predict(self, test: TestTuple) -> Prediction: + assert isinstance(test, DataTuple), "test must be a DataTuple." + return Prediction(hard=test.y[test.y.columns[0]].copy()) @implements(InAlgorithm) def run(self, train: DataTuple, test: TestTuple) -> Prediction: @@ -34,12 +43,23 @@ class DPOracle(InAlgorithm): but can be useful if you want to either do a sanity check, or report potential values. """ - def __init__(self) -> None: - super().__init__(name="DemPar. Oracle", is_fairness_algo=True) + def __init__(self, seed: int = 888) -> None: + super().__init__(name="DemPar. Oracle", is_fairness_algo=True, seed=seed) + + @implements(InAlgorithm) + def fit(self, train: DataTuple) -> InAlgorithm: + return self + + @implements(InAlgorithm) + def predict(self, test: TestTuple) -> Prediction: + assert isinstance(test, DataTuple), "test must be a DataTuple." 
+ flipper = DPFlip(seed=self.seed) + test_preds = Prediction(test.y[test.y.columns[0]].copy()) + return flipper.run(test_preds, test, test_preds, test) @implements(InAlgorithm) def run(self, train: DataTuple, test: TestTuple) -> Prediction: assert isinstance(test, DataTuple), "test must be a DataTuple." - flipper = DPFlip() + flipper = DPFlip(seed=self.seed) test_preds = Prediction(test.y[test.y.columns[0]].copy()) return flipper.run(test_preds, test, test_preds, test) diff --git a/ethicml/algorithms/inprocess/shared.py b/ethicml/algorithms/inprocess/shared.py index acdc5909..9087beaf 100644 --- a/ethicml/algorithms/inprocess/shared.py +++ b/ethicml/algorithms/inprocess/shared.py @@ -9,13 +9,24 @@ def flag_interface( - train_path: Path, test_path: Path, pred_path: Path, flags: Dict[str, Any] + flags: Dict[str, Any], + *, + train_path: Optional[Path] = None, + test_path: Optional[Path] = None, + model_path: Optional[Path] = None, + pred_path: Optional[Path] = None, ) -> List[str]: """Generate the commandline arguments that are expected by the script about to be called.""" # paths to training and test data - data_flags: Dict[str, Any] = {"train": train_path, "test": test_path} - # paths to output files - data_flags.update({"predictions": pred_path}) + data_flags: Dict[str, Any] = {} + if train_path is not None: + data_flags["train"] = train_path + if test_path is not None: + data_flags["test"] = test_path + if model_path is not None: + data_flags["model"] = model_path + if pred_path is not None: + data_flags["predictions"] = pred_path data_flags.update(flags) flags_list: List[str] = [] diff --git a/ethicml/algorithms/inprocess/svm.py b/ethicml/algorithms/inprocess/svm.py index 650b491b..df386c4a 100644 --- a/ethicml/algorithms/inprocess/svm.py +++ b/ethicml/algorithms/inprocess/svm.py @@ -18,10 +18,19 @@ class SVM(InAlgorithm): def __init__(self, C: Optional[float] = None, kernel: Optional[str] = None, seed: int = 888): kernel_name = f" ({kernel})" if kernel is not None else "" - super().__init__(name="SVM" + kernel_name, is_fairness_algo=False) + super().__init__(name="SVM" + kernel_name, is_fairness_algo=False, seed=seed) self.C = SVC().C if C is None else C self.kernel = SVC().kernel if kernel is None else kernel - self.seed = seed + + @implements(InAlgorithm) + def fit(self, train: DataTuple) -> InAlgorithm: + self.clf = select_svm(self.C, self.kernel, self.seed) + self.clf.fit(train.x, train.y.to_numpy().ravel()) + return self + + @implements(InAlgorithm) + def predict(self, test: TestTuple) -> Prediction: + return Prediction(hard=pd.Series(self.clf.predict(test.x))) @implements(InAlgorithm) def run(self, train: DataTuple, test: Union[DataTuple, TestTuple]) -> Prediction: diff --git a/ethicml/algorithms/inprocess/svm_async.py b/ethicml/algorithms/inprocess/svm_async.py index f6ba4cef..c5f86b92 100644 --- a/ethicml/algorithms/inprocess/svm_async.py +++ b/ethicml/algorithms/inprocess/svm_async.py @@ -1,7 +1,8 @@ """Wrapper for SKLearn implementation of SVM.""" from pathlib import Path -from typing import List, Optional +from typing import List, Optional, Union +from ranzen import implements from sklearn.svm import SVC from ethicml.algorithms.inprocess.in_algorithm import InAlgorithmAsync @@ -14,14 +15,38 @@ class SVMAsync(InAlgorithmAsync): """Support Vector Machine.""" - def __init__(self, C: Optional[float] = None, kernel: Optional[str] = None, seed: int = 888): - super().__init__(name="SVM", is_fairness_algo=False) + def __init__( + self, + dir: Union[str, Path], + C: Optional[float] = 
None, + kernel: Optional[str] = None, + seed: int = 888, + ): + super().__init__(name="SVM", is_fairness_algo=False, seed=seed) + self.model_dir = dir if isinstance(dir, Path) else Path(dir) self.flags = { "c": SVC().C if C is None else C, "kernel": SVC().kernel if kernel is None else kernel, "seed": seed, } - def _script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]: - args = flag_interface(train_path, test_path, pred_path, self.flags) + @implements(InAlgorithmAsync) + def _run_script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]: + args = flag_interface( + train_path=train_path, test_path=test_path, pred_path=pred_path, flags=self.flags + ) + return ["-m", "ethicml.implementations.svm"] + args + + @implements(InAlgorithmAsync) + def _fit_script_command(self, train_path: Path, model_path: Path) -> List[str]: + args = flag_interface(train_path=train_path, model_path=model_path, flags=self.flags) + return ["-m", "ethicml.implementations.svm"] + args + + @implements(InAlgorithmAsync) + def _predict_script_command( + self, model_path: Path, test_path: Path, pred_path: Path + ) -> List[str]: + args = flag_interface( + model_path=model_path, test_path=test_path, pred_path=pred_path, flags=self.flags + ) return ["-m", "ethicml.implementations.svm"] + args diff --git a/ethicml/algorithms/postprocess/dp_flip.py b/ethicml/algorithms/postprocess/dp_flip.py index 32301c75..691e5755 100644 --- a/ethicml/algorithms/postprocess/dp_flip.py +++ b/ethicml/algorithms/postprocess/dp_flip.py @@ -15,8 +15,21 @@ class DPFlip(PostAlgorithm): """Randomly flip a number of decisions such that perfect demographic parity is achieved.""" def __init__(self, seed: int = 888) -> None: - super().__init__(name="DemPar. Post Process") - self.seed = seed + super().__init__(name="DemPar. 
Post Process", seed=seed) + + @implements(PostAlgorithm) + def fit(self, train_predictions: Prediction, train: DataTuple) -> PostAlgorithm: + return self + + @implements(PostAlgorithm) + def predict(self, test_predictions: Prediction, test: TestTuple) -> Prediction: + x, y = self._fit(test, test_predictions) + _test_preds = self._flip( + test_predictions, test, flip_0_to_1=True, num_to_flip=x, s_group=0, seed=self.seed + ) + return self._flip( + _test_preds, test, flip_0_to_1=False, num_to_flip=y, s_group=1, seed=self.seed + ) @implements(PostAlgorithm) def run( diff --git a/ethicml/algorithms/postprocess/hardt.py b/ethicml/algorithms/postprocess/hardt.py index 59f88fb2..a991c040 100644 --- a/ethicml/algorithms/postprocess/hardt.py +++ b/ethicml/algorithms/postprocess/hardt.py @@ -19,10 +19,19 @@ class Hardt(PostAlgorithm): """Post-processing method by Hardt et al.""" def __init__(self, unfavorable_label: int = 0, favorable_label: int = 1, seed: int = 888): - super().__init__(name="Hardt") + super().__init__(name="Hardt", seed=seed) self._unfavorable_label = unfavorable_label self._favorable_label = favorable_label - self._random = RandomState(seed=seed) + self._random = RandomState(seed=self.seed) + + @implements(PostAlgorithm) + def fit(self, train_predictions: Prediction, train: DataTuple) -> PostAlgorithm: + self.model_params = self._fit(train_predictions, train) + return self + + @implements(PostAlgorithm) + def predict(self, test_predictions: Prediction, test: TestTuple) -> Prediction: + return self._predict(self.model_params, test_predictions, test) @implements(PostAlgorithm) def run( diff --git a/ethicml/algorithms/postprocess/post_algorithm.py b/ethicml/algorithms/postprocess/post_algorithm.py index 4e581c0a..bce1bf06 100644 --- a/ethicml/algorithms/postprocess/post_algorithm.py +++ b/ethicml/algorithms/postprocess/post_algorithm.py @@ -1,4 +1,5 @@ """Abstract Base Class of all post-processing algorithms in the framework.""" +from __future__ import annotations from abc import abstractmethod @@ -12,6 +13,28 @@ class PostAlgorithm(Algorithm): """Abstract Base Class for all algorithms that do post-processing.""" + @abstractmethod + def fit(self, train_predictions: Prediction, train: DataTuple) -> PostAlgorithm: + """Run Algorithm on the given data. + + Args: + train: training data + + Returns: + self, but trained. + """ + + @abstractmethod + def predict(self, test_predictions: Prediction, test: TestTuple) -> Prediction: + """Run Algorithm on the given data. 
diff --git a/ethicml/algorithms/postprocess/post_algorithm.py b/ethicml/algorithms/postprocess/post_algorithm.py
index 4e581c0a..bce1bf06 100644
--- a/ethicml/algorithms/postprocess/post_algorithm.py
+++ b/ethicml/algorithms/postprocess/post_algorithm.py
@@ -1,4 +1,5 @@
 """Abstract Base Class of all post-processing algorithms in the framework."""
+from __future__ import annotations
 
 from abc import abstractmethod
 
@@ -12,6 +13,28 @@ class PostAlgorithm(Algorithm):
     """Abstract Base Class for all algorithms that do post-processing."""
 
+    @abstractmethod
+    def fit(self, train_predictions: Prediction, train: DataTuple) -> PostAlgorithm:
+        """Fit the post-processor on the given data.
+
+        Args:
+            train_predictions: predictions on the training data
+            train: training data
+
+        Returns:
+            self, but trained.
+        """
+
+    @abstractmethod
+    def predict(self, test_predictions: Prediction, test: TestTuple) -> Prediction:
+        """Make predictions on the given data.
+
+        Args:
+            test_predictions: predictions on the test data
+            test: data to evaluate on
+
+        Returns:
+            predictions
+        """
+
     @abstractmethod
     def run(
         self,
diff --git a/ethicml/algorithms/preprocess/beutel.py b/ethicml/algorithms/preprocess/beutel.py
index 0c31e52d..8613a4cf 100644
--- a/ethicml/algorithms/preprocess/beutel.py
+++ b/ethicml/algorithms/preprocess/beutel.py
@@ -2,6 +2,8 @@
 from pathlib import Path
 from typing import Dict, List, Sequence, Union
 
+from ranzen import implements
+
 from ethicml.utility import FairnessType
 
 from .interface import flag_interface
@@ -15,6 +17,7 @@ class Beutel(PreAlgorithmAsync):
 
     def __init__(
         self,
+        dir: Union[str, Path],
         fairness: FairnessType = "DP",
         enc_size: Sequence[int] = (40,),
         adv_size: Sequence[int] = (40,),
@@ -30,7 +33,8 @@ def __init__(
         seed: int = 888,
     ):
         # pylint: disable=too-many-arguments
-        super().__init__(name=f"Beutel {fairness}")
+        super().__init__(name=f"Beutel {fairness}", seed=seed, out_size=enc_size[-1])
+        self.model_dir = dir if isinstance(dir, Path) else Path(dir)
         self.flags: Dict[str, Union[str, Sequence[int], int, float]] = {
             "fairness": fairness,
             "enc_size": enc_size,
@@ -47,8 +51,39 @@ def __init__(
             "seed": seed,
         }
 
-    def _script_command(
+    @implements(PreAlgorithmAsync)
+    def _run_script_command(
         self, train_path: Path, test_path: Path, new_train_path: Path, new_test_path: Path
     ) -> List[str]:
-        args = flag_interface(train_path, test_path, new_train_path, new_test_path, self.flags)
+        args = flag_interface(
+            train_path=train_path,
+            test_path=test_path,
+            new_train_path=new_train_path,
+            new_test_path=new_test_path,
+            flags=self.flags,
+        )
+        return ["-m", "ethicml.implementations.beutel"] + args
+
+    @implements(PreAlgorithmAsync)
+    def _fit_script_command(
+        self, train_path: Path, new_train_path: Path, model_path: Path
+    ) -> List[str]:
+        args = flag_interface(
+            train_path=train_path,
+            new_train_path=new_train_path,
+            model_path=model_path,
+            flags=self.flags,
+        )
+        return ["-m", "ethicml.implementations.beutel"] + args
+
+    @implements(PreAlgorithmAsync)
+    def _transform_script_command(
+        self, model_path: Path, test_path: Path, new_test_path: Path
+    ) -> List[str]:
+        args = flag_interface(
+            model_path=model_path,
+            test_path=test_path,
+            new_test_path=new_test_path,
+            flags=self.flags,
+        )
         return ["-m", "ethicml.implementations.beutel"] + args
diff --git a/ethicml/algorithms/preprocess/calders.py b/ethicml/algorithms/preprocess/calders.py
index 31b1d1a8..a9aba8b0 100644
--- a/ethicml/algorithms/preprocess/calders.py
+++ b/ethicml/algorithms/preprocess/calders.py
@@ -6,7 +6,7 @@
 from ethicml.utility import DataTuple, SoftPrediction, TestTuple, concat_dt
 
 from ..inprocess.logistic_regression import LRProb
-from .pre_algorithm import PreAlgorithm
+from .pre_algorithm import PreAlgorithm, T
 
 __all__ = ["Calders"]
 
@@ -14,14 +14,32 @@
 class Calders(PreAlgorithm):
     """Massaging algorithm from Kamiran&Calders 2012."""
 
-    def __init__(self, preferable_class: int, disadvantaged_group: int):
-        super().__init__(name="Calders")
+    def __init__(self, preferable_class: int, disadvantaged_group: int, seed: int = 888):
+        super().__init__(name="Calders", seed=seed, out_size=None)
         self.preferable_class = preferable_class
         self.disadvantaged_group = disadvantaged_group
 
+    @implements(PreAlgorithm)
+    def fit(self, train: DataTuple) -> Tuple[PreAlgorithm, DataTuple]:
+        self._out_size = train.x.shape[1]
+        new_train, _ = _calders_algorithm(
+            train, train, self.preferable_class, self.disadvantaged_group
+        )
+        return self, new_train.replace(name=f"{self.name}: {train.name}")
+
+    @implements(PreAlgorithm)
+    def transform(self, data: T) -> T:
+        return data.replace(name=f"{self.name}: {data.name}")
+
     @implements(PreAlgorithm)
     def run(self, train: DataTuple, test: TestTuple) -> Tuple[DataTuple, TestTuple]:
-        return _calders_algorithm(train, test, self.preferable_class, self.disadvantaged_group)
+        self._out_size = train.x.shape[1]
+        new_train, new_test = _calders_algorithm(
+            train, test, self.preferable_class, self.disadvantaged_group
+        )
+        return new_train.replace(name=f"{self.name}: {train.name}"), new_test.replace(
+            name=f"{self.name}: {test.name}"
+        )
 
 
 def _calders_algorithm(
diff --git a/ethicml/algorithms/preprocess/interface.py b/ethicml/algorithms/preprocess/interface.py
index 24b99274..cd12ed7f 100644
--- a/ethicml/algorithms/preprocess/interface.py
+++ b/ethicml/algorithms/preprocess/interface.py
@@ -1,20 +1,30 @@
 """Methods that define commandline interfaces."""
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 
 def flag_interface(
-    train_path: Path,
-    test_path: Path,
-    new_train_path: Path,
-    new_test_path: Path,
     flags: Dict[str, Any],
+    train_path: Optional[Path] = None,
+    test_path: Optional[Path] = None,
+    new_train_path: Optional[Path] = None,
+    new_test_path: Optional[Path] = None,
+    model_path: Optional[Path] = None,
 ) -> List[str]:
     """Generate the commandline arguments that are expected by the script about to be called."""
-    # paths to training and test data
-    data_flags: Dict[str, Any] = {"train": train_path, "test": test_path}
+    # paths to input and output files; only the paths that were given become flags
+    data_flags: Dict[str, Any] = {}
+    if train_path is not None:
+        data_flags["train"] = str(train_path)
+    if new_train_path is not None:
+        data_flags["new_train"] = str(new_train_path)
+    if test_path is not None:
+        data_flags["test"] = str(test_path)
+    if new_test_path is not None:
+        data_flags["new_test"] = str(new_test_path)
+    if model_path is not None:
+        data_flags["model"] = str(model_path)
-    # paths to output files
-    data_flags.update({"new_train": new_train_path, "new_test": new_test_path})
     data_flags.update(flags)
 
     flags_list: List[str] = []
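
Both `flag_interface` variants now take every path as an optional keyword, so each of the three script commands passes only the files it actually needs. Roughly (the exact flag formatting is done by the loop that follows in the file):

```python
from pathlib import Path

from ethicml.algorithms.preprocess.interface import flag_interface

# fit only needs the train data, the transformed-train output and the model file:
args = flag_interface(
    flags={"epochs": 10, "seed": 888},
    train_path=Path("train.npz"),
    new_train_path=Path("new_train.npz"),
    model_path=Path("model.joblib"),
)
# `args` now contains commandline flags for exactly these three paths plus the
# algorithm flags; the unset paths (test/new_test) are simply omitted.
```
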
diff --git a/ethicml/algorithms/preprocess/pre_algorithm.py b/ethicml/algorithms/preprocess/pre_algorithm.py
index 44d9bf6e..38f05783 100644
--- a/ethicml/algorithms/preprocess/pre_algorithm.py
+++ b/ethicml/algorithms/preprocess/pre_algorithm.py
@@ -1,9 +1,11 @@
 """Abstract Base Class of all algorithms in the framework."""
+from __future__ import annotations
+
 from abc import abstractmethod
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import List, Tuple
+from typing import List, Optional, Tuple, TypeVar
 
 from ranzen import implements
 
@@ -12,10 +14,47 @@
 
 __all__ = ["PreAlgorithm", "PreAlgorithmAsync"]
 
+T = TypeVar("T", DataTuple, TestTuple)
+
 
 class PreAlgorithm(Algorithm):
     """Abstract Base Class for all algorithms that do pre-processing."""
 
+    def __init__(self, name: str, seed: int, out_size: Optional[int]):
+        """Base constructor for the PreAlgorithm class.
+
+        Args:
+            name: name of the algorithm
+            seed: seed for the random number generator
+            out_size: number of features to generate
+        """
+        super().__init__(name=name, seed=seed)
+        self._out_size = out_size
+
+    @abstractmethod
+    def fit(self, train: DataTuple) -> Tuple[PreAlgorithm, DataTuple]:
+        """Generate fair features with the given data.
+
+        Args:
+            train: training data
+
+        Returns:
+            a tuple of the fitted algorithm and the transformed training data
+        """
+
+    @abstractmethod
+    def transform(self, data: T) -> T:
+        """Transform the given data with the fitted algorithm.
+
+        Args:
+            data: data to transform
+
+        Returns:
+            the transformed data
+        """
+
     @abstractmethod
     def run(self, train: DataTuple, test: TestTuple) -> Tuple[DataTuple, TestTuple]:
         """Generate fair features with the given data.
@@ -33,10 +72,25 @@ def run_test(self, train: DataTuple, test: TestTuple) -> Tuple[DataTuple, TestTu
         train_testing = train.get_subset()
         return self.run(train_testing, test)
 
+    @property
+    def out_size(self) -> int:
+        """The number of features to generate."""
+        assert self._out_size is not None
+        return self._out_size
+
 
 class PreAlgorithmAsync(PreAlgorithm, AlgorithmAsync):
     """Pre-Algorithm that can be run blocking and asynchronously."""
 
+    @implements(PreAlgorithm)
+    def fit(self, train: DataTuple) -> Tuple[PreAlgorithm, DataTuple]:
+        model, data = run_blocking(self.fit_async(train))
+        return self, data
+
+    @implements(PreAlgorithm)
+    def transform(self, data: T) -> T:
+        return run_blocking(self.transform_async(data))
+
     @implements(PreAlgorithm)
     def run(self, train: DataTuple, test: TestTuple) -> Tuple[DataTuple, TestTuple]:
         """Generate fair features with the given data by running as a blocking function.
@@ -50,6 +104,73 @@ def run(self, train: DataTuple, test: TestTuple) -> Tuple[DataTuple, TestTuple]:
         """
         return run_blocking(self.run_async(train, test))
 
+    async def fit_async(self, train: DataTuple) -> Tuple[PreAlgorithm, DataTuple]:
+        """Generate fair features with the given data asynchronously.
+
+        Args:
+            train: training data
+
+        Returns:
+            a tuple of the fitted algorithm and the transformed training data
+        """
+        self.model_path = self.model_dir / f"model_{self.name}.joblib"
+        with TemporaryDirectory() as tmpdir:
+            tmp_path = Path(tmpdir)
+            # ================================ write data to files ================================
+            train_path = tmp_path / "train.npz"
+            train.to_npz(train_path)
+
+            # ========================== generate commandline arguments ===========================
+            transformed_train_path = tmp_path / "transformed_train.npz"
+            cmd = self._fit_script_command(train_path, transformed_train_path, self.model_path)
+
+            # ============================= run the generated command =============================
+            await self._call_script(cmd + ["--mode", "fit"])
+
+            # ================================== load results =====================================
+            transformed_train = DataTuple.from_npz(transformed_train_path)
+
+            # prefix the name of the algorithm to the dataset name
+            transformed_train = transformed_train.replace(
+                name=None if train.name is None else f"{self.name}: {train.name}"
+            )
+            return self, transformed_train
+
+    async def transform_async(self, data: T) -> T:
+        """Transform the given data asynchronously.
+
+        Args:
+            data: data to transform
+
+        Returns:
+            the transformed data
+        """
+        with TemporaryDirectory() as tmpdir:
+            tmp_path = Path(tmpdir)
+            # ================================ write data to files ================================
+            test_path = tmp_path / "test.npz"
+            data.to_npz(test_path)
+
+            # ========================== generate commandline arguments ===========================
+            transformed_test_path = tmp_path / "transformed_test.npz"
+            cmd = self._transform_script_command(
+                model_path=self.model_path, test_path=test_path, new_test_path=transformed_test_path
+            )
+
+            # ============================= run the generated command =============================
+            await self._call_script(cmd + ["--mode", "transform"])
+
+            # ================================== load results =====================================
+            transformed_test = TestTuple.from_npz(transformed_test_path)
+
+            # prefix the name of the algorithm to the dataset name
+            transformed_test = transformed_test.replace(
+                name=None if data.name is None else f"{self.name}: {data.name}"
+            )
+            return transformed_test
+
     async def run_async(self, train: DataTuple, test: TestTuple) -> Tuple[DataTuple, TestTuple]:
         """Generate fair features with the given data asynchronously.
@@ -70,12 +191,12 @@ async def run_async(self, train: DataTuple, test: TestTuple) -> Tuple[DataTuple,
             # ========================== generate commandline arguments ===========================
             transformed_train_path = tmp_path / "transformed_train.npz"
             transformed_test_path = tmp_path / "transformed_test.npz"
-            cmd = self._script_command(
+            cmd = self._run_script_command(
                 train_path, test_path, transformed_train_path, transformed_test_path
             )
 
             # ============================= run the generated command =============================
-            await self._call_script(cmd)
+            await self._call_script(cmd + ["--mode", "run"])
 
             # ================================== load results =====================================
             transformed_train = DataTuple.from_npz(transformed_train_path)
@@ -91,7 +212,19 @@ async def run_async(self, train: DataTuple, test: TestTuple) -> Tuple[DataTuple,
         return transformed_train, transformed_test
 
     @abstractmethod
-    def _script_command(
+    def _run_script_command(
         self, train_path: Path, test_path: Path, new_train_path: Path, new_test_path: Path
     ) -> List[str]:
         """The command that will run the script."""
+
+    @abstractmethod
+    def _fit_script_command(
+        self, train_path: Path, new_train_path: Path, model_path: Path
+    ) -> List[str]:
+        """The command that will run the script."""
+
+    @abstractmethod
+    def _transform_script_command(
+        self, model_path: Path, test_path: Path, new_test_path: Path
+    ) -> List[str]:
+        """The command that will run the script."""
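
End to end, the new pre-processing API mirrors the in-processing one. A sketch with Beutel (assumes `train`/`test` exist; `out_size` is known up front because `Beutel` passes `enc_size[-1]` to the base constructor):

```python
# Sketch: `train` (DataTuple) and `test` (TestTuple) are assumed to exist.
from tempfile import TemporaryDirectory

from ethicml.algorithms.preprocess.beutel import Beutel

with TemporaryDirectory() as tmpdir:
    pre = Beutel(dir=tmpdir, fairness="DP", enc_size=[40])
    pre, new_train = pre.fit(train)  # --mode fit: trains the encoder, transforms train
    new_test = pre.transform(test)   # --mode transform: reuses the persisted encoder
    assert new_train.x.shape[1] == pre.out_size
    # one-shot equivalent:
    new_train, new_test = pre.run(train, test)
```
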
"preferential", "naive"] self.strategy = strategy - self.seed = seed + + @implements(PreAlgorithm) + def fit(self, train: DataTuple) -> Tuple[PreAlgorithm, DataTuple]: + self._out_size = train.x.shape[1] + new_train, _ = upsample(train, train, self.strategy, self.seed, name=self.name) + return self, new_train + + @implements(PreAlgorithm) + def transform(self, data: T) -> T: + return data.replace(name=f"{self.name}: {data.name}") @implements(PreAlgorithm) def run(self, train: DataTuple, test: TestTuple) -> Tuple[DataTuple, TestTuple]: - return upsample(train, test, self.strategy, self.seed) + self._out_size = train.x.shape[1] + return upsample(train, test, self.strategy, self.seed, name=self.name) def concat_datatuples(first_dt: DataTuple, second_dt: DataTuple) -> DataTuple: @@ -61,6 +71,7 @@ def upsample( test: TestTuple, strategy: Literal["uniform", "preferential", "naive"], seed: int, + name: str, ) -> Tuple[DataTuple, TestTuple]: """Upsample a datatuple.""" s_col = dataset.s.columns[0] @@ -122,6 +133,7 @@ def upsample( upsampled_datatuple = val else: upsampled_datatuple = concat_datatuples(upsampled_datatuple, val) + upsampled_datatuple = upsampled_datatuple.replace(name=f"{name}: {dataset.name}") if strategy == "preferential": ranker = LRProb() @@ -163,8 +175,8 @@ def upsample( x=upsampled_dataframes[x_columns], s=upsampled_dataframes[s_columns], y=upsampled_dataframes[y_columns], - name=dataset.name, + name=f"{name}: {dataset.name}", ) assert upsampled_datatuple is not None - return upsampled_datatuple, TestTuple(x=test.x, s=test.s, name=test.name) + return upsampled_datatuple, TestTuple(x=test.x, s=test.s, name=f"{name}: {test.name}") diff --git a/ethicml/algorithms/preprocess/vfae.py b/ethicml/algorithms/preprocess/vfae.py index 29b22703..4ce80a44 100644 --- a/ethicml/algorithms/preprocess/vfae.py +++ b/ethicml/algorithms/preprocess/vfae.py @@ -2,6 +2,8 @@ from pathlib import Path from typing import Dict, List, Optional, Union +from ranzen import implements + from .interface import flag_interface from .pre_algorithm import PreAlgorithmAsync @@ -13,18 +15,21 @@ class VFAE(PreAlgorithmAsync): def __init__( self, + dir: Union[str, Path], dataset: str, supervised: bool = True, epochs: int = 10, batch_size: int = 32, fairness: str = "DI", + latent_dims: int = 50, z1_enc_size: Optional[List[int]] = None, z2_enc_size: Optional[List[int]] = None, z1_dec_size: Optional[List[int]] = None, seed: int = 888, ): # pylint: disable=too-many-arguments - super().__init__(name="VFAE") + super().__init__(name="VFAE", seed=seed, out_size=latent_dims) + self.model_dir = dir if isinstance(dir, Path) else Path(dir) if z1_enc_size is None: z1_enc_size = [100] @@ -39,14 +44,46 @@ def __init__( "batch_size": batch_size, "epochs": epochs, "dataset": dataset, + "latent_dims": latent_dims, "z1_enc_size": z1_enc_size, "z2_enc_size": z2_enc_size, "z1_dec_size": z1_dec_size, "seed": seed, } - def _script_command( + @implements(PreAlgorithmAsync) + def _run_script_command( self, train_path: Path, test_path: Path, new_train_path: Path, new_test_path: Path ) -> List[str]: - args = flag_interface(train_path, test_path, new_train_path, new_test_path, self.flags) + args = flag_interface( + train_path=train_path, + test_path=test_path, + new_train_path=new_train_path, + new_test_path=new_test_path, + flags=self.flags, + ) + return ["-m", "ethicml.implementations.vfae"] + args + + @implements(PreAlgorithmAsync) + def _fit_script_command( + self, train_path: Path, new_train_path: Path, model_path: Path + ) -> List[str]: + 
diff --git a/ethicml/algorithms/preprocess/vfae.py b/ethicml/algorithms/preprocess/vfae.py
index 29b22703..4ce80a44 100644
--- a/ethicml/algorithms/preprocess/vfae.py
+++ b/ethicml/algorithms/preprocess/vfae.py
@@ -2,6 +2,8 @@
 from pathlib import Path
 from typing import Dict, List, Optional, Union
 
+from ranzen import implements
+
 from .interface import flag_interface
 from .pre_algorithm import PreAlgorithmAsync
 
@@ -13,18 +15,21 @@ class VFAE(PreAlgorithmAsync):
 
     def __init__(
         self,
+        dir: Union[str, Path],
         dataset: str,
         supervised: bool = True,
         epochs: int = 10,
         batch_size: int = 32,
         fairness: str = "DI",
+        latent_dims: int = 50,
         z1_enc_size: Optional[List[int]] = None,
         z2_enc_size: Optional[List[int]] = None,
         z1_dec_size: Optional[List[int]] = None,
         seed: int = 888,
     ):
         # pylint: disable=too-many-arguments
-        super().__init__(name="VFAE")
+        super().__init__(name="VFAE", seed=seed, out_size=latent_dims)
+        self.model_dir = dir if isinstance(dir, Path) else Path(dir)
         if z1_enc_size is None:
             z1_enc_size = [100]
@@ -39,14 +44,46 @@ def __init__(
             "batch_size": batch_size,
             "epochs": epochs,
             "dataset": dataset,
+            "latent_dims": latent_dims,
             "z1_enc_size": z1_enc_size,
             "z2_enc_size": z2_enc_size,
             "z1_dec_size": z1_dec_size,
             "seed": seed,
         }
 
-    def _script_command(
+    @implements(PreAlgorithmAsync)
+    def _run_script_command(
         self, train_path: Path, test_path: Path, new_train_path: Path, new_test_path: Path
     ) -> List[str]:
-        args = flag_interface(train_path, test_path, new_train_path, new_test_path, self.flags)
+        args = flag_interface(
+            train_path=train_path,
+            test_path=test_path,
+            new_train_path=new_train_path,
+            new_test_path=new_test_path,
+            flags=self.flags,
+        )
+        return ["-m", "ethicml.implementations.vfae"] + args
+
+    @implements(PreAlgorithmAsync)
+    def _fit_script_command(
+        self, train_path: Path, new_train_path: Path, model_path: Path
+    ) -> List[str]:
+        args = flag_interface(
+            train_path=train_path,
+            new_train_path=new_train_path,
+            model_path=model_path,
+            flags=self.flags,
+        )
+        return ["-m", "ethicml.implementations.vfae"] + args
+
+    @implements(PreAlgorithmAsync)
+    def _transform_script_command(
+        self, model_path: Path, test_path: Path, new_test_path: Path
+    ) -> List[str]:
+        args = flag_interface(
+            model_path=model_path,
+            test_path=test_path,
+            new_test_path=new_test_path,
+            flags=self.flags,
+        )
         return ["-m", "ethicml.implementations.vfae"] + args
diff --git a/ethicml/algorithms/preprocess/zemel.py b/ethicml/algorithms/preprocess/zemel.py
index 9db5d270..51bcf119 100644
--- a/ethicml/algorithms/preprocess/zemel.py
+++ b/ethicml/algorithms/preprocess/zemel.py
@@ -1,18 +1,23 @@
 """Zemel's Learned Fair Representations."""
 from pathlib import Path
-from typing import Dict, List, Union
+from typing import Dict, List, Tuple, Union
+
+from ranzen import implements
 
 from .interface import flag_interface
-from .pre_algorithm import PreAlgorithmAsync
+from .pre_algorithm import PreAlgorithm, PreAlgorithmAsync
 
 __all__ = ["Zemel"]
 
+from ethicml.utility import DataTuple, TestTuple
+
 
 class Zemel(PreAlgorithmAsync):
     """AIF360 implementation of Zemel's LFR."""
 
     def __init__(
         self,
+        dir: Union[str, Path],
         threshold: float = 0.5,
         clusters: int = 2,
         Ax: float = 0.01,
@@ -23,7 +28,8 @@ def __init__(
         epsilon: float = 1e-5,
         seed: int = 888,
     ) -> None:
-        super().__init__(name="Zemel")
+        super().__init__(name="Zemel", seed=seed, out_size=None)
+        self.model_dir = dir if isinstance(dir, Path) else Path(dir)
         self.flags: Dict[str, Union[int, float]] = {
             "clusters": clusters,
             "Ax": Ax,
@@ -36,8 +42,54 @@ def __init__(
             "seed": seed,
         }
 
-    def _script_command(
+    @implements(PreAlgorithm)
+    def run(self, train: DataTuple, test: TestTuple) -> Tuple[DataTuple, TestTuple]:
+        self._out_size = train.x.shape[1]
+        return super().run(train, test)
+
+    @implements(PreAlgorithm)
+    def fit(self, train: DataTuple) -> Tuple[PreAlgorithm, DataTuple]:
+        self._out_size = train.x.shape[1]
+        return super().fit(train)
+
+    @implements(PreAlgorithmAsync)
+    async def fit_async(self, train: DataTuple) -> Tuple[PreAlgorithm, DataTuple]:
+        self._out_size = train.x.shape[1]
+        return await super().fit_async(train)
+
+    @implements(PreAlgorithmAsync)
+    def _run_script_command(
         self, train_path: Path, test_path: Path, new_train_path: Path, new_test_path: Path
     ) -> List[str]:
-        args = flag_interface(train_path, test_path, new_train_path, new_test_path, self.flags)
+        args = flag_interface(
+            train_path=train_path,
+            test_path=test_path,
+            new_train_path=new_train_path,
+            new_test_path=new_test_path,
+            flags=self.flags,
+        )
+        return ["-m", "ethicml.implementations.zemel"] + args
+
+    @implements(PreAlgorithmAsync)
+    def _fit_script_command(
+        self, train_path: Path, new_train_path: Path, model_path: Path
+    ) -> List[str]:
+        args = flag_interface(
+            train_path=train_path,
+            new_train_path=new_train_path,
+            model_path=model_path,
+            flags=self.flags,
+        )
+        return ["-m", "ethicml.implementations.zemel"] + args
+
+    @implements(PreAlgorithmAsync)
+    def _transform_script_command(
+        self, model_path: Path, test_path: Path, new_test_path: Path
+    ) -> List[str]:
+        args = flag_interface(
+            model_path=model_path,
+            test_path=test_path,
+            new_test_path=new_test_path,
+            flags=self.flags,
+        )
         return ["-m", "ethicml.implementations.zemel"] + args
diff --git a/ethicml/data/tabular_data/acs_income.py b/ethicml/data/tabular_data/acs_income.py
index c7b592ce..f61b6868 100644
--- a/ethicml/data/tabular_data/acs_income.py
+++ b/ethicml/data/tabular_data/acs_income.py
@@ -35,7 +35,7 @@ class AdultSplits(Enum):
 
 
 @contextlib.contextmanager
-def download_dir(root: Path):
+def download_dir(root: Path) -> Iterator[None]:
     curdir = os.getcwd()
     os.chdir(root.expanduser().resolve())
     try:
@@ -48,7 +48,6 @@ def acs_income(
     root: Path,
     year: str,
     horizon: int,
-    survey: str,
     states: List[str],
     split: str = "Sex",
     target_threshold: int = 50_000,
@@ -59,7 +58,6 @@ def acs_income(
         root=root,
         year=year,
         horizon=horizon,
-        survey=survey,
         states=states,
         split=split,
         target_threshold=target_threshold,
@@ -75,7 +73,6 @@ def __init__(
         root: Union[str, Path],
         year: str,
         horizon: int,
-        survey: str,
         states: List[str],
         split: str = "Sex",
         target_threshold: int = 50_000,
@@ -90,7 +87,7 @@ def __init__(
 
         self.year = year
         self.horizon = horizon
-        self.survey = survey
+        self.survey = "person"
         self.states = states
         self.split = split
         self.target = "PINCP"
@@ -102,7 +99,7 @@ def __init__(
         self.sens_lookup = {"Sex": "SEX", "Race": "RAC1P"}
 
         state_string = "_".join(states)
-        self.name = f"ACS_Income_{year}_{survey}_{horizon}_{state_string}_{split}"
+        self.name = f"ACS_Income_{year}_{horizon}_{state_string}_{split}"
         self.class_label_spec = "PINCP_1"
         self.class_label_prefix = ["PINCP"]
         self.discard_non_one_hot = False
diff --git a/ethicml/data/tabular_data/german.py b/ethicml/data/tabular_data/german.py
index a1ccf436..72951040 100644
--- a/ethicml/data/tabular_data/german.py
+++ b/ethicml/data/tabular_data/german.py
@@ -14,15 +14,6 @@ class GermanSplits(Enum):
     CUSTOM = "Custom"
 
 
-def german(
-    split: Union[GermanSplits, str] = "Sex",
-    discrete_only: bool = False,
-    invert_s: bool = False,
-):
-    """German credit dataset."""
-    return German(split=split, discrete_only=discrete_only, invert_s=invert_s)
-
-
 class German(Dataset):
     """German credit dataset."""
 
@@ -145,3 +136,12 @@ def __init__(
             discrete_feature_groups=disc_feature_groups,
             invert_s=invert_s,
         )
+
+
+def german(
+    split: Union[GermanSplits, str] = "Sex",
+    discrete_only: bool = False,
+    invert_s: bool = False,
+) -> German:
+    """German credit dataset."""
+    return German(split=split, discrete_only=discrete_only, invert_s=invert_s)
diff --git a/ethicml/data/tabular_data/toy.py b/ethicml/data/tabular_data/toy.py
index 06f32eca..8a926336 100644
--- a/ethicml/data/tabular_data/toy.py
+++ b/ethicml/data/tabular_data/toy.py
@@ -14,7 +14,7 @@ def toy() -> Dataset:
 class Toy(Dataset):
     """Dataset with toy data for testing."""
 
-    def __init__(self):
+    def __init__(self) -> None:
         disc_feature_groups = {
             "disc_1": ["disc_1_a", "disc_1_b", "disc_1_c", "disc_1_d", "disc_1_e"],
             "disc_2": ["disc_2_x", "disc_2_y", "disc_2_z"],
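
The implementation scripts that follow all gain the same three-way `--mode` dispatch, which the wrappers drive via `_call_script`. The shared skeleton looks like this (a sketch distilled from the `agarwal` and `beutel` mains in this diff; `MyArgs`, `fit`, `predict` and `train_and_predict` are placeholders for each script's own definitions):

```python
# Placeholder skeleton; the concrete scripts fill in fit/predict/transform.
from pathlib import Path

from joblib import dump, load

from ethicml.utility import DataTuple, Prediction, TestTuple


def main() -> None:
    args = MyArgs().parse_args()  # hypothetical tap-style Args class with a `mode` field
    if args.mode == "run":        # legacy one-shot path: train and predict in one process
        train, test = DataTuple.from_npz(Path(args.train)), TestTuple.from_npz(Path(args.test))
        Prediction(hard=train_and_predict(train, test, args)["preds"]).to_npz(Path(args.predictions))
    elif args.mode == "fit":      # persist the model for a later predict call
        model = fit(DataTuple.from_npz(Path(args.train)), args)
        dump(model, Path(args.model))
    elif args.mode == "predict":  # load the persisted model and write predictions
        model = load(Path(args.model))
        preds = predict(model, TestTuple.from_npz(Path(args.test)))
        Prediction(hard=preds["preds"]).to_npz(Path(args.predictions))
    else:
        raise RuntimeError(f"Unknown mode: {args.mode}")
```
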
diff --git a/ethicml/implementations/agarwal.py b/ethicml/implementations/agarwal.py
index c1eaadb3..6506e9a6 100644
--- a/ethicml/implementations/agarwal.py
+++ b/ethicml/implementations/agarwal.py
@@ -1,9 +1,12 @@
 """Implementation of logistic regression (actually just a wrapper around sklearn)."""
+import contextlib
+import os
 import random
 from pathlib import Path
+from typing import Iterator
 
 import numpy as np
 import pandas as pd
+from joblib import dump, load
 from sklearn.linear_model import LogisticRegression
 
 from ethicml.algorithms.inprocess.svm import select_svm
@@ -24,8 +27,8 @@ class AgarwalArgs(InAlgoArgs):
     seed: int
 
 
-def train_and_predict(train: DataTuple, test: TestTuple, args: AgarwalArgs) -> pd.DataFrame:
-    """Train a logistic regression model and compute predictions on the given test data."""
+def fit(train: DataTuple, args: AgarwalArgs):
+    """Fit a model."""
     try:
         from fairlearn.reductions import (
             ConditionalSelectionRate,
@@ -36,9 +39,6 @@ def train_and_predict(train: DataTuple, test: TestTuple, args: AgarwalArgs) -> p
     except ImportError as e:
         raise RuntimeError("In order to use Agarwal, install fairlearn.") from e
 
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-
     fairness_class: ConditionalSelectionRate
     if args.fairness == "DP":
         fairness_class = DemographicParity()
@@ -62,20 +62,78 @@ def train_and_predict(train: DataTuple, test: TestTuple, args: AgarwalArgs) -> p
     )
 
     exponentiated_gradient.fit(data_x, data_y, sensitive_features=data_a)
 
+    min_class_label = train.y[train.y.columns[0]].min()
+    exponentiated_gradient.min_class_label = min_class_label
+
+    return exponentiated_gradient
+
+
+def predict(exponentiated_gradient, test: TestTuple) -> pd.DataFrame:
+    """Compute predictions on the given test data."""
     randomized_predictions = exponentiated_gradient.predict(test.x)
     preds = pd.DataFrame(randomized_predictions, columns=["preds"])
 
-    min_class_label = train.y[train.y.columns[0]].min()
     if preds["preds"].min() != preds["preds"].max():
-        preds = preds.replace(preds["preds"].min(), min_class_label)
+        preds = preds.replace(preds["preds"].min(), exponentiated_gradient.min_class_label)
     return preds
 
 
+def train_and_predict(train: DataTuple, test: TestTuple, args: AgarwalArgs) -> pd.DataFrame:
+    """Train a logistic regression model and compute predictions on the given test data."""
+    exponentiated_gradient = fit(train, args)
+    return predict(exponentiated_gradient, test)
+
+
+@contextlib.contextmanager
+def working_dir(root: Path) -> Iterator[None]:
+    """Change the working directory to the parent of the given path."""
+    curdir = os.getcwd()
+    os.chdir(root.expanduser().resolve().parent)
+    try:
+        yield
+    finally:
+        os.chdir(curdir)
+
+
 def main() -> None:
     """This function runs the Agarwal model as a standalone program."""
     args: AgarwalArgs = AgarwalArgs().parse_args()
-    train, test = DataTuple.from_npz(Path(args.train)), TestTuple.from_npz(Path(args.test))
-    Prediction(hard=train_and_predict(train, test, args)["preds"]).to_npz(Path(args.predictions))
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    try:
+        import cloudpickle
+
+        # Need to install cloudpickle for now.
diff --git a/ethicml/implementations/beutel.py b/ethicml/implementations/beutel.py
index 286ef1ba..e011b56d 100644
--- a/ethicml/implementations/beutel.py
+++ b/ethicml/implementations/beutel.py
@@ -3,11 +3,13 @@
 # pylint: disable=arguments-differ
 
 import random
+from pathlib import Path
 from typing import Any, List, Optional, Sequence, Tuple
 
 import numpy as np
 import pandas as pd
 import torch
+from joblib import dump, load
 from torch import Tensor, nn
 from torch.autograd import Function
 from torch.optim import Adam
@@ -103,9 +105,7 @@ def make_dataset_and_loader(
     return dataset, dataloader
 
 
-def train_and_transform(
-    train: DataTuple, test: TestTuple, flags: BeutelArgs
-) -> Tuple[DataTuple, TestTuple]:
+def fit(train: DataTuple, flags: BeutelArgs) -> Tuple[DataTuple, nn.Module]:
     """Train the fair autoencoder on the training data and then transform both training and test."""
     set_seed(flags.seed)
@@ -125,11 +125,6 @@ def train_and_transform(
     _, validation_loader = make_dataset_and_loader(validation, flags)
     _, all_train_data_loader = make_dataset_and_loader(train, flags)
 
-    test_data = TestDataset(test)
-    test_loader = torch.utils.data.DataLoader(
-        dataset=test_data, batch_size=flags.batch_size, shuffle=False
-    )
-
     # convert flags to Python objects
     enc_activation = STRING_TO_ACTIVATION_MAP[flags.enc_activation]
     adv_activation = STRING_TO_ACTIVATION_MAP[flags.adv_activation]
@@ -195,7 +190,25 @@ def train_and_transform(
     transformed_train = encode_dataset(enc, all_train_data_loader, train)
     if post_process:
         transformed_train = processor.post(encode_dataset(enc, all_train_data_loader, train))
-    return transformed_train, encode_testset(enc, test_loader, test)
+    return transformed_train, enc
+
+
+def transform(data: TestTuple, enc: torch.nn.Module, flags: BeutelArgs) -> TestTuple:
+    """Transform the test data using the trained autoencoder."""
+    test_data = TestDataset(data)
+    test_loader = torch.utils.data.DataLoader(
+        dataset=test_data, batch_size=flags.batch_size, shuffle=False
+    )
+    return encode_testset(enc, test_loader, data)
+
+
+def train_and_transform(
+    train: DataTuple, test: TestTuple, flags: BeutelArgs
+) -> Tuple[DataTuple, TestTuple]:
+    """Train the fair autoencoder on the training data and then transform both training and test."""
+    transformed_train, enc = fit(train, flags)
+    transformed_test = transform(test, enc, flags)
+    return transformed_train, transformed_test
 
 
 def step(iteration: int, loss: Tensor, optimizer: Adam, scheduler: ExponentialLR) -> None:
@@ -394,8 +407,29 @@ def forward(self, x: Tensor, y: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
 
 def main() -> None:
     """Load data from feather files, pass it to `train_and_transform` and then save the result."""
     args = BeutelArgs().parse_args()
-    train, test = load_data_from_flags(args)
-    save_transformations(train_and_transform(train, test, args), args)
+    if args.mode == "run":
+        assert args.train is not None
+        assert args.new_train is not None
+        assert args.test is not None
+        assert args.new_test is not None
+        train, test = load_data_from_flags(args)
+        save_transformations(train_and_transform(train, test, args), args)
+    elif args.mode == "fit":
+        assert args.model is not None
+        assert args.train is not None
+        assert args.new_train is not None
+        train = DataTuple.from_npz(Path(args.train))
+        transformed_train, enc = fit(train, args)
+        transformed_train.to_npz(Path(args.new_train))
+        dump(enc, Path(args.model))
+    elif args.mode == "transform":
+        assert args.model is not None
+        assert args.test is not None
+        assert args.new_test is not None
+        test = DataTuple.from_npz(Path(args.test))
+        model = load(Path(args.model))
+        transformed_test = transform(test, model, args)
+        transformed_test.to_npz(Path(args.new_test))
 
 
 if __name__ == "__main__":
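# Usage sketch (illustrative, not part of the diff): the Beutel split mirrors
# the in-process one -- fit() trains the encoder once, transform() reuses it on
# any test set, and train_and_transform() is now just the two calls chained.
# Assumes `train`, `test` and `flags` are set up as in main().
transformed_train, enc = fit(train, flags)  # DataTuple plus the trained encoder
new_test = transform(test, enc, flags)      # encode a held-out set with it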
diff --git a/ethicml/implementations/dro_tabular.py b/ethicml/implementations/dro_tabular.py
index 5cbc75e2..1c18d4f7 100644
--- a/ethicml/implementations/dro_tabular.py
+++ b/ethicml/implementations/dro_tabular.py
@@ -4,6 +4,7 @@
 
 import pandas as pd
 import torch
+from joblib import dump, load
 from torch import optim
 from torch.optim.optimizer import Optimizer  # pylint: disable=no-name-in-module
 from torch.utils.data import DataLoader
@@ -50,6 +51,45 @@ def train_model(
     print(f"====> Epoch: {epoch} Average loss: {train_loss / len(train_loader.dataset):.4f}")  # type: ignore[arg-type]
 
 
+def fit(train: DataTuple, args: DroArgs) -> DROClassifier:
+    """Train a network and return the trained model."""
+    # Set up the data
+    set_seed(args.seed)
+    train_data = CustomDataset(train)
+    train_loader = DataLoader(train_data, batch_size=args.batch_size)
+
+    # Build Network
+    model = DROClassifier(
+        in_size=train_data.xdim,
+        out_size=train_data.ydim,
+        network_size=args.network_size,
+        eta=args.eta,
+    ).to("cpu")
+    optimizer = optim.Adam(model.parameters(), lr=1e-3)
+
+    # Run Network
+    for epoch in range(int(args.epochs)):
+        train_model(epoch, model, train_loader, optimizer)
+    return model
+
+
+def predict(model: DROClassifier, test: TestTuple, args: DroArgs) -> SoftPrediction:
+    """Compute predictions on the given test data."""
+    # Set up the data
+    test_data = TestDataset(test)
+    test_loader = DataLoader(test_data, batch_size=args.batch_size)
+
+    # Transform output
+    post_test: List[List[float]] = []
+    model.eval()
+    with torch.no_grad():
+        for _x, _s in test_loader:
+            out = model.forward(_x)
+            post_test += out.data.tolist()
+
+    return SoftPrediction(soft=pd.Series([j for i in post_test for j in i]))
+
+
 def train_and_predict(train: DataTuple, test: TestTuple, args: DroArgs) -> SoftPrediction:
     """Train a network and return predictions."""
     # Set up the data
@@ -87,8 +127,25 @@ def train_and_predict(train: DataTuple, test: TestTuple, args: DroArgs) -> SoftPrediction:
 
 def main() -> None:
     """This function runs the FWD model as a standalone program on tabular data."""
     args = DroArgs().parse_args()
-    train, test = load_data_from_flags(args)
-    train_and_predict(train, test, args).to_npz(Path(args.predictions))
+    if args.mode == "run":
+        assert args.train is not None
+        assert args.test is not None
+        assert args.predictions is not None
+        train, test = load_data_from_flags(args)
+        train_and_predict(train, test, args).to_npz(Path(args.predictions))
+    elif args.mode == "fit":
+        assert args.train is not None
+        assert args.model is not None
+        data = DataTuple.from_npz(Path(args.train))
+        model = fit(data, args)
+        dump(model, Path(args.model))
+    elif args.mode == "predict":
+        assert args.model is not None
+        assert args.predictions is not None
+        assert args.test is not None
+        data = TestTuple.from_npz(Path(args.test))
+        model = load(Path(args.model))
+        predict(model, data, args).to_npz(Path(args.predictions))
 
 
 if __name__ == "__main__":
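# Usage sketch (illustrative, not part of the diff): DRO (and SVM below)
# serialise the whole model directly with joblib; no cloudpickle detour is
# needed. Assumes `train`, `test` and `args` as in main(); the file name is
# illustrative.
from pathlib import Path

from joblib import dump, load

dump(fit(train, args), Path("dro.joblib"))  # what "fit" mode writes
model = load(Path("dro.joblib"))            # what "predict" mode reads
soft = predict(model, test, args)           # a SoftPrediction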
diff --git a/ethicml/implementations/svm.py b/ethicml/implementations/svm.py
index c348187d..e4ba0982 100644
--- a/ethicml/implementations/svm.py
+++ b/ethicml/implementations/svm.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 
 import numpy as np
+from joblib import dump, load
 from sklearn.svm import SVC, LinearSVC
 
 from .utils import InAlgoArgs
@@ -18,20 +19,48 @@ class SvmArgs(InAlgoArgs):
 def main() -> None:
     """This function runs the SVM model as a standalone program."""
     args = SvmArgs().parse_args()
-    with open(args.train, "rb") as train_file:
-        train = np.load(train_file)
-        train_x, train_y = train["x"], train["y"]
-    with open(args.test, "rb") as test_file:
-        test = np.load(test_file)
-        test_x = test["x"]
-    random_state = np.random.RandomState(seed=args.seed)
-    if args.kernel == "linear":
-        clf = LinearSVC(C=args.c, dual=False, tol=1e-12, random_state=random_state)
-    else:
-        clf = SVC(C=args.c, kernel=args.kernel, gamma="auto", random_state=random_state)
-    clf.fit(train_x, train_y.ravel())
-    predictions = clf.predict(test_x)
-    np.savez(Path(args.predictions), hard=predictions)
+
+    if args.mode == "run":
+        assert args.train is not None
+        assert args.test is not None
+        assert args.predictions is not None
+        with open(args.train, "rb") as train_file:
+            train = np.load(train_file)
+            train_x, train_y = train["x"], train["y"]
+        with open(args.test, "rb") as test_file:
+            test = np.load(test_file)
+            test_x = test["x"]
+        random_state = np.random.RandomState(seed=args.seed)
+        if args.kernel == "linear":
+            clf = LinearSVC(C=args.c, dual=False, tol=1e-12, random_state=random_state)
+        else:
+            clf = SVC(C=args.c, kernel=args.kernel, gamma="auto", random_state=random_state)
+        clf.fit(train_x, train_y.ravel())
+        predictions = clf.predict(test_x)
+        np.savez(Path(args.predictions), hard=predictions)
+    elif args.mode == "fit":
+        assert args.train is not None
+        assert args.model is not None
+        with open(args.train, "rb") as train_file:
+            train = np.load(train_file)
+            train_x, train_y = train["x"], train["y"]
+        random_state = np.random.RandomState(seed=args.seed)
+        if args.kernel == "linear":
+            clf = LinearSVC(C=args.c, dual=False, tol=1e-12, random_state=random_state)
+        else:
+            clf = SVC(C=args.c, kernel=args.kernel, gamma="auto", random_state=random_state)
+        clf.fit(train_x, train_y.ravel())
+        dump(clf, Path(args.model))
+    elif args.mode == "predict":
+        assert args.model is not None
+        assert args.predictions is not None
+        assert args.test is not None
+        clf = load(Path(args.model))
+        with open(args.test, "rb") as test_file:
+            test = np.load(test_file)
+            test_x = test["x"]
+        predictions = clf.predict(test_x)
+        np.savez(Path(args.predictions), hard=predictions)
 
 
 if __name__ == "__main__":
diff --git a/ethicml/implementations/utils.py b/ethicml/implementations/utils.py
index 67cc74a2..ea1c6f47 100644
--- a/ethicml/implementations/utils.py
+++ b/ethicml/implementations/utils.py
@@ -1,6 +1,6 @@
 """Useful functions used in implementations."""
 from pathlib import Path
-from typing import Tuple
+from typing import Optional, Tuple
 
 import tap
 
@@ -11,8 +11,10 @@ class AlgoArgs(tap.Tap):
     """Base arguments needed for all algorithms."""
 
     # paths to the files with the data
-    train: str
-    test: str
+    train: Optional[str] = None
+    test: Optional[str] = None
+    mode: str
+    model: Optional[str] = None
 
 
 class PreAlgoArgs(AlgoArgs):
@@ -23,15 +25,15 @@ class PreAlgoArgs(AlgoArgs):
     """
 
     # paths to where the processed inputs should be stored
-    new_train: str
-    new_test: str
+    new_train: Optional[str] = None
+    new_test: Optional[str] = None
 
 
 class InAlgoArgs(AlgoArgs):
     """ArgumentParser that already has arguments for the paths needed for InAlgorithms."""
 
     # path to where the predictions should be stored
-    predictions: str
+    predictions: Optional[str] = None
 
 
 def load_data_from_flags(args: AlgoArgs) -> Tuple[DataTuple, TestTuple]:
@@ -44,5 +46,7 @@ def save_transformations(transforms: Tuple[DataTuple, TestTuple], args: PreAlgoArgs) -> None:
     train, test = transforms
     assert isinstance(train, DataTuple)
     assert isinstance(test, TestTuple)
+    assert args.new_train is not None
+    assert args.new_test is not None
     train.to_npz(Path(args.new_train))
     test.to_npz(Path(args.new_test))
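# Usage sketch (illustrative, not part of the diff): with the now-Optional
# path fields, which flags are required depends on --mode, and each script
# asserts the combination it needs. A smoke test of the contract using tap's
# explicit-argv parsing; file names are illustrative.
from ethicml.implementations.utils import InAlgoArgs

args = InAlgoArgs().parse_args(
    ["--mode", "fit", "--train", "train.npz", "--model", "model.joblib"]
)
assert args.mode == "fit" and args.test is None and args.predictions is None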
- """ - set_seed(flags.seed) +def fit(train, flags): + """Train the model.""" dataset = get_dataset_obj_by_name(flags.dataset)() # Set up the data train_data = CustomDataset(train) train_loader = DataLoader(train_data, batch_size=flags.batch_size) - test_data = TestDataset(test) - test_loader = DataLoader(test_data, batch_size=flags.batch_size) - # Build Network model = VFAENetwork( dataset, flags.supervised, train_data.xdim, - latent_dims=50, + latent_dims=flags.latent_dims, z1_enc_size=flags.z1_enc_size, z2_enc_size=flags.z2_enc_size, z1_dec_size=flags.z1_dec_size, @@ -54,25 +42,56 @@ def train_and_transform( # Run Network for epoch in range(int(flags.epochs)): train_model(epoch, model, train_loader, optimizer, flags) + return model + + +def transform(model: VFAENetwork, dataset: T, flags) -> T: + """Transform the dataset.""" + data: Union[CustomDataset, TestDataset] + if isinstance(dataset, DataTuple): + data = CustomDataset(dataset) + loader = DataLoader(data, batch_size=flags.batch_size, shuffle=False) + elif isinstance(dataset, TestTuple): + data = TestDataset(dataset) + loader = DataLoader(data, batch_size=flags.batch_size, shuffle=False) - # Transform output post_train: List[List[float]] = [] - post_test: List[List[float]] = [] model.eval() with torch.no_grad(): - for _x, _s, _ in train_loader: - z1_mu, z1_logvar = model.encode_z1(_x, _s) - z1 = model.reparameterize(z1_mu, z1_logvar) - post_train += z1.data.tolist() - for _x, _s in test_loader: + for sample in loader: + if isinstance(dataset, DataTuple): + _x, _s, _ = sample + elif isinstance(dataset, TestTuple): + _x, _s = sample z1_mu, z1_logvar = model.encode_z1(_x, _s) - z1 = model.reparameterize(z1_mu, z1_logvar) - post_test += z1.data.tolist() + # z1 = model.reparameterize(z1_mu, z1_logvar) + post_train += z1_mu.data.tolist() - return ( - DataTuple(x=pd.DataFrame(post_train), s=train.s, y=train.y, name=f"VFAE: {train.name}"), - TestTuple(x=pd.DataFrame(post_test), s=test.s, name=f"VFAE: {test.name}"), - ) + if isinstance(dataset, DataTuple): + return DataTuple( + x=pd.DataFrame(post_train), s=dataset.s, y=dataset.y, name=f"VFAE: {dataset.name}" + ) + elif isinstance(dataset, TestTuple): + return TestTuple(x=pd.DataFrame(post_train), s=dataset.s, name=f"VFAE: {dataset.name}") + + +def train_and_transform( + train: DataTuple, test: TestTuple, flags: VfaeArgs +) -> Tuple[DataTuple, TestTuple]: + """Train the model and transform the dataset. + + Args: + train: + test: + flags: + + Returns: + Tuple of Encoded Train Dataset and Test Dataset. 
+ """ + model = fit(train, flags) + + # Transform output + return transform(model, train, flags), transform(model, test, flags) def train_model( @@ -133,8 +152,31 @@ def train_model( def main() -> None: """Main method to run model.""" args = VfaeArgs(explicit_bool=True).parse_args() - train, test = load_data_from_flags(args) - save_transformations(train_and_transform(train, test, args), args) + set_seed(args.seed) + if args.mode == "run": + assert args.train is not None + assert args.new_train is not None + assert args.test is not None + assert args.new_test is not None + train, test = load_data_from_flags(args) + save_transformations(train_and_transform(train, test, args), args) + elif args.mode == "fit": + assert args.model is not None + assert args.train is not None + assert args.new_train is not None + train = DataTuple.from_npz(Path(args.train)) + enc = fit(train, args) + transformed_train = transform(enc, train, args) + transformed_train.to_npz(Path(args.new_train)) + dump(enc, Path(args.model)) + elif args.mode == "transform": + assert args.model is not None + assert args.test is not None + assert args.new_test is not None + test = DataTuple.from_npz(Path(args.test)) + model = load(Path(args.model)) + transformed_test = transform(model, test, args) + transformed_test.to_npz(Path(args.new_test)) if __name__ == "__main__": diff --git a/ethicml/implementations/vfae_modules/utils.py b/ethicml/implementations/vfae_modules/utils.py index 70075bb9..059a79cb 100644 --- a/ethicml/implementations/vfae_modules/utils.py +++ b/ethicml/implementations/vfae_modules/utils.py @@ -21,6 +21,7 @@ class VfaeArgs(PreAlgoArgs): batch_size: int epochs: int dataset: str + latent_dims: int z1_enc_size: List[int] z2_enc_size: List[int] z1_dec_size: List[int] diff --git a/ethicml/implementations/zemel.py b/ethicml/implementations/zemel.py index e0197e00..05505389 100644 --- a/ethicml/implementations/zemel.py +++ b/ethicml/implementations/zemel.py @@ -1,16 +1,26 @@ """Zemel algorithm.""" -from typing import Tuple +from pathlib import Path +from typing import NamedTuple, Tuple import numpy as np import pandas as pd import scipy.optimize as optim +from joblib import dump, load from scipy.spatial.distance import cdist from scipy.special import softmax +from ethicml.algorithms.preprocess.pre_algorithm import T from ethicml.implementations.utils import PreAlgoArgs, load_data_from_flags, save_transformations from ethicml.utility import DataTuple, TestTuple +class Model(NamedTuple): + """Model.""" + + prototypes: np.ndarray + w: np.ndarray + + class ZemelArgs(PreAlgoArgs): """Arguments for the Zemel algorithm.""" @@ -89,6 +99,38 @@ def get_xhat_y_hat( def train_and_transform( train: DataTuple, test: TestTuple, flags: ZemelArgs ) -> (Tuple[DataTuple, TestTuple]): + """Train and transform.""" + prototypes, w = fit(train, flags) + sens_col = train.s.columns[0] + + training_sensitive = train.x.loc[train.s[sens_col] == 0].to_numpy() + training_nonsensitive = train.x.loc[train.s[sens_col] == 1].to_numpy() + + testing_sensitive = test.x.loc[test.s[sens_col] == 0].to_numpy() + testing_nonsensitive = test.x.loc[test.s[sens_col] == 1].to_numpy() + + train_transformed = trans(prototypes, w, training_nonsensitive, training_sensitive, train) + test_transformed = trans(prototypes, w, testing_nonsensitive, testing_sensitive, test) + + return ( + DataTuple(x=train_transformed, s=train.s, y=train.y, name=train.name), + TestTuple(x=test_transformed, s=test.s, name=test.name), + ) + + +def transform(data: T, prototypes: np.ndarray, w: 
+
+
+def transform(data: T, prototypes: np.ndarray, w: np.ndarray) -> T:
+    """Transform the given data with the learned prototypes and weights."""
+    sens_col = data.s.columns[0]
+    data_sens = data.x.loc[data.s[sens_col] == 0].to_numpy()
+    data_nons = data.x.loc[data.s[sens_col] == 1].to_numpy()
+    transformed = trans(prototypes, w, data_nons, data_sens, data)
+    if isinstance(data, DataTuple):
+        return DataTuple(x=transformed, s=data.s, y=data.y, name=data.name)
+    elif isinstance(data, TestTuple):
+        return TestTuple(x=transformed, s=data.s, name=data.name)
+
+
+def fit(train: DataTuple, flags: ZemelArgs) -> Model:
     """Train the Zemel model and return the transformed features of the train and test sets."""
 
     np.random.seed(flags.seed)
@@ -107,7 +149,9 @@ def train_and_transform(
     parameters_initialization = np.random.uniform(
         size=flags.clusters + features_dim * flags.clusters
     )
-    bnd = [(0, 1)] * flags.clusters + [(None, None)] * features_dim * flags.clusters  # type: ignore[operator]
+    bnd = [(0, 1)] * flags.clusters + [
+        (None, None)
+    ] * features_dim * flags.clusters  # type: ignore[operator]
     LFR_optim_objective.steps = 0  # type: ignore[attr-defined]
 
     learned_model = optim.fmin_l_bfgs_b(
@@ -135,16 +179,7 @@ def train_and_transform(
     w = learned_model[: flags.clusters]
     prototypes = learned_model[flags.clusters :].reshape((flags.clusters, features_dim))
 
-    testing_sensitive = test.x.loc[test.s[sens_col] == 0].to_numpy()
-    testing_nonsensitive = test.x.loc[test.s[sens_col] == 1].to_numpy()
-
-    train_transformed = trans(prototypes, w, training_nonsensitive, training_sensitive, train)
-    test_transformed = trans(prototypes, w, testing_nonsensitive, testing_sensitive, test)
-
-    return (
-        DataTuple(x=train_transformed, s=train.s, y=train.y, name=train.name),
-        TestTuple(x=test_transformed, s=test.s, name=test.name),
-    )
+    return Model(prototypes=prototypes, w=w)
 
 
 def trans(
@@ -183,9 +218,36 @@ def main() -> None:
     """
     args = ZemelArgs()
     args.parse_args()
-
-    train, test = load_data_from_flags(args)
-    save_transformations(train_and_transform(train, test, args), args)
+    if args.mode == "run":
+        assert args.train is not None
+        assert args.new_train is not None
+        assert args.test is not None
+        assert args.new_test is not None
+        train, test = load_data_from_flags(args)
+        save_transformations(train_and_transform(train, test, args), args)
+    elif args.mode == "fit":
+        assert args.model is not None
+        assert args.train is not None
+        assert args.new_train is not None
+        train = DataTuple.from_npz(Path(args.train))
+        model = fit(train, args)
+        sens_col = train.s.columns[0]
+        training_sensitive = train.x.loc[train.s[sens_col] == 0].to_numpy()
+        training_nonsensitive = train.x.loc[train.s[sens_col] == 1].to_numpy()
+        train_transformed = trans(
+            model.prototypes, model.w, training_nonsensitive, training_sensitive, train
+        )
+        data = DataTuple(x=train_transformed, s=train.s, y=train.y, name=train.name)
+        data.to_npz(Path(args.new_train))
+        dump(model, Path(args.model))
+    elif args.mode == "transform":
+        assert args.model is not None
+        assert args.test is not None
+        assert args.new_test is not None
+        test = DataTuple.from_npz(Path(args.test))
+        model = load(Path(args.model))
+        transformed_test = transform(test, model.prototypes, model.w)
+        transformed_test.to_npz(Path(args.new_test))
 
 
 if __name__ == "__main__":
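# Usage sketch (illustrative, not part of the diff): the Model NamedTuple
# bundles the two arrays the Zemel/LFR optimiser learns, which makes the
# joblib round trip a one-liner each way. Assumes `train`, `test` and `flags`
# as in main(); the file name is illustrative.
from pathlib import Path

from joblib import dump, load

model = fit(train, flags)  # Model(prototypes=..., w=...)
dump(model, Path("zemel.joblib"))
restored = load(Path("zemel.joblib"))
new_test = transform(test, restored.prototypes, restored.w)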
"cloudpickle" +version = "2.0.0" +description = "Extended pickling support for Python objects" +category = "main" +optional = true +python-versions = ">=3.6" + [[package]] name = "colorama" version = "0.4.4" @@ -704,7 +712,7 @@ python-versions = "*" [[package]] name = "mypy" -version = "0.920" +version = "0.921" description = "Optional static typing for Python" category = "dev" optional = false @@ -1631,13 +1639,13 @@ docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] testing = ["pytest (>=4.6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy"] [extras] -all = ["fairlearn"] -ci = ["fairlearn", "pytest", "pytest-cov", "torch", "torchvision"] +all = ["fairlearn", "cloudpickle"] +ci = ["fairlearn", "pytest", "pytest-cov", "torch", "torchvision", "cloudpickle"] [metadata] lock-version = "1.1" python-versions = ">=3.7.1,<4.0" -content-hash = "68ff268ebe9a26f6cc78eecd9d4048df48ee93b16a407d97a784c2dbc7273970" +content-hash = "0f1afc0b35c633f00ff203d947876a9102f43772be188af6c768a6d5fb834368" [metadata.files] appdirs = [ @@ -1774,6 +1782,10 @@ click = [ {file = "click-8.0.3-py3-none-any.whl", hash = "sha256:353f466495adaeb40b6b5f592f9f91cb22372351c84caeb068132442a4518ef3"}, {file = "click-8.0.3.tar.gz", hash = "sha256:410e932b050f5eed773c4cda94de75971c89cdb3155a72a0831139a79e5ecb5b"}, ] +cloudpickle = [ + {file = "cloudpickle-2.0.0-py3-none-any.whl", hash = "sha256:6b2df9741d06f43839a3275c4e6632f7df6487a1f181f5f46a052d3c917c3d11"}, + {file = "cloudpickle-2.0.0.tar.gz", hash = "sha256:5cd02f3b417a783ba84a4ec3e290ff7929009fe51f6405423cfccfadd43ba4a4"}, +] colorama = [ {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, @@ -2140,26 +2152,26 @@ mistune = [ {file = "mistune-0.8.4.tar.gz", hash = "sha256:59a3429db53c50b5c6bcc8a07f8848cb00d7dc8bdb431a4ab41920d201d4756e"}, ] mypy = [ - {file = "mypy-0.920-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:41f3575b20714171c832d8f6c7aaaa0d499c9a2d1b8adaaf837b4c9065c38540"}, - {file = "mypy-0.920-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:431be889ffc8d9681813a45575c42e341c19467cbfa6dd09bf41467631feb530"}, - {file = "mypy-0.920-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f8b2059f73878e92eff7ed11a03515d6572f4338a882dd7547b5f7dd242118e6"}, - {file = "mypy-0.920-cp310-cp310-win_amd64.whl", hash = "sha256:9cd316e9705555ca6a50670ba5fb0084d756d1d8cb1697c83820b1456b0bc5f3"}, - {file = "mypy-0.920-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:e091fe58b4475b3504dc7c3022ff7f4af2f9e9ddf7182047111759ed0973bbde"}, - {file = "mypy-0.920-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98b4f91a75fed2e4c6339e9047aba95968d3a7c4b91e92ab9dc62c0c583564f4"}, - {file = "mypy-0.920-cp36-cp36m-win_amd64.whl", hash = "sha256:562a0e335222d5bbf5162b554c3afe3745b495d67c7fe6f8b0d1b5bace0c1eeb"}, - {file = "mypy-0.920-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:618e677aabd21f30670bffb39a885a967337f5b112c6fb7c79375e6dced605d6"}, - {file = "mypy-0.920-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = 
"sha256:40cb062f1b7ff4cd6e897a89d8ddc48c6ad7f326b5277c93a8c559564cc1551c"}, - {file = "mypy-0.920-cp37-cp37m-win_amd64.whl", hash = "sha256:69b5a835b12fdbfeed84ef31152d41343d32ccb2b345256d8682324409164330"}, - {file = "mypy-0.920-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:993c2e52ea9570e6e872296c046c946377b9f5e89eeb7afea2a1524cf6e50b27"}, - {file = "mypy-0.920-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:df0fec878ccfcb2d1d2306ba31aa757848f681e7bbed443318d9bbd4b0d0fe9a"}, - {file = "mypy-0.920-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:331a81d2c9bf1be25317260a073b41f4584cd11701a7c14facef0aa5a005e843"}, - {file = "mypy-0.920-cp38-cp38-win_amd64.whl", hash = "sha256:ffb1e57ec49a30e3c0ebcfdc910ae4aceb7afb649310b7355509df6b15bd75f6"}, - {file = "mypy-0.920-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:31895b0b3060baf15bf76e789d94722c026f673b34b774bba9e8772295edccff"}, - {file = "mypy-0.920-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:140174e872d20d4768124a089b9f9fc83abd6a349b7f8cc6276bc344eb598922"}, - {file = "mypy-0.920-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:13b3c110309b53f5a62aa1b360f598124be33a42563b790a2a9efaacac99f1fc"}, - {file = "mypy-0.920-cp39-cp39-win_amd64.whl", hash = "sha256:82e6c15675264e923b60a11d6eb8f90665504352e68edfbb4a79aac7a04caddd"}, - {file = "mypy-0.920-py3-none-any.whl", hash = "sha256:71c77bd885d2ce44900731d4652d0d1c174dc66a0f11200e0c680bdedf1a6b37"}, - {file = "mypy-0.920.tar.gz", hash = "sha256:a55438627f5f546192f13255a994d6d1cf2659df48adcf966132b4379fd9c86b"}, + {file = "mypy-0.921-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d9d6a9c35ac1e5d89d9f71f60d4932dfba00b8d2cb0ba758293f0214c851d2c0"}, + {file = "mypy-0.921-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:01ff922b9fa13f451ce51f7b707c97e35b5dd6ad0104a83d598306255cc7f990"}, + {file = "mypy-0.921-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:279d87385acc33d4117612002026d09ef039845dee2cab41d2cca38ca63a72b3"}, + {file = "mypy-0.921-cp310-cp310-win_amd64.whl", hash = "sha256:f4688e06b2bbb9708eda50bf119abf072833687ca25c11caf84371fb44722b8a"}, + {file = "mypy-0.921-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:54bfe651425cc0935e056327c8f0da749015d64e1586601a9350363f4a3a7794"}, + {file = "mypy-0.921-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:aadc06bffbe00c285771056e5b0364bc3e0a814e3a08d2cc64f4b12ea40bc283"}, + {file = "mypy-0.921-cp36-cp36m-win_amd64.whl", hash = "sha256:49e528bf13d54a4cbb163fc7532ae220edf0b1bb79070481c77a0c83cc4e36ce"}, + {file = "mypy-0.921-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:1952b1c8e84eb03375b5e339295a96b92dd5b865d2a9768431c9c5aa58f8d32b"}, + {file = "mypy-0.921-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2d816a6e2114c473181e0df3013decb9a02acbc57d45454357a05258acd528a3"}, + {file = "mypy-0.921-cp37-cp37m-win_amd64.whl", hash = "sha256:777fc39141b8a4154c61cc6dc0315b25832b8b6efe5a2bef1dba66d5544341d4"}, + {file = "mypy-0.921-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8fcad97e6be583c7de2d18304581dc7f8c42ce4950df5d56005bd3efd53e9ef9"}, + {file = "mypy-0.921-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:71c193bc6dc1b2f183b59f6473a13e627885751d9e534fd26bf15bc8eeed8772"}, + {file = 
"mypy-0.921-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b64e64fb6092c86239a7b10437c8e0b9b013e704ecdf8bdfaa8d80dbd7ba2a73"}, + {file = "mypy-0.921-cp38-cp38-win_amd64.whl", hash = "sha256:02aca528afcb965ea7bf2bc5fbe5736225b5786e135d64cce5075e3bc8b785a4"}, + {file = "mypy-0.921-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:59f6280e3cbb961b7a9957b6e1739c60fd027743c5ec4d3636f1ae24d5249528"}, + {file = "mypy-0.921-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:549557f7dc7ddd45ca08df0944b7f6519a0e23e6336ef3ff260a4e100fe1ccb3"}, + {file = "mypy-0.921-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b6f6bc11222b61fa805371f18d70f1546f5ca26db5eda8ad9a75364460bd17a0"}, + {file = "mypy-0.921-cp39-cp39-win_amd64.whl", hash = "sha256:8c2cff600d34ea8f3426a470e0ea75bd35c75269f6df69a9320c99b4e92edca4"}, + {file = "mypy-0.921-py3-none-any.whl", hash = "sha256:6e57f340ea04a6f7c67c7757e573bc61c2cc096f87ebd829d7c3264dedc0bc54"}, + {file = "mypy-0.921.tar.gz", hash = "sha256:eca089d7053dff45d6dcd5bf67f1cabc311591e85d378917d97363e7c13da088"}, ] mypy-extensions = [ {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"}, diff --git a/pyproject.toml b/pyproject.toml index 75409ad3..c5d83842 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ tqdm = ">=4.31.1" typed-argument-parser = "1.4" typing-extensions = ">=3.7.2" fairlearn = { version = "0.4.6", optional = true } +cloudpickle = { version = "^2.0.0", optional= true } pytest = { version = "^6.0.0", optional = true } pytest-cov = { version = "^2.6.0", optional = true } torch = { version = "^1.8", optional = true } @@ -45,8 +46,8 @@ folktables = "^0.0.11" ranzen = "^1.1.1" [tool.poetry.extras] -ci = ["fairlearn","pytest","pytest-cov","torch","torchvision"] -all= ["fairlearn"] +ci = ["fairlearn","pytest","pytest-cov","torch","torchvision","cloudpickle"] +all= ["fairlearn","cloudpickle"] [tool.poetry.dev-dependencies] @@ -141,6 +142,7 @@ module = [ "black.*", "fairlearn.*", "imageio", + "joblib", "pandas.testing", "pylint.*", "pytest.*", diff --git a/tests/loading_data_test.py b/tests/loading_data_test.py index 6fde83f2..7e1ea2af 100644 --- a/tests/loading_data_test.py +++ b/tests/loading_data_test.py @@ -502,9 +502,7 @@ def idfn(val: DT): sum_y=231, ), DT( - dataset=em.acs_income( - root=Path("~/Data"), year="2018", horizon=1, survey="person", states=["AL"] - ), + dataset=em.acs_income(root=Path("~/Data"), year="2018", horizon=1, states=["AL"]), samples=22_268, x_features=45, discrete_features=40, @@ -512,14 +510,25 @@ def idfn(val: DT): num_sens=2, y_features=1, num_labels=2, - name="ACS_Income_2018_person_1_AL_Sex", + name="ACS_Income_2018_1_AL_Sex", sum_s=11_622, sum_y=6_924, ), DT( - dataset=em.acs_income( - root=Path("~/Data"), year="2018", horizon=1, survey="person", states=["AL", "PA"] - ), + dataset=em.acs_income(root=Path("~/Data"), year="2018", horizon=1, states=["PA"]), + samples=68_308, + x_features=45, + discrete_features=40, + s_features=1, + num_sens=2, + y_features=1, + num_labels=2, + name="ACS_Income_2018_1_PA_Sex", + sum_s=35_480, + sum_y=24_385, + ), + DT( + dataset=em.acs_income(root=Path("~/Data"), year="2018", horizon=1, states=["AL", "PA"]), samples=90_576, x_features=45, discrete_features=40, @@ -527,18 +536,13 @@ def idfn(val: DT): num_sens=2, y_features=1, num_labels=2, - 
name="ACS_Income_2018_person_1_AL_PA_Sex", + name="ACS_Income_2018_1_AL_PA_Sex", sum_s=47_102, sum_y=31_309, ), DT( dataset=em.acs_income( - root=Path("~/Data"), - year="2018", - horizon=1, - survey="person", - states=["AL"], - split="Race", + root=Path("~/Data"), year="2018", horizon=1, states=["AL"], split="Race" ), samples=22_268, x_features=38, @@ -547,18 +551,13 @@ def idfn(val: DT): num_sens=9, y_features=1, num_labels=2, - name="ACS_Income_2018_person_1_AL_Race", + name="ACS_Income_2018_1_AL_Race", sum_s=9_947, sum_y=6_924, ), DT( dataset=em.acs_income( - root=Path("~/Data"), - year="2018", - horizon=1, - survey="person", - states=["AL"], - split="Sex-Race", + root=Path("~/Data"), year="2018", horizon=1, states=["AL"], split="Sex-Race" ), samples=22_268, x_features=36, @@ -567,7 +566,7 @@ def idfn(val: DT): num_sens=17, y_features=1, num_labels=2, - name="ACS_Income_2018_person_1_AL_Sex-Race", + name="ACS_Income_2018_1_AL_Sex-Race", sum_s=31_516, sum_y=6_924, ), diff --git a/tests/models_test/inprocess_test/models_inprocessing_test.py b/tests/models_test/inprocess_test/models_inprocessing_test.py index 54f77a0d..b8c2fa37 100644 --- a/tests/models_test/inprocess_test/models_inprocessing_test.py +++ b/tests/models_test/inprocess_test/models_inprocessing_test.py @@ -1,9 +1,8 @@ """EthicML Tests.""" import sys from pathlib import Path -from typing import Dict, List, NamedTuple, Tuple +from typing import Dict, List, NamedTuple -import pandas as pd import pytest from pytest import approx @@ -33,6 +32,9 @@ Prediction, SoftPrediction, TrainTestPair, + ZafarAccuracy, + ZafarBaseline, + ZafarFairness, compas, evaluate_models_async, load_data, @@ -42,6 +44,7 @@ toy, train_test_split, ) +from ethicml.algorithms.inprocess.shared import flag_interface from tests.run_algorithm_test import count_true @@ -54,17 +57,40 @@ class InprocessTest(NamedTuple): INPROCESS_TESTS = [ + InprocessTest(name="Agarwal, LR, DP", model=Agarwal(dir='/tmp'), num_pos=45), + InprocessTest(name="Agarwal, LR, EqOd", model=Agarwal(dir='/tmp', fairness="EqOd"), num_pos=44), + InprocessTest(name="Agarwal, SVM, DP", model=Agarwal(dir='/tmp', classifier="SVM"), num_pos=45), + InprocessTest( + name="Agarwal, SVM, DP", + model=Agarwal(dir='/tmp', classifier="SVM", kernel="linear"), + num_pos=42, + ), + InprocessTest( + name="Agarwal, SVM, EqOd", + model=Agarwal(dir='/tmp', classifier="SVM", fairness="EqOd"), + num_pos=45, + ), + InprocessTest( + name="Agarwal, SVM, EqOd", + model=Agarwal(dir='/tmp', classifier="SVM", fairness="EqOd", kernel="linear"), + num_pos=42, + ), InprocessTest(name="Blind", model=Blind(), num_pos=48), InprocessTest(name="DemPar. 
Oracle", model=DPOracle(), num_pos=53), - InprocessTest(name="Dist Robust Optim", model=DRO(eta=0.5), num_pos=45), - InprocessTest(name="Dist Robust Optim", model=DRO(eta=5.0), num_pos=59), + InprocessTest(name="Dist Robust Optim", model=DRO(eta=0.5, dir="/tmp"), num_pos=45), + InprocessTest(name="Dist Robust Optim", model=DRO(eta=5.0, dir="/tmp"), num_pos=59), + InprocessTest(name="Kamiran & Calders LR", model=Kamiran(), num_pos=44), InprocessTest(name="Logistic Regression (C=1.0)", model=LR(), num_pos=44), + InprocessTest(name="Logistic Regression Prob (C=1.0)", model=LRProb(), num_pos=44), InprocessTest(name="LRCV", model=LRCV(), num_pos=40), InprocessTest(name="Majority", model=Majority(), num_pos=80), InprocessTest(name="MLP", model=MLP(), num_pos=43), InprocessTest(name="Oracle", model=Oracle(), num_pos=41), InprocessTest(name="SVM", model=SVM(), num_pos=45), InprocessTest(name="SVM (linear)", model=SVM(kernel="linear"), num_pos=41), + # InprocessTest(name="Zafar", model=ZafarAccuracy(), num_pos=41), + # InprocessTest(name="Zafar", model=ZafarBaseline(), num_pos=41), + # InprocessTest(name="Zafar", model=ZafarFairness(), num_pos=41), ] @@ -82,6 +108,23 @@ def test_inprocess(toy_train_test: TrainTestPair, name: str, model: InAlgorithm, assert count_true(predictions.hard.values == 0) == len(predictions) - num_pos +@pytest.mark.parametrize("name,model,num_pos", INPROCESS_TESTS) +def test_inprocess_sep_train_pred( + toy_train_test: TrainTestPair, name: str, model: InAlgorithm, num_pos: int +): + """Test an inprocess model with distinct train and predict steps.""" + train, test = toy_train_test + + assert isinstance(model, InAlgorithm) + assert model is not None + assert model.name == name + + model = model.fit(train) + predictions: Prediction = model.predict(test) + assert count_true(predictions.hard.values == 1) == num_pos + assert count_true(predictions.hard.values == 0) == len(predictions) - num_pos + + def test_corels(toy_train_test: TrainTestPair) -> None: """Test corels.""" model: InAlgorithm = Corels() @@ -155,7 +198,7 @@ def __init__(self): name="local installed LR", dir_name="../..", top_dir="", executable=sys.executable ) - def _script_command( + def _run_script_command( self, train_path: Path, test_path: Path, pred_path: Path ) -> (List[str]): script = str((Path(__file__).parent.parent.parent / "local_installed_lr.py").resolve()) @@ -166,6 +209,18 @@ def _script_command( str(pred_path), ] + def _fit_script_command(self, train_path: Path, model_path: Path) -> List[str]: + script = str((Path(__file__).parent.parent.parent / "local_installed_lr.py").resolve()) + args = flag_interface(train_path=train_path, model_path=model_path) + return [script, args] + + def _predict_script_command( + self, model_path: Path, test_path: Path, pred_path: Path + ) -> List[str]: + script = str((Path(__file__).parent.parent.parent / "local_installed_lr.py").resolve()) + args = flag_interface(model_path=model_path, test_path=test_path, pred_path=pred_path) + return [script, args] + model: InAlgorithm = _LocalInstalledLR() assert model is not None assert model.name == "local installed LR" @@ -176,54 +231,9 @@ def _script_command( assert count_true(predictions.hard.values == 0) == len(predictions) - expected_num_pos -def test_agarwal(toy_train_test: TrainTestPair): - """Test agarwal.""" - train, test = toy_train_test - - agarwal_variants: List[InAlgorithmAsync] = [] - model_names: List[str] = [] - expected_results: List[Tuple[int, int]] = [] - - agarwal_variants.append(Agarwal()) - 
model_names.append("Agarwal, LR, DP") - expected_results.append((45, 35)) - - agarwal_variants.append(Agarwal(fairness="EqOd")) - model_names.append("Agarwal, LR, EqOd") - expected_results.append((44, 36)) - - agarwal_variants.append(Agarwal(classifier="SVM")) - model_names.append("Agarwal, SVM, DP") - expected_results.append((45, 35)) - - agarwal_variants.append(Agarwal(classifier="SVM", kernel="linear")) - model_names.append("Agarwal, SVM, DP") - expected_results.append((42, 38)) - - agarwal_variants.append(Agarwal(classifier="SVM", fairness="EqOd")) - model_names.append("Agarwal, SVM, EqOd") - expected_results.append((45, 35)) - - agarwal_variants.append(Agarwal(classifier="SVM", fairness="EqOd", kernel="linear")) - model_names.append("Agarwal, SVM, EqOd") - expected_results.append((42, 38)) - - results = run_blocking( - run_in_parallel(agarwal_variants, [TrainTestPair(train, test)], max_parallel=1) - ) - - for model, results_for_model, model_name, (pred_true, pred_false) in zip( - agarwal_variants, results, model_names, expected_results - ): - assert model.name == model_name - print(model.name) - assert count_true(results_for_model[0].hard.to_numpy() == 1) == pred_true, model_name - assert count_true(results_for_model[0].hard.to_numpy() == 0) == pred_false, model_name - - def test_threaded_agarwal(): """Test threaded agarwal.""" - models: List[InAlgorithmAsync] = [Agarwal(classifier="SVM", fairness="EqOd")] + models: List[InAlgorithmAsync] = [Agarwal(dir='/tmp', classifier="SVM", fairness="EqOd")] class AssertResult(Metric): _name = "assert_result" @@ -240,36 +250,3 @@ def score(self, prediction, actual) -> float: ) ) assert results["assert_result"].iloc[0] == 0.0 - - -def test_lr_prob(toy_train_test: TrainTestPair): - """Test lr prob.""" - train, test = toy_train_test - - model: LRProb = LRProb() - assert model.name == "Logistic Regression Prob (C=1.0)" - - heavi = Heaviside() - - predictions: SoftPrediction = model.run(train, test) - hard_predictions = pd.Series(heavi.apply(predictions.soft.to_numpy())) - pd.testing.assert_series_equal(hard_predictions, predictions.hard) - assert hard_predictions.values[hard_predictions.values == 1].shape[0] == 44 - assert hard_predictions.values[hard_predictions.values == 0].shape[0] == 36 - - -def test_kamiran(toy_train_test: TrainTestPair): - """Test kamiran.""" - train, test = toy_train_test - - kamiran_model: InAlgorithm = Kamiran() - assert kamiran_model is not None - assert kamiran_model.name == "Kamiran & Calders LR" - - predictions: Prediction = kamiran_model.run(train, test) - assert predictions.hard.values[predictions.hard.values == 1].shape[0] == 44 - assert predictions.hard.values[predictions.hard.values == 0].shape[0] == 36 - - # remove all samples with s=0 & y=1 from the data - train_no_s0y1 = query_dt(train, "`sensitive-attr` != 0 | decision != 1") - predictions = kamiran_model.run(train_no_s0y1, test) diff --git a/tests/models_test/inprocess_test/threaded_test.py b/tests/models_test/inprocess_test/threaded_test.py index 55f62b7f..bea906b3 100644 --- a/tests/models_test/inprocess_test/threaded_test.py +++ b/tests/models_test/inprocess_test/threaded_test.py @@ -1,3 +1,4 @@ +"""Tests for hreaded models.""" from typing import NamedTuple import pytest @@ -15,12 +16,12 @@ class ThreadedParams(NamedTuple): num_pos: int -THREADED_PARAMS = [ThreadedParams(model=SVMAsync(), name="SVM", num_pos=45)] +THREADED_PARAMS = [ThreadedParams(model=SVMAsync(dir="/tmp"), name="SVM", num_pos=45)] @pytest.mark.parametrize("model,name,num_pos", 
diff --git a/tests/models_test/inprocess_test/threaded_test.py b/tests/models_test/inprocess_test/threaded_test.py
index 55f62b7f..bea906b3 100644
--- a/tests/models_test/inprocess_test/threaded_test.py
+++ b/tests/models_test/inprocess_test/threaded_test.py
@@ -1,3 +1,4 @@
+"""Tests for threaded models."""
 from typing import NamedTuple
 
 import pytest
@@ -15,12 +16,12 @@ class ThreadedParams(NamedTuple):
     num_pos: int
 
 
-THREADED_PARAMS = [ThreadedParams(model=SVMAsync(), name="SVM", num_pos=45)]
+THREADED_PARAMS = [ThreadedParams(model=SVMAsync(dir="/tmp"), name="SVM", num_pos=45)]
 
 
 @pytest.mark.parametrize("model,name,num_pos", THREADED_PARAMS, ids=get_id)
 def test_threaded(toy_train_test: TrainTestPair, model: InAlgorithmAsync, name: str, num_pos: int):
-    """test threaded svm"""
+    """Test threaded svm."""
     train, test = toy_train_test
 
     assert model is not None
@@ -30,3 +31,20 @@ def test_threaded(toy_train_test: TrainTestPair, model: InAlgorithmAsync, name: str, num_pos: int):
     assert predictions.hard.values[predictions.hard.values == 1].shape[0] == num_pos
     num_neg = predictions.hard.values[predictions.hard.values == 0].shape[0]
     assert num_neg == len(predictions) - num_pos
+
+
+@pytest.mark.parametrize("model,name,num_pos", THREADED_PARAMS, ids=get_id)
+def test_threaded_sep(
+    toy_train_test: TrainTestPair, model: InAlgorithmAsync, name: str, num_pos: int
+):
+    """Test threaded svm with separate fit and predict steps."""
+    train, test = toy_train_test
+
+    assert model is not None
+    assert model.name == name
+
+    model = em.run_blocking(model.fit_async(train))
+    predictions: Prediction = em.run_blocking(model.predict_async(test))
+    assert predictions.hard.values[predictions.hard.values == 1].shape[0] == num_pos
+    num_neg = predictions.hard.values[predictions.hard.values == 0].shape[0]
+    assert num_neg == len(predictions) - num_pos
diff --git a/tests/models_test/postprocess_test/models_postprocessing_test.py b/tests/models_test/postprocess_test/models_postprocessing_test.py
index b115f8d6..03c760d9 100644
--- a/tests/models_test/postprocess_test/models_postprocessing_test.py
+++ b/tests/models_test/postprocess_test/models_postprocessing_test.py
@@ -1,14 +1,33 @@
 """EthicML tests."""
+from typing import NamedTuple
+
 import pytest
 
 import ethicml as em
-from ethicml import LR, Hardt, InAlgorithm, PostAlgorithm, Prediction, ProbPos, TrainTestPair
+from ethicml import LR, Hardt, InAlgorithm, PostAlgorithm, Prediction, ProbPos
 from ethicml.algorithms.postprocess.dp_flip import DPFlip
 from ethicml.utility.data_structures import TrainValPair
 from tests.run_algorithm_test import count_true
 
 
-def test_dp_flip(toy_train_test: TrainValPair) -> None:
+class PostprocessTest(NamedTuple):
+    """Define a test for a postprocess model."""
+
+    post_model: PostAlgorithm
+    name: str
+    num_pos: int
+
+
+@pytest.mark.parametrize(
+    "post_model,name,num_pos",
+    [
+        PostprocessTest(post_model=DPFlip(), name="DemPar. Post Process", num_pos=57),
+        PostprocessTest(post_model=Hardt(), name="Hardt", num_pos=35),
+    ],
+)
+def test_post(
+    toy_train_test: TrainValPair, post_model: PostAlgorithm, name: str, num_pos: int
+) -> None:
     """Test the dem par flipping method."""
     train, test = toy_train_test
     train_test = em.concat_tt([train, test], ignore_index=True)
@@ -25,23 +44,30 @@ def test_post(
     assert count_true(pred_test.values == 1) == 44
     assert count_true(pred_test.values == 0) == 36
 
-    post_model: PostAlgorithm = DPFlip()
-    assert post_model.name == "DemPar. Post Process"
Post Process" + assert post_model.name == name fair_preds = post_model.run(Prediction(pred_train), train, Prediction(pred_test), test) - assert count_true(fair_preds.hard.values == 1) == 57 - assert count_true(fair_preds.hard.values == 0) == 23 + assert count_true(fair_preds.hard.values == 1) == num_pos + assert count_true(fair_preds.hard.values == 0) == len(fair_preds) - num_pos diffs = em.diff_per_sensitive_attribute( em.metric_per_sensitive_attribute(fair_preds, test, ProbPos()) ) - for diff in diffs.values(): - assert pytest.approx(diff, abs=1e-2) == 0 - - -def test_dp_flip_inverted_s(toy_train_test: TrainValPair) -> None: + if isinstance(post_model, DPFlip): + for diff in diffs.values(): + assert pytest.approx(diff, abs=1e-2) == 0 + + +@pytest.mark.parametrize( + "post_model,name,num_pos", + [ + PostprocessTest(post_model=DPFlip(), name="DemPar. Post Process", num_pos=57), + PostprocessTest(post_model=Hardt(), name="Hardt", num_pos=35), + ], +) +def test_post_sep_fit_pred( + toy_train_test: TrainValPair, post_model: PostAlgorithm, name: str, num_pos: int +) -> None: """Test the dem par flipping method.""" train, test = toy_train_test - train = train.replace(s=1 - train.s) - test = test.replace(s=1 - test.s) train_test = em.concat_tt([train, test], ignore_index=True) in_model: InAlgorithm = LR() @@ -56,25 +82,24 @@ def test_dp_flip_inverted_s(toy_train_test: TrainValPair) -> None: assert count_true(pred_test.values == 1) == 44 assert count_true(pred_test.values == 0) == 36 - post_model: PostAlgorithm = DPFlip() - assert post_model.name == "DemPar. Post Process" - fair_preds = post_model.run(Prediction(pred_train), train, Prediction(pred_test), test) - assert count_true(fair_preds.hard.values == 1) == 57 - assert count_true(fair_preds.hard.values == 0) == 23 + assert post_model.name == name + fair_model = post_model.fit(Prediction(pred_train), train) + fair_preds = fair_model.predict(Prediction(pred_test), test) + assert count_true(fair_preds.hard.values == 1) == num_pos + assert count_true(fair_preds.hard.values == 0) == len(fair_preds) - num_pos diffs = em.diff_per_sensitive_attribute( em.metric_per_sensitive_attribute(fair_preds, test, ProbPos()) ) - for diff in diffs.values(): - assert pytest.approx(diff, abs=1e-2) == 0 + if isinstance(post_model, DPFlip): + for diff in diffs.values(): + assert pytest.approx(diff, abs=1e-2) == 0 -def test_hardt(toy_train_test: TrainTestPair) -> None: - """Tests the hardt postprocessing technique. - - Args: - toy_train_test: Train-test pair of toy data - """ +def test_dp_flip_inverted_s(toy_train_test: TrainValPair) -> None: + """Test the dem par flipping method.""" train, test = toy_train_test + train = train.replace(s=1 - train.s) + test = test.replace(s=1 - test.s) train_test = em.concat_tt([train, test], ignore_index=True) in_model: InAlgorithm = LR() @@ -85,11 +110,17 @@ def test_hardt(toy_train_test: TrainTestPair) -> None: # seperate out predictions on train set and predictions on test set pred_train = predictions.hard.iloc[: train.y.shape[0]] - pred_test = predictions.hard.iloc[train.y.shape[0] :] + pred_test = predictions.hard.iloc[train.y.shape[0] :].reset_index(drop=True) assert count_true(pred_test.values == 1) == 44 assert count_true(pred_test.values == 0) == 36 - post_model: PostAlgorithm = Hardt() + post_model: PostAlgorithm = DPFlip() + assert post_model.name == "DemPar. 
Post Process" fair_preds = post_model.run(Prediction(pred_train), train, Prediction(pred_test), test) - assert count_true(fair_preds.hard.values == 1) == 35 - assert count_true(fair_preds.hard.values == 0) == 45 + assert count_true(fair_preds.hard.values == 1) == 57 + assert count_true(fair_preds.hard.values == 0) == 23 + diffs = em.diff_per_sensitive_attribute( + em.metric_per_sensitive_attribute(fair_preds, test, ProbPos()) + ) + for diff in diffs.values(): + assert pytest.approx(diff, abs=1e-2) == 0 diff --git a/tests/models_test/preprocess_test/models_preprocessing_test.py b/tests/models_test/preprocess_test/models_preprocessing_test.py index 8a727fd0..85cf4775 100644 --- a/tests/models_test/preprocess_test/models_preprocessing_test.py +++ b/tests/models_test/preprocess_test/models_preprocessing_test.py @@ -1,8 +1,9 @@ """Test preprocessing models.""" -from typing import Tuple +from typing import NamedTuple import numpy as np import pandas as pd +import pytest from pytest import approx import ethicml as em @@ -24,237 +25,163 @@ ) -def test_vfae(toy_train_test: TrainTestPair): - """Test vfae.""" +class PreprocessTest(NamedTuple): + """Define a test for a preprocess model.""" + + model: PreAlgorithm + name: str + num_pos: int + + +METHOD_LIST = [ + PreprocessTest( + model=VFAE( + dir='/tmp', + dataset="Toy", + supervised=True, + epochs=10, + fairness="Eq. Opp", + batch_size=100, + ), + name="VFAE", + num_pos=56, + ), + PreprocessTest( + model=VFAE( + dir='/tmp', + dataset="Toy", + supervised=False, + epochs=10, + fairness="Eq. Opp", + batch_size=100, + ), + name="VFAE", + num_pos=47, + ), + PreprocessTest(model=Zemel(dir='/tmp'), name="Zemel", num_pos=51), + PreprocessTest(model=Beutel(dir='/tmp'), name="Beutel DP", num_pos=49), + PreprocessTest( + model=Beutel(dir='/tmp', epochs=5, fairness="EqOp"), name="Beutel EqOp", num_pos=56 + ), + PreprocessTest(model=Upsampler(strategy="naive"), name="Upsample naive", num_pos=43), + PreprocessTest(model=Upsampler(strategy="uniform"), name="Upsample uniform", num_pos=44), + PreprocessTest( + model=Upsampler(strategy="preferential"), name="Upsample preferential", num_pos=45 + ), + PreprocessTest( + model=Calders(preferable_class=1, disadvantaged_group=0), name="Calders", num_pos=43 + ), +] + + +@pytest.mark.parametrize("model,name,num_pos", METHOD_LIST) +def test_pre(toy_train_test: TrainTestPair, model: PreAlgorithm, name: str, num_pos: int): + """Test preprocessing.""" train, test = toy_train_test - vfae_model: PreAlgorithm = VFAE(dataset="Toy", epochs=10, batch_size=100) - assert vfae_model is not None - assert vfae_model.name == "VFAE" - - new_train_test: Tuple[DataTuple, TestTuple] = vfae_model.run(train, test) - new_train, new_test = new_train_test - - assert len(new_train) == len(train) - assert new_test.x.shape[0] == test.x.shape[0] - svm_model: InAlgorithm = SVM() - assert svm_model is not None - assert svm_model.name == "SVM" - - predictions: Prediction = svm_model.run_test(new_train, new_test) - assert predictions.hard.values[predictions.hard.values == 1].shape[0] == 65 - assert predictions.hard.values[predictions.hard.values == 0].shape[0] == 15 - - vfae_model = VFAE(dataset="Toy", supervised=True, epochs=10, fairness="Eq. 
Opp", batch_size=100) - assert vfae_model is not None - assert vfae_model.name == "VFAE" - new_train_test = vfae_model.run(train, test) - new_train, new_test = new_train_test + assert model.name == name + new_train, new_test = model.run(train, test) - assert new_train.x.shape[0] == train.x.shape[0] - assert new_test.x.shape[0] == test.x.shape[0] - assert new_test.name == "VFAE: " + str(test.name) - assert new_train.name == "VFAE: " + str(train.name) + if not isinstance(model, Upsampler): + assert new_train.x.shape[0] == train.x.shape[0] + assert new_test.x.shape[0] == test.x.shape[0] - predictions = svm_model.run_test(new_train, new_test) - assert predictions.hard.values[predictions.hard.values == 1].shape[0] == 65 - assert predictions.hard.values[predictions.hard.values == 0].shape[0] == 15 + assert new_train.x.shape[1] == model.out_size + assert new_test.x.shape[1] == model.out_size + assert new_test.name == f"{name}: " + str(test.name) + assert new_train.name == f"{name}: " + str(train.name) - vfae_model = VFAE( - dataset="Toy", supervised=False, epochs=10, fairness="Eq. Opp", batch_size=100 - ) - assert vfae_model is not None - assert vfae_model.name == "VFAE" + preds = svm_model.run_test(new_train, new_test) + assert preds.hard.values[preds.hard.values == 1].shape[0] == num_pos + assert preds.hard.values[preds.hard.values == 0].shape[0] == len(preds) - num_pos - new_train_test = vfae_model.run(train, test) - new_train, new_test = new_train_test - assert new_train.x.shape[0] == train.x.shape[0] - assert new_test.x.shape[0] == test.x.shape[0] - - predictions = svm_model.run_test(new_train, new_test) - assert predictions.hard.values[predictions.hard.values == 1].shape[0] == 44 - assert predictions.hard.values[predictions.hard.values == 0].shape[0] == 36 - - -def test_threaded_zemel(toy_train_test: TrainTestPair): - """Test threaded zemel.""" +@pytest.mark.parametrize("model,name,num_pos", METHOD_LIST) +def test_pre_sep_fit_transform( + toy_train_test: TrainTestPair, model: PreAlgorithm, name: str, num_pos: int +): + """Test preprocessing.""" train, test = toy_train_test - model: PreAlgorithmAsync = Zemel() - assert model is not None - assert model.name == "Zemel" - - new_train_test: Tuple[DataTuple, TestTuple] = em.run_blocking(model.run_async(train, test)) - new_train, new_test = new_train_test - - assert new_train.x.shape[0] == train.x.shape[0] - assert new_test.x.shape[0] == test.x.shape[0] - - classifier: InAlgorithm = SVM() - assert classifier is not None - assert classifier.name == "SVM" - - predictions: Prediction = classifier.run_test(new_train, new_test) - assert predictions.hard.values[predictions.hard.values == 1].shape[0] == 51 - assert predictions.hard.values[predictions.hard.values == 0].shape[0] == 29 - - beut_model: PreAlgorithm = Zemel() - assert beut_model is not None - assert beut_model.name == "Zemel" - - new_train_test = beut_model.run(train, test) - new_train, new_test = new_train_test - - assert new_train.x.shape[0] == train.x.shape[0] - assert new_test.x.shape[0] == test.x.shape[0] - assert new_test.name == "Zemel: " + str(test.name) - assert new_train.name == "Zemel: " + str(train.name) - svm_model: InAlgorithm = SVM() - assert svm_model is not None - assert svm_model.name == "SVM" - - predictions = svm_model.run_test(new_train, new_test) - assert predictions.hard.values[predictions.hard.values == 1].shape[0] == 51 - assert predictions.hard.values[predictions.hard.values == 0].shape[0] == 29 - - -def test_threaded_beutel(toy_train_test: TrainTestPair): - """Test 
threaded beutel.""" - train, test = toy_train_test - - model: PreAlgorithmAsync = Beutel() - assert model is not None - assert model.name == "Beutel DP" - - new_train_test: Tuple[DataTuple, TestTuple] = em.run_blocking(model.run_async(train, test)) - new_train, new_test = new_train_test - assert new_train.x.shape[0] == train.x.shape[0] - assert new_test.x.shape[0] == test.x.shape[0] - assert new_test.name == "Beutel DP: " + str(test.name) - assert new_train.name == "Beutel DP: " + str(train.name) - - classifier: InAlgorithm = SVM() - assert classifier is not None - assert classifier.name == "SVM" - - predictions: Prediction = classifier.run_test(new_train, new_test) - assert predictions.hard.values[predictions.hard.values == 1].shape[0] == 49 - assert predictions.hard.values[predictions.hard.values == 0].shape[0] == 31 - - beut_model: PreAlgorithm = Beutel() - assert beut_model is not None - assert beut_model.name == "Beutel DP" - - new_train_test = beut_model.run(train, test) - new_train, new_test = new_train_test - - assert new_train.x.shape[0] == train.x.shape[0] - assert new_test.x.shape[0] == test.x.shape[0] - assert new_test.name == "Beutel DP: " + str(test.name) - assert new_train.name == "Beutel DP: " + str(train.name) - - svm_model: InAlgorithm = SVM() - assert svm_model is not None - assert svm_model.name == "SVM" - - predictions = svm_model.run_test(new_train, new_test) - assert predictions.hard.values[predictions.hard.values == 1].shape[0] == 49 - assert predictions.hard.values[predictions.hard.values == 0].shape[0] == 31 - - -def test_threaded_custom_beutel(toy_train_test: TrainTestPair): - """Test threaded custom beutel.""" + assert model.name == name + model, new_train = model.fit(train) + new_test = model.transform(test) + + if not isinstance(model, Upsampler): + assert new_train.x.shape[0] == train.x.shape[0] + assert new_test.x.shape[0] == test.x.shape[0] + + assert new_train.x.shape[1] == model.out_size + assert new_test.x.shape[1] == model.out_size + assert new_test.name == f"{name}: " + str(test.name) + assert new_train.name == f"{name}: " + str(train.name) + + preds = svm_model.run_test(new_train, new_test) + assert preds.hard.values[preds.hard.values == 1].shape[0] == num_pos + assert preds.hard.values[preds.hard.values == 0].shape[0] == len(preds) - num_pos + + +@pytest.mark.parametrize( + "model,name,num_pos", + [ + PreprocessTest( + model=VFAE( + dir='/tmp', + dataset="Toy", + supervised=True, + epochs=10, + fairness="Eq. Opp", + batch_size=100, + ), + name="VFAE", + num_pos=56, + ), + PreprocessTest( + model=VFAE( + dir='/tmp', + dataset="Toy", + supervised=False, + epochs=10, + fairness="Eq. 
Opp", + batch_size=100, + ), + name="VFAE", + num_pos=47, + ), + PreprocessTest(model=Zemel(dir='/tmp'), name="Zemel", num_pos=51), + PreprocessTest(model=Beutel(dir='/tmp'), name="Beutel DP", num_pos=49), + PreprocessTest( + model=Beutel(dir='/tmp', epochs=5, fairness="EqOp"), name="Beutel EqOp", num_pos=56 + ), + ], +) +def test_threaded_pre(toy_train_test: TrainTestPair, model: PreAlgorithm, name: str, num_pos: int): + """Test vfae.""" train, test = toy_train_test - beut_model: PreAlgorithm = Beutel(epochs=5, fairness="EqOp") - assert beut_model is not None - assert beut_model.name == "Beutel EqOp" - - new_train_test_non_thread: Tuple[DataTuple, TestTuple] = beut_model.run(train, test) - new_train_nt, new_test_nt = new_train_test_non_thread - - assert new_train_nt.x.shape[0] == train.x.shape[0] - assert new_test_nt.x.shape[0] == test.x.shape[0] - svm_model: InAlgorithm = SVM() assert svm_model is not None assert svm_model.name == "SVM" - predictions = svm_model.run_test(new_train_nt, new_test_nt) - assert predictions.hard.values[predictions.hard.values == 1].shape[0] == 56 - assert predictions.hard.values[predictions.hard.values == 0].shape[0] == 24 - - model: PreAlgorithmAsync = Beutel(epochs=5, fairness="EqOp") - assert model is not None - assert model.name == "Beutel EqOp" - - new_train_test: Tuple[DataTuple, TestTuple] = em.run_blocking(model.run_async(train, test)) + assert model.name == name + new_train_test = em.run_blocking(model.run_async(train, test)) new_train, new_test = new_train_test - assert new_train.x.shape[0] == train.x.shape[0] - assert new_test.x.shape[0] == test.x.shape[0] - assert new_test.name == "Beutel EqOp: " + str(test.name) - assert new_train.name == "Beutel EqOp: " + str(train.name) - - classifier: InAlgorithm = SVM() - assert classifier is not None - assert classifier.name == "SVM" - - treaded_predictions: Prediction = classifier.run_test(new_train, new_test) - assert treaded_predictions.hard.values[treaded_predictions.hard.values == 1].shape[0] == 56 - assert treaded_predictions.hard.values[treaded_predictions.hard.values == 0].shape[0] == 24 - - -def test_upsampler(toy_train_test: TrainTestPair): - """Test upsampler.""" - train, test = toy_train_test - - upsampler: PreAlgorithm = Upsampler(strategy="naive") - assert upsampler is not None - assert upsampler.name == "Upsample naive" - - new_train: DataTuple - new_test: TestTuple - new_train, new_test = upsampler.run(train, test) - - assert new_test.x.shape[0] == test.x.shape[0] - assert new_test.name == test.name - assert new_train.name == train.name - - lr_model: InAlgorithm = LR() - assert lr_model is not None - assert lr_model.name == "Logistic Regression (C=1.0)" - - predictions = lr_model.run_test(new_train, new_test) - assert predictions.hard.values[predictions.hard.values == 1].shape[0] == 41 - assert predictions.hard.values[predictions.hard.values == 0].shape[0] == 39 - - upsampler = Upsampler(strategy="uniform") - new_train, new_test = upsampler.run(train, test) - + assert len(new_train) == len(train) assert new_test.x.shape[0] == test.x.shape[0] - assert new_test.name == test.name - assert new_train.name == train.name - - predictions = lr_model.run_test(new_train, new_test) - assert predictions.hard.values[predictions.hard.values == 1].shape[0] == 43 - assert predictions.hard.values[predictions.hard.values == 0].shape[0] == 37 - - upsampler = Upsampler(strategy="preferential") - new_train, new_test = upsampler.run(train, test) + assert new_train.x.shape[0] == train.x.shape[0] assert 
new_test.x.shape[0] == test.x.shape[0] - assert new_test.name == test.name - assert new_train.name == train.name + assert new_test.name == f"{name}: " + str(test.name) + assert new_train.name == f"{name}: " + str(train.name) - predictions = lr_model.run_test(new_train, new_test) - assert predictions.hard.values[predictions.hard.values == 1].shape[0] == 44 - assert predictions.hard.values[predictions.hard.values == 0].shape[0] == 36 + preds = svm_model.run_test(new_train, new_test) + assert preds.hard.values[preds.hard.values == 1].shape[0] == num_pos + assert preds.hard.values[preds.hard.values == 0].shape[0] == len(preds) - num_pos def test_calders(): diff --git a/tests/saving_data_test.py b/tests/saving_data_test.py index 3210ab95..3d832817 100644 --- a/tests/saving_data_test.py +++ b/tests/saving_data_test.py @@ -31,9 +31,9 @@ class CheckEquality(InAlgorithmAsync): """Dummy algorithm class for testing whether writing and reading feather files works.""" def __init__(self) -> None: - super().__init__(name="Check equality") + super().__init__(name="Check equality", seed=-1) - def _script_command(self, train_path, _, pred_path): + def _run_script_command(self, train_path, _, pred_path): """Check if the dataframes loaded from the files are the same as the original ones.""" loaded = DataTuple.from_npz(train_path) pd.testing.assert_frame_equal(data_tuple.x, loaded.x) @@ -43,6 +43,12 @@ def _script_command(self, train_path, _, pred_path): np.savez(pred_path, hard=np.load(train_path)["x"]) return ["-c", "pass"] + def _fit_script_command(self, train_path, model_path): + """Check if the dataframes loaded from the files are the same as the original ones.""" + + def _predict_script_command(self, model_path, test_path, pred_path): + """Check if the dataframes loaded from the files are the same as the original ones.""" + data_x = run_blocking(CheckEquality().run_async(data_tuple, data_tuple)) pd.testing.assert_series_equal( # type: ignore[call-arg] data_tuple.x["a1"], data_x.hard, check_names=False
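
The refactored tests above exercise the new fit/transform split for pre-processing algorithms: `fit` returns the fitted model together with the transformed training set, and `transform` applies the fitted parameters to unseen data. A minimal usage sketch, not part of the diff, assuming `train` and `test` come from the same toy fixture used by the tests and that `Beutel` is importable as shown:

```python
from ethicml.algorithms.preprocess import Beutel  # import path assumed

model = Beutel(dir="/tmp")  # intermediate model artefacts are written under dir
model, new_train = model.fit(train)  # fit returns (fitted model, transformed train set)
new_test = model.transform(test)  # transform reuses the fitted parameters on unseen data

# As the assertions in the tests check, the transformed feature width is
# advertised via out_size, and outputs are renamed with the algorithm's name:
assert new_train.x.shape[1] == model.out_size
assert new_test.name == f"Beutel DP: {test.name}"
```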
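`PreprocessTest` and `METHOD_LIST`, referenced by the parametrize decorators, are defined earlier in the test module and not shown in this hunk. A plausible shape for the helper, hedged as an assumption since its definition is out of view, is a NamedTuple whose fields line up with the `"model,name,num_pos"` argument string:

```python
from typing import List, NamedTuple

from ethicml.algorithms.preprocess.pre_algorithm import PreAlgorithm  # import path assumed


class PreprocessTest(NamedTuple):
    """Bundle a pre-processing model with its expected name and positive-prediction count."""

    model: PreAlgorithm
    name: str
    num_pos: int


# METHOD_LIST would then be a List[PreprocessTest]; its actual entries are not
# shown in this hunk, so none are reproduced here.
```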