add separate fit and test functions #472

Merged: 7 commits, Dec 22, 2021

Changes from all commits
11 changes: 10 additions & 1 deletion ethicml/algorithms/algorithm_base.py
@@ -11,23 +11,32 @@
class Algorithm(ABC):
"""Base class for Algorithms."""

def __init__(self, name: str):
def __init__(self, name: str, seed: int):
"""Base constructor for the Algorithm class.

Args:
name: name of the algorithm
seed: seed for the random number generator
"""
self.__name = name
self.__seed = seed

@property
def name(self) -> str:
"""Name of the algorithm."""
return self.__name

@property
def seed(self) -> int:
"""Seed for the random number generator."""
return self.__seed


class AlgorithmAsync(metaclass=ABCMeta): # pylint: disable=too-few-public-methods
"""Base class of async methods; meant to be used in conjuction with :class:`Algorithm`."""

model_dir: Path

@property
def _executable(self) -> str:
"""Path to a (Python) executable.
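To illustrate the new constructor contract, here is a minimal sketch of a subclass under the changed base class. The class name and seed value are invented for illustration, and it assumes `Algorithm` declares no further abstract members beyond what is shown above.

```python
# Hypothetical subclass demonstrating the new required `seed` argument.
from ethicml.algorithms.algorithm_base import Algorithm


class MyAlgo(Algorithm):
    """Toy subclass; not part of the library."""

    def __init__(self) -> None:
        # `seed` is now required by the base constructor alongside `name`.
        super().__init__(name="MyAlgo", seed=42)


algo = MyAlgo()
print(algo.name, algo.seed)  # both exposed as read-only properties
```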
27 changes: 24 additions & 3 deletions ethicml/algorithms/inprocess/agarwal_reductions.py
@@ -2,6 +2,8 @@
from pathlib import Path
from typing import Dict, List, Optional, Set, Union

from ranzen import implements

from ethicml.utility import ClassifierType, FairnessType

from .in_algorithm import InAlgorithmAsync
@@ -19,6 +21,7 @@ class Agarwal(InAlgorithmAsync):

def __init__(
self,
dir: Union[str, Path],
fairness: FairnessType = "DP",
classifier: ClassifierType = "LR",
eps: float = 0.1,
@@ -31,7 +34,8 @@ def __init__(
raise ValueError(f"results: fairness must be one of {VALID_FAIRNESS!r}.")
if classifier not in VALID_MODELS:
raise ValueError(f"results: classifier must be one of {VALID_MODELS!r}.")
super().__init__(name=f"Agarwal, {classifier}, {fairness}")
super().__init__(name=f"Agarwal, {classifier}, {fairness}", seed=seed)
self.model_dir = dir if isinstance(dir, Path) else Path(dir)
chosen_c, chosen_kernel = settings_for_svm_lr(classifier, C, kernel)
self.flags: Dict[str, Union[str, float, int]] = {
"classifier": classifier,
@@ -43,6 +47,23 @@ def __init__(
"seed": seed,
}

def _script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
args = flag_interface(train_path, test_path, pred_path, self.flags)
@implements(InAlgorithmAsync)
def _run_script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
args = flag_interface(
train_path=train_path, test_path=test_path, pred_path=pred_path, flags=self.flags
)
return ["-m", "ethicml.implementations.agarwal"] + args

@implements(InAlgorithmAsync)
def _fit_script_command(self, train_path: Path, model_path: Path) -> List[str]:
args = flag_interface(train_path=train_path, model_path=model_path, flags=self.flags)
return ["-m", "ethicml.implementations.agarwal"] + args

@implements(InAlgorithmAsync)
def _predict_script_command(
self, model_path: Path, test_path: Path, pred_path: Path
) -> List[str]:
args = flag_interface(
model_path=model_path, test_path=test_path, pred_path=pred_path, flags=self.flags
)
return ["-m", "ethicml.implementations.agarwal"] + args
14 changes: 12 additions & 2 deletions ethicml/algorithms/inprocess/blind.py
@@ -15,8 +15,18 @@ class Blind(InAlgorithm):
"""Returns a random label."""

def __init__(self, seed: int = 888) -> None:
super().__init__(name="Blind", is_fairness_algo=False)
self.seed = seed
super().__init__(name="Blind", is_fairness_algo=False, seed=seed)

@implements(InAlgorithm)
def fit(self, train: DataTuple) -> InAlgorithm:
self.vals = train.y.drop_duplicates()
return self

@implements(InAlgorithm)
def predict(self, test: TestTuple) -> Prediction:
random = np.random.RandomState(self.seed)

return Prediction(hard=pd.Series(random.choice(self.vals.T.to_numpy()[0], test.x.shape[0])))

@implements(InAlgorithm)
def run(self, train: DataTuple, test: TestTuple) -> Prediction:
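The new `fit`/`predict` pair above boils down to sampling uniformly from the unique training labels with a seeded generator. A self-contained sketch of that idea, simplified to a flat array rather than the library's DataFrame-backed types:

```python
import numpy as np
import pandas as pd

# Unique labels, as Blind.fit collects them from the training targets (toy data).
vals = pd.Series([0, 1, 1, 0, 1]).drop_duplicates().to_numpy()

# Seeding a RandomState makes the "blind" predictions reproducible.
rng = np.random.RandomState(888)   # 888 is the constructor's default seed
preds = rng.choice(vals, size=10)  # one random training label per test row
```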
24 changes: 21 additions & 3 deletions ethicml/algorithms/inprocess/fairness_wo_demographics.py
@@ -16,15 +16,17 @@ class DRO(InAlgorithmAsync):

def __init__(
self,
dir: Union[str, Path],
eta: float = 0.5,
epochs: int = 10,
batch_size: int = 32,
network_size: Optional[List[int]] = None,
seed: int = 888,
):
super().__init__(name="Dist Robust Optim")
super().__init__(name="Dist Robust Optim", seed=seed)
if network_size is None:
network_size = [50]
self.model_dir = dir if isinstance(dir, Path) else Path(dir)
self.flags: Dict[str, Union[float, int, str, List[int]]] = {
"eta": eta,
"batch_size": batch_size,
@@ -34,6 +36,22 @@ def __init__(
}

@implements(InAlgorithmAsync)
def _script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
args = flag_interface(train_path, test_path, pred_path, self.flags)
def _run_script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
args = flag_interface(
train_path=train_path, test_path=test_path, pred_path=pred_path, flags=self.flags
)
return ["-m", "ethicml.implementations.dro_tabular"] + args

@implements(InAlgorithmAsync)
def _fit_script_command(self, train_path: Path, model_path: Path) -> List[str]:
args = flag_interface(train_path=train_path, model_path=model_path, flags=self.flags)
return ["-m", "ethicml.implementations.dro_tabular"] + args

@implements(InAlgorithmAsync)
def _predict_script_command(
self, model_path: Path, test_path: Path, pred_path: Path
) -> List[str]:
args = flag_interface(
model_path=model_path, test_path=test_path, pred_path=pred_path, flags=self.flags
)
return ["-m", "ethicml.implementations.dro_tabular"] + args
91 changes: 86 additions & 5 deletions ethicml/algorithms/inprocess/in_algorithm.py
@@ -1,4 +1,6 @@
"""Abstract Base Class of all algorithms in the framework."""
from __future__ import annotations

from abc import abstractmethod
from pathlib import Path
from tempfile import TemporaryDirectory
@@ -15,10 +17,32 @@
class InAlgorithm(Algorithm):
"""Abstract Base Class for algorithms that run in the middle of the pipeline."""

def __init__(self, name: str, is_fairness_algo: bool = True):
super().__init__(name=name)
def __init__(self, name: str, seed: int, is_fairness_algo: bool = True):
super().__init__(name=name, seed=seed)
self.__is_fairness_algo = is_fairness_algo

@abstractmethod
def fit(self, train: DataTuple) -> InAlgorithm:
[Review comment, Member] This would be the perfect opportunity to use `-> Self` but it's not widely supported yet: python/typeshed#6300

"""Run Algorithm on the given data.

Args:
train: training data

Returns:
self, but trained.
"""

@abstractmethod
def predict(self, test: TestTuple) -> Prediction:
"""Run Algorithm on the given data.

Args:
test: data to evaluate on

Returns:
predictions
"""

@abstractmethod
def run(self, train: DataTuple, test: TestTuple) -> Prediction:
"""Run Algorithm on the given data.
@@ -45,6 +69,15 @@ def is_fairness_algo(self) -> bool:
class InAlgorithmAsync(InAlgorithm, AlgorithmAsync):
"""In-Algorithm that can be run blocking and asynchronously."""

@implements(InAlgorithm)
def fit(self, train: DataTuple) -> InAlgorithm:
run_blocking(self.fit_async(train))
return self

@implements(InAlgorithm)
def predict(self, test: TestTuple) -> Prediction:
return run_blocking(self.predict_async(test))

@implements(InAlgorithm)
def run(self, train: DataTuple, test: TestTuple) -> Prediction:
"""Run this asynchronous Algorithm as blocking on the given data.
@@ -58,6 +91,44 @@ def run(self, train: DataTuple, test: TestTuple) -> Prediction:
"""
return run_blocking(self.run_async(train, test))

async def fit_async(self, train: DataTuple) -> InAlgorithmAsync:
"""Run Algorithm on the given data asynchronously.

Args:
train: training data
test: test data

Returns:
predictions
"""
self.model_path = self.model_dir / f"model_{self.name}.joblib"
with TemporaryDirectory() as tmpdir:
tmp_path = Path(tmpdir)
train_path = tmp_path / "train.npz"
train.to_npz(train_path)
cmd = self._fit_script_command(train_path, self.model_path)
await self._call_script(cmd + ["--mode", "fit"]) # wait for script to run
return self

async def predict_async(self, test: TestTuple) -> Prediction:
"""Run Algorithm on the given data asynchronously.

Args:
train: training data
test: test data

Returns:
predictions
"""
with TemporaryDirectory() as tmpdir:
tmp_path = Path(tmpdir)
test_path = tmp_path / "test.npz"
pred_path = tmp_path / "predictions.npz"
test.to_npz(test_path)
cmd = self._predict_script_command(self.model_path, test_path, pred_path)
await self._call_script(cmd + ["--mode", "predict"]) # wait for script to run
return Prediction.from_npz(pred_path)

async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
"""Run Algorithm on the given data asynchronously.

@@ -75,10 +146,20 @@ async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
pred_path = tmp_path / "predictions.npz"
train.to_npz(train_path)
test.to_npz(test_path)
cmd = self._script_command(train_path, test_path, pred_path)
await self._call_script(cmd) # wait for scrip to run
cmd = self._run_script_command(train_path, test_path, pred_path)
await self._call_script(cmd + ["--mode", "run"]) # wait for script to run
return Prediction.from_npz(pred_path)

@abstractmethod
def _script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
def _run_script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
"""The command that will run the script."""

@abstractmethod
def _fit_script_command(self, train_path: Path, model_path: Path) -> List[str]:
"""The command that will run the script."""

@abstractmethod
def _predict_script_command(
self, model_path: Path, test_path: Path, pred_path: Path
) -> List[str]:
"""The command that will run the script."""
18 changes: 16 additions & 2 deletions ethicml/algorithms/inprocess/installed_model.py
@@ -13,6 +13,7 @@
from typing import List, Optional

import git
from ranzen import implements

from .in_algorithm import InAlgorithmAsync

@@ -29,6 +30,7 @@ def __init__(
top_dir: str,
url: Optional[str] = None,
executable: Optional[str] = None,
seed: int = 888,
):
"""Download code from given URL and create Pip environment with Pipfile found in the code.

@@ -39,6 +41,7 @@
simply the last part of the repository URL)
url: (optional) URL of the repository
executable: (optional) path to a Python executable
seed: Random seed to use for reproducibility
"""
# QUESTION: do we really need `store_dir`? we could also just clone the code into "."
self._store_dir: Path = Path(".") / dir_name # directory where code and venv are stored
@@ -54,7 +57,7 @@ def __init__(
self.__executable = str(self._code_path.resolve() / ".venv" / "bin" / "python")
else:
self.__executable = executable
super().__init__(name=name)
super().__init__(name=name, seed=seed)

@property
def _code_path(self) -> Path:
@@ -90,5 +93,16 @@ def remove(self) -> None:
except OSError as excep:
print(f"Error: {excep.filename} - {excep.strerror}.")

def _script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
@implements(InAlgorithmAsync)
def _run_script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
return [] # pylint was complaining when I didn't return anything here...

@implements(InAlgorithmAsync)
def _fit_script_command(self, train_path: Path, model_path: Path) -> List[str]:
return []

@implements(InAlgorithmAsync)
def _predict_script_command(
self, model_path: Path, test_path: Path, pred_path: Path
) -> List[str]:
return []
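Based on the constructor signature above, a hypothetical subclass wiring an external repository into the framework might look like this; the URL and names are invented for illustration.

```python
from ethicml.algorithms.inprocess.installed_model import InstalledModel


class ExternalMethod(InstalledModel):
    """Illustrative only; the repository below does not exist."""

    def __init__(self) -> None:
        super().__init__(
            name="external method",
            dir_name="external_method",      # where code and venv are stored
            top_dir="external-method-repo",  # top-level dir of the cloned repo
            url="https://github.com/example/external-method-repo.git",
            seed=888,
        )
```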
37 changes: 35 additions & 2 deletions ethicml/algorithms/inprocess/kamiran.py
@@ -3,6 +3,7 @@

import numpy as np
import pandas as pd
import sklearn.linear_model._base
from ranzen import implements
from sklearn.linear_model import LogisticRegression

@@ -28,12 +29,22 @@ def __init__(
kernel: Optional[str] = None,
seed: int = 888,
):
super().__init__(name=f"Kamiran & Calders {classifier}")
super().__init__(name=f"Kamiran & Calders {classifier}", seed=seed)
if classifier not in VALID_MODELS:
raise ValueError(f"results: classifier must be one of {VALID_MODELS!r}.")
self.classifier = classifier
self.C, self.kernel = settings_for_svm_lr(classifier, C, kernel)
self.seed = seed

@implements(InAlgorithm)
def fit(self, train: DataTuple) -> InAlgorithm:
self.clf = _train(
train, classifier=self.classifier, C=self.C, kernel=self.kernel, seed=self.seed
)
return self

@implements(InAlgorithm)
def predict(self, test: TestTuple) -> Prediction:
return _predict(model=self.clf, test=test)

@implements(InAlgorithm)
def run(self, train: DataTuple, test: TestTuple) -> Prediction:
@@ -69,6 +80,28 @@ def compute_instance_weights(
return pd.DataFrame(group_weights[inv_indexes_gi], columns=["instance weights"])


def _train(
train: DataTuple, classifier: ClassifierType, C: float, kernel: str, seed: int
) -> sklearn.linear_model._base.LinearModel:
if classifier == "SVM":
model = select_svm(C=C, kernel=kernel, seed=seed)
else:
random_state = np.random.RandomState(seed=seed)
model = LogisticRegression(
solver="liblinear", random_state=random_state, max_iter=5000, C=C
)
model.fit(
train.x,
train.y.to_numpy().ravel(),
sample_weight=compute_instance_weights(train)["instance weights"],
)
return model


def _predict(model: sklearn.linear_model._base.LinearModel, test: TestTuple) -> Prediction:
return Prediction(hard=pd.Series(model.predict(test.x)))


def _train_and_predict(
train: DataTuple, test: TestTuple, classifier: ClassifierType, C: float, kernel: str, seed: int
) -> Prediction:
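To make the reweighing step concrete, a minimal sketch of how `compute_instance_weights` feeds into a scikit-learn classifier, mirroring `_train` above. It assumes `train` and `test` are a `DataTuple`/`TestTuple` with `.x`, `.s`, and `.y` fields, and that `compute_instance_weights` (defined in this file) is in scope.

```python
from sklearn.linear_model import LogisticRegression

# Per-instance weights from the Kamiran & Calders reweighing scheme.
weights = compute_instance_weights(train)["instance weights"]

# Weighted training mirrors _train's logistic-regression branch.
clf = LogisticRegression(solver="liblinear", max_iter=5000, C=1.0)
clf.fit(train.x, train.y.to_numpy().ravel(), sample_weight=weights)
preds = clf.predict(test.x)
```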