add separate fit and test functions #472

Merged: 7 commits, Dec 22, 2021

Changes from all commits
11 changes: 10 additions & 1 deletion ethicml/algorithms/algorithm_base.py
@@ -11,23 +11,32 @@
class Algorithm(ABC):
"""Base class for Algorithms."""

def __init__(self, name: str):
def __init__(self, name: str, seed: int):
"""Base constructor for the Algorithm class.

Args:
name: name of the algorithm
seed: seed for the random number generator
"""
self.__name = name
self.__seed = seed

@property
def name(self) -> str:
"""Name of the algorithm."""
return self.__name

@property
def seed(self) -> int:
"""Seed for the random number generator."""
return self.__seed


class AlgorithmAsync(metaclass=ABCMeta): # pylint: disable=too-few-public-methods
"""Base class of async methods; meant to be used in conjuction with :class:`Algorithm`."""

model_dir: Path

@property
def _executable(self) -> str:
"""Path to a (Python) executable.
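To illustrate the new constructor contract, here is a minimal sketch of a subclass under the changed base class. The class name and seed value are invented for illustration, and it assumes `Algorithm` declares no further abstract members beyond what is shown above.

```python
# Hypothetical subclass demonstrating the new required `seed` argument.
from ethicml.algorithms.algorithm_base import Algorithm


class MyAlgo(Algorithm):
    """Toy subclass; not part of the library."""

    def __init__(self) -> None:
        # `seed` is now required by the base constructor alongside `name`.
        super().__init__(name="MyAlgo", seed=42)


algo = MyAlgo()
print(algo.name, algo.seed)  # both exposed as read-only properties
```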
27 changes: 24 additions & 3 deletions ethicml/algorithms/inprocess/agarwal_reductions.py
@@ -2,6 +2,8 @@
from pathlib import Path
from typing import Dict, List, Optional, Set, Union

from ranzen import implements

from ethicml.utility import ClassifierType, FairnessType

from .in_algorithm import InAlgorithmAsync
@@ -19,6 +21,7 @@ class Agarwal(InAlgorithmAsync):

def __init__(
self,
dir: Union[str, Path],
fairness: FairnessType = "DP",
classifier: ClassifierType = "LR",
eps: float = 0.1,
@@ -31,7 +34,8 @@ def __init__(
raise ValueError(f"results: fairness must be one of {VALID_FAIRNESS!r}.")
if classifier not in VALID_MODELS:
raise ValueError(f"results: classifier must be one of {VALID_MODELS!r}.")
super().__init__(name=f"Agarwal, {classifier}, {fairness}")
super().__init__(name=f"Agarwal, {classifier}, {fairness}", seed=seed)
self.model_dir = dir if isinstance(dir, Path) else Path(dir)
chosen_c, chosen_kernel = settings_for_svm_lr(classifier, C, kernel)
self.flags: Dict[str, Union[str, float, int]] = {
"classifier": classifier,
@@ -43,6 +47,23 @@ def __init__(
"seed": seed,
}

def _script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
args = flag_interface(train_path, test_path, pred_path, self.flags)
@implements(InAlgorithmAsync)
def _run_script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
args = flag_interface(
train_path=train_path, test_path=test_path, pred_path=pred_path, flags=self.flags
)
return ["-m", "ethicml.implementations.agarwal"] + args

@implements(InAlgorithmAsync)
def _fit_script_command(self, train_path: Path, model_path: Path) -> List[str]:
args = flag_interface(train_path=train_path, model_path=model_path, flags=self.flags)
return ["-m", "ethicml.implementations.agarwal"] + args

@implements(InAlgorithmAsync)
def _predict_script_command(
self, model_path: Path, test_path: Path, pred_path: Path
) -> List[str]:
args = flag_interface(
model_path=model_path, test_path=test_path, pred_path=pred_path, flags=self.flags
)
return ["-m", "ethicml.implementations.agarwal"] + args
14 changes: 12 additions & 2 deletions ethicml/algorithms/inprocess/blind.py
@@ -15,8 +15,18 @@ class Blind(InAlgorithm):
"""Returns a random label."""

def __init__(self, seed: int = 888) -> None:
super().__init__(name="Blind", is_fairness_algo=False)
self.seed = seed
super().__init__(name="Blind", is_fairness_algo=False, seed=seed)

@implements(InAlgorithm)
def fit(self, train: DataTuple) -> InAlgorithm:
self.vals = train.y.drop_duplicates()
return self

@implements(InAlgorithm)
def predict(self, test: TestTuple) -> Prediction:
random = np.random.RandomState(self.seed)

return Prediction(hard=pd.Series(random.choice(self.vals.T.to_numpy()[0], test.x.shape[0])))

@implements(InAlgorithm)
def run(self, train: DataTuple, test: TestTuple) -> Prediction:
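The new `fit`/`predict` pair above boils down to sampling uniformly from the unique training labels with a seeded generator. A self-contained sketch of that idea, simplified to a flat array rather than the library's DataFrame-backed types:

```python
import numpy as np
import pandas as pd

# Unique labels, as Blind.fit collects them from the training targets (toy data).
vals = pd.Series([0, 1, 1, 0, 1]).drop_duplicates().to_numpy()

# Seeding a RandomState makes the "blind" predictions reproducible.
rng = np.random.RandomState(888)   # 888 is the constructor's default seed
preds = rng.choice(vals, size=10)  # one random training label per test row
```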
24 changes: 21 additions & 3 deletions ethicml/algorithms/inprocess/fairness_wo_demographics.py
@@ -16,15 +16,17 @@ class DRO(InAlgorithmAsync):

def __init__(
self,
dir: Union[str, Path],
eta: float = 0.5,
epochs: int = 10,
batch_size: int = 32,
network_size: Optional[List[int]] = None,
seed: int = 888,
):
super().__init__(name="Dist Robust Optim")
super().__init__(name="Dist Robust Optim", seed=seed)
if network_size is None:
network_size = [50]
self.model_dir = dir if isinstance(dir, Path) else Path(dir)
self.flags: Dict[str, Union[float, int, str, List[int]]] = {
"eta": eta,
"batch_size": batch_size,
@@ -34,6 +36,22 @@ def __init__(
}

@implements(InAlgorithmAsync)
def _script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
args = flag_interface(train_path, test_path, pred_path, self.flags)
def _run_script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
args = flag_interface(
train_path=train_path, test_path=test_path, pred_path=pred_path, flags=self.flags
)
return ["-m", "ethicml.implementations.dro_tabular"] + args

@implements(InAlgorithmAsync)
def _fit_script_command(self, train_path: Path, model_path: Path) -> List[str]:
args = flag_interface(train_path=train_path, model_path=model_path, flags=self.flags)
return ["-m", "ethicml.implementations.dro_tabular"] + args

@implements(InAlgorithmAsync)
def _predict_script_command(
self, model_path: Path, test_path: Path, pred_path: Path
) -> List[str]:
args = flag_interface(
model_path=model_path, test_path=test_path, pred_path=pred_path, flags=self.flags
)
return ["-m", "ethicml.implementations.dro_tabular"] + args
91 changes: 86 additions & 5 deletions ethicml/algorithms/inprocess/in_algorithm.py
@@ -1,4 +1,6 @@
"""Abstract Base Class of all algorithms in the framework."""
from __future__ import annotations

from abc import abstractmethod
from pathlib import Path
from tempfile import TemporaryDirectory
@@ -15,10 +17,32 @@
class InAlgorithm(Algorithm):
"""Abstract Base Class for algorithms that run in the middle of the pipeline."""

def __init__(self, name: str, is_fairness_algo: bool = True):
super().__init__(name=name)
def __init__(self, name: str, seed: int, is_fairness_algo: bool = True):
super().__init__(name=name, seed=seed)
self.__is_fairness_algo = is_fairness_algo

@abstractmethod
def fit(self, train: DataTuple) -> InAlgorithm:
[Review comment, Member] This would be the perfect opportunity to use `-> Self` but it's not widely supported yet: python/typeshed#6300

"""Run Algorithm on the given data.

Args:
train: training data

Returns:
self, but trained.
"""

@abstractmethod
def predict(self, test: TestTuple) -> Prediction:
"""Run Algorithm on the given data.

Args:
test: data to evaluate on

Returns:
predictions
"""

@abstractmethod
def run(self, train: DataTuple, test: TestTuple) -> Prediction:
"""Run Algorithm on the given data.
@@ -45,6 +69,15 @@ def is_fairness_algo(self) -> bool:
class InAlgorithmAsync(InAlgorithm, AlgorithmAsync):
"""In-Algorithm that can be run blocking and asynchronously."""

@implements(InAlgorithm)
def fit(self, train: DataTuple) -> InAlgorithm:
run_blocking(self.fit_async(train))
return self

@implements(InAlgorithm)
def predict(self, test: TestTuple) -> Prediction:
return run_blocking(self.predict_async(test))

@implements(InAlgorithm)
def run(self, train: DataTuple, test: TestTuple) -> Prediction:
"""Run this asynchronous Algorithm as blocking on the given data.
@@ -58,6 +91,44 @@ def run(self, train: DataTuple, test: TestTuple) -> Prediction:
"""
return run_blocking(self.run_async(train, test))

async def fit_async(self, train: DataTuple) -> InAlgorithmAsync:
"""Run Algorithm on the given data asynchronously.

Args:
train: training data
test: test data

Returns:
predictions
"""
self.model_path = self.model_dir / f"model_{self.name}.joblib"
with TemporaryDirectory() as tmpdir:
tmp_path = Path(tmpdir)
train_path = tmp_path / "train.npz"
train.to_npz(train_path)
cmd = self._fit_script_command(train_path, self.model_path)
await self._call_script(cmd + ["--mode", "fit"]) # wait for script to run
return self

async def predict_async(self, test: TestTuple) -> Prediction:
"""Run Algorithm on the given data asynchronously.

Args:
train: training data
test: test data

Returns:
predictions
"""
with TemporaryDirectory() as tmpdir:
tmp_path = Path(tmpdir)
test_path = tmp_path / "test.npz"
pred_path = tmp_path / "predictions.npz"
test.to_npz(test_path)
cmd = self._predict_script_command(self.model_path, test_path, pred_path)
await self._call_script(cmd + ["--mode", "predict"]) # wait for script to run
return Prediction.from_npz(pred_path)

async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
"""Run Algorithm on the given data asynchronously.

@@ -75,10 +146,20 @@ async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
pred_path = tmp_path / "predictions.npz"
train.to_npz(train_path)
test.to_npz(test_path)
cmd = self._script_command(train_path, test_path, pred_path)
await self._call_script(cmd) # wait for scrip to run
cmd = self._run_script_command(train_path, test_path, pred_path)
await self._call_script(cmd + ["--mode", "run"]) # wait for script to run
return Prediction.from_npz(pred_path)

@abstractmethod
def _script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
def _run_script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
"""The command that will run the script."""

@abstractmethod
def _fit_script_command(self, train_path: Path, model_path: Path) -> List[str]:
"""The command that will run the script."""

@abstractmethod
def _predict_script_command(
self, model_path: Path, test_path: Path, pred_path: Path
) -> List[str]:
"""The command that will run the script."""
18 changes: 16 additions & 2 deletions ethicml/algorithms/inprocess/installed_model.py
@@ -13,6 +13,7 @@
from typing import List, Optional

import git
from ranzen import implements

from .in_algorithm import InAlgorithmAsync

@@ -29,6 +30,7 @@ def __init__(
top_dir: str,
url: Optional[str] = None,
executable: Optional[str] = None,
seed: int = 888,
):
"""Download code from given URL and create Pip environment with Pipfile found in the code.

@@ -39,6 +41,7 @@
simply the last part of the repository URL)
url: (optional) URL of the repository
executable: (optional) path to a Python executable
seed: Random seed to use for reproducibility
"""
# QUESTION: do we really need `store_dir`? we could also just clone the code into "."
self._store_dir: Path = Path(".") / dir_name # directory where code and venv are stored
@@ -54,7 +57,7 @@ def __init__(
self.__executable = str(self._code_path.resolve() / ".venv" / "bin" / "python")
else:
self.__executable = executable
super().__init__(name=name)
super().__init__(name=name, seed=seed)

@property
def _code_path(self) -> Path:
@@ -90,5 +93,16 @@ def remove(self) -> None:
except OSError as excep:
print(f"Error: {excep.filename} - {excep.strerror}.")

def _script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
@implements(InAlgorithmAsync)
def _run_script_command(self, train_path: Path, test_path: Path, pred_path: Path) -> List[str]:
return [] # pylint was complaining when I didn't return anything here...

@implements(InAlgorithmAsync)
def _fit_script_command(self, train_path: Path, model_path: Path) -> List[str]:
return []

@implements(InAlgorithmAsync)
def _predict_script_command(
self, model_path: Path, test_path: Path, pred_path: Path
) -> List[str]:
return []
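Based on the constructor signature above, a hypothetical subclass wiring an external repository into the framework might look like this; the URL and names are invented for illustration.

```python
from ethicml.algorithms.inprocess.installed_model import InstalledModel


class ExternalMethod(InstalledModel):
    """Illustrative only; the repository below does not exist."""

    def __init__(self) -> None:
        super().__init__(
            name="external method",
            dir_name="external_method",      # where code and venv are stored
            top_dir="external-method-repo",  # top-level dir of the cloned repo
            url="https://github.com/example/external-method-repo.git",
            seed=888,
        )
```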
37 changes: 35 additions & 2 deletions ethicml/algorithms/inprocess/kamiran.py
@@ -3,6 +3,7 @@

import numpy as np
import pandas as pd
import sklearn.linear_model._base
from ranzen import implements
from sklearn.linear_model import LogisticRegression

@@ -28,12 +29,22 @@ def __init__(
kernel: Optional[str] = None,
seed: int = 888,
):
super().__init__(name=f"Kamiran & Calders {classifier}")
super().__init__(name=f"Kamiran & Calders {classifier}", seed=seed)
if classifier not in VALID_MODELS:
raise ValueError(f"results: classifier must be one of {VALID_MODELS!r}.")
self.classifier = classifier
self.C, self.kernel = settings_for_svm_lr(classifier, C, kernel)
self.seed = seed

@implements(InAlgorithm)
def fit(self, train: DataTuple) -> InAlgorithm:
self.clf = _train(
train, classifier=self.classifier, C=self.C, kernel=self.kernel, seed=self.seed
)
return self

@implements(InAlgorithm)
def predict(self, test: TestTuple) -> Prediction:
return _predict(model=self.clf, test=test)

@implements(InAlgorithm)
def run(self, train: DataTuple, test: TestTuple) -> Prediction:
@@ -69,6 +80,28 @@ def compute_instance_weights(
return pd.DataFrame(group_weights[inv_indexes_gi], columns=["instance weights"])


def _train(
train: DataTuple, classifier: ClassifierType, C: float, kernel: str, seed: int
) -> sklearn.linear_model._base.LinearModel:
if classifier == "SVM":
model = select_svm(C=C, kernel=kernel, seed=seed)
else:
random_state = np.random.RandomState(seed=seed)
model = LogisticRegression(
solver="liblinear", random_state=random_state, max_iter=5000, C=C
)
model.fit(
train.x,
train.y.to_numpy().ravel(),
sample_weight=compute_instance_weights(train)["instance weights"],
)
return model


def _predict(model: sklearn.linear_model._base.LinearModel, test: TestTuple) -> Prediction:
return Prediction(hard=pd.Series(model.predict(test.x)))


def _train_and_predict(
train: DataTuple, test: TestTuple, classifier: ClassifierType, C: float, kernel: str, seed: int
) -> Prediction:
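To make the reweighing step concrete, a minimal sketch of how `compute_instance_weights` feeds into a scikit-learn classifier, mirroring `_train` above. It assumes `train` and `test` are a `DataTuple`/`TestTuple` with `.x`, `.s`, and `.y` fields, and that `compute_instance_weights` (defined in this file) is in scope.

```python
from sklearn.linear_model import LogisticRegression

# Per-instance weights from the Kamiran & Calders reweighing scheme.
weights = compute_instance_weights(train)["instance weights"]

# Weighted training mirrors _train's logistic-regression branch.
clf = LogisticRegression(solver="liblinear", max_iter=5000, C=1.0)
clf.fit(train.x, train.y.to_numpy().ravel(), sample_weight=weights)
preds = clf.predict(test.x)
```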