diff --git a/docs/api/evaluation.md b/docs/api/evaluation.md index 848706bc..556ec7e8 100644 --- a/docs/api/evaluation.md +++ b/docs/api/evaluation.md @@ -1,3 +1,7 @@ +::: polaris.evaluate.BenchmarkPredictions + +--- + ::: polaris.evaluate.ResultsMetadata options: filters: ["!^_"] @@ -25,4 +29,4 @@ ::: polaris.evaluate.metrics.generic_metrics ::: polaris.evaluate.metrics.docking_metrics ---- \ No newline at end of file +--- diff --git a/docs/tutorials/competition.participate.ipynb b/docs/tutorials/competition.participate.ipynb index 0ee6f223..1301e1e9 100644 --- a/docs/tutorials/competition.participate.ipynb +++ b/docs/tutorials/competition.participate.ipynb @@ -190,6 +190,9 @@ "competition_predictions = CompetitionPredictions(\n", " name=\"hello-world-result\",\n", " predictions=predictions,\n", + " target_labels=competition.target_cols,\n", + " test_set_labels=competition.test_set_labels,\n", + " test_set_sizes=competition.test_set_sizes,\n", " github_url=\"https://github.com/polaris-hub/polaris-hub\",\n", " paper_url=\"https://polarishub.io/\",\n", " description=\"Hello, World!\",\n", diff --git a/mkdocs.yml b/mkdocs.yml index 0fd09511..d261228a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -24,8 +24,9 @@ nav: - PDB Datasets: tutorials/dataset_pdb.ipynb - SDF Datasets: tutorials/dataset_sdf.ipynb - Optimization: tutorials/optimization.ipynb - - Competitions: - - tutorials/competition.participate.ipynb + # NOTE (cwognum): Competitions are currently gated. + # - Competitions: + # - tutorials/competition.participate.ipynb - API Reference: - Load: api/load.md - Core: @@ -33,10 +34,11 @@ nav: - Benchmark: api/benchmark.md - Subset: api/subset.md - Evaluation: api/evaluation.md - - Competitions: - - Competition Dataset: api/competition.dataset.md - - Competition: api/competition.md - - Competiton Evaluation: api/competition.evaluation.md + # NOTE (cwognum): Competitions are currently gated. 
+ # - Competitions: + # - Competition Dataset: api/competition.dataset.md + # - Competition: api/competition.md + # - Competiton Evaluation: api/competition.evaluation.md - Hub: - Client: api/hub.client.md - External Auth Client: api/hub.external_client.md diff --git a/polaris/benchmark/__init__.py b/polaris/benchmark/__init__.py index b1f86863..416f6245 100644 --- a/polaris/benchmark/__init__.py +++ b/polaris/benchmark/__init__.py @@ -4,4 +4,8 @@ SingleTaskBenchmarkSpecification, ) -__all__ = ["BenchmarkSpecification", "SingleTaskBenchmarkSpecification", "MultiTaskBenchmarkSpecification"] +__all__ = [ + "BenchmarkSpecification", + "SingleTaskBenchmarkSpecification", + "MultiTaskBenchmarkSpecification", +] diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index ec22852e..66b6be17 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -30,7 +30,7 @@ from polaris.utils.types import ( AccessType, HubOwner, - PredictionsType, + IncomingPredictionsType, SplitType, TargetType, TaskType, @@ -131,7 +131,9 @@ def _validate_cols(cls, v, info: ValidationInfo): if info.data.get("dataset") is not None and not all( c in info.data["dataset"].table.columns for c in v ): - raise InvalidBenchmarkError("Not all specified target columns were found in the dataset.") + raise InvalidBenchmarkError("Not all specified columns were found in the dataset.") + if len(set(v)) != len(v): + raise InvalidBenchmarkError("The task specifies duplicate columns") return v @field_validator("metrics") @@ -173,19 +175,18 @@ def _validate_split(self) -> Self: 4) There is no overlap between the train and test set 5) No row exists in the test set where all labels are missing/empty """ + + if not isinstance(self.split[1], dict): + self.split = self.split[0], {"test": self.split[1]} split = self.split # Train partition can be empty (zero-shot) # Test partitions cannot be empty - if (isinstance(split[1], dict) and any(len(v) == 0 for v in split[1].values())) or ( - not isinstance(split[1], dict) and len(split[1]) == 0 - ): + if any(len(v) == 0 for v in split[1].values()): raise InvalidBenchmarkError("The predefined split contains empty test partitions") train_idx_list = split[0] - full_test_idx_list = ( - list(chain.from_iterable(split[1].values())) if isinstance(split[1], dict) else split[1] - ) + full_test_idx_list = list(chain.from_iterable(split[1].values())) if len(train_idx_list) == 0: logger.info( @@ -206,14 +207,11 @@ def _validate_split(self) -> Self: # Check for duplicate indices within a given test set. Because a user can specify # multiple test sets for a given benchmark and it is acceptable for indices to be shared # across test sets, we check for duplicates in each test set independently. 
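+        # At this point the split has already been normalized to (train_indices, {test_set_name: test_indices}),
+        # e.g. ([0, 1], {"test": [2, 3]}), so we can always iterate over the named test sets below.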
-        if isinstance(split[1], dict):
-            for test_set_name, test_set_idx_list in split[1].items():
-                if len(test_set_idx_list) != len(set(test_set_idx_list)):
-                    raise InvalidBenchmarkError(
-                        f'Test set with name "{test_set_name}" contains duplicate indices'
-                    )
-        elif len(full_test_idx_set) != len(full_test_idx_list):
-            raise InvalidBenchmarkError("The test set contains duplicate indices")
+        for test_set_name, test_set_idx_list in split[1].items():
+            if len(test_set_idx_list) != len(set(test_set_idx_list)):
+                raise InvalidBenchmarkError(
+                    f'Test set with name "{test_set_name}" contains duplicate indices'
+                )
 
         # All indices are valid given the dataset
         dataset = self.dataset
@@ -307,18 +305,13 @@ def _compute_checksum(self):
         for m in sorted(self.metrics, key=lambda k: k.name):
             hash_fn.update(m.name.encode("utf-8"))
 
-        if not isinstance(self.split[1], dict):
-            split = self.split[0], {"test": self.split[1]}
-        else:
-            split = self.split
-
         # Train set
-        s = json.dumps(sorted(split[0]))
+        s = json.dumps(sorted(self.split[0]))
         hash_fn.update(s.encode("utf-8"))
 
         # Test sets
-        for k in sorted(split[1].keys()):
-            s = json.dumps(sorted(split[1][k]))
+        for k in sorted(self.split[1].keys()):
+            s = json.dumps(sorted(self.split[1][k]))
             hash_fn.update(k.encode("utf-8"))
             hash_fn.update(s.encode("utf-8"))
 
@@ -335,7 +328,7 @@ def n_train_datapoints(self) -> int:
     @property
     def n_test_sets(self) -> int:
         """The number of test sets"""
-        return len(self.split[1]) if isinstance(self.split[1], dict) else 1
+        return len(self.split[1])
 
     @computed_field
     @property
@@ -370,6 +363,18 @@ def task_type(self) -> str:
         v = TaskType.MULTI_TASK if len(self.target_cols) > 1 else TaskType.SINGLE_TASK
         return v.value
 
+    @computed_field
+    @property
+    def test_set_labels(self) -> list[str]:
+        """The labels of the test sets."""
+        return sorted(list(self.split[1].keys()))
+
+    @computed_field
+    @property
+    def test_set_sizes(self) -> dict[str, int]:
+        """The sizes of the test sets."""
+        return {k: len(v) for k, v in self.split[1].items()}
+
     def _get_subset(self, indices, hide_targets=True, featurization_fn=None):
         """Returns a [`Subset`][polaris.dataset.Subset] using the given indices. Used
         internally to construct the train and test sets."""
@@ -393,10 +398,7 @@ def make_test_subset(vals):
             return self._get_subset(vals, hide_targets=hide_targets, featurization_fn=featurization_fn)
 
         test_split = self.split[1]
-        if isinstance(test_split, dict):
-            test = {k: make_test_subset(v) for k, v in test_split.items()}
-        else:
-            test = make_test_subset(test_split)
+        test = {k: make_test_subset(v) for k, v in test_split.items()}
 
         return test
 
@@ -422,10 +424,16 @@ def get_train_test_split(
         train = self._get_subset(self.split[0], hide_targets=False, featurization_fn=featurization_fn)
         test = self._get_test_set(hide_targets=True, featurization_fn=featurization_fn)
 
+        # For improved UX, we return the object instead of the dictionary if there is only one test set.
+        # Internally, however, assuming that the test set is always a dictionary simplifies the code.
+        if len(test) == 1:
+            test = test["test"]
         return train, test
 
     def evaluate(
-        self, y_pred: Optional[PredictionsType] = None, y_prob: Optional[PredictionsType] = None
+        self,
+        y_pred: IncomingPredictionsType | None = None,
+        y_prob: IncomingPredictionsType | None = None,
     ) -> BenchmarkResults:
         """Execute the evaluation protocol for the benchmark, given a set of predictions.
@@ -433,16 +441,6 @@ def evaluate( Contrary to other frameworks that you might be familiar with, we opted for a signature that includes just the predictions. This reduces the chance of accidentally using the test targets during training. - info: Expected structure for `y_pred` and `y_prob` arguments - The supplied `y_pred` and `y_prob` arguments must adhere to a certain structure depending on the number of - tasks and test sets included in the benchmark. Refer to the following for guidance on the correct structure when - creating your `y_pred` and `y_prod` objects: - - - Single task, single set: `[values...]` - - Multi-task, single set: `{task_name_1: [values...], task_name_2: [values...]}` - - Single task, multi-set: `{test_set_1: {task_name: [values...]}, test_set_2: {task_name: [values...]}}` - - Multi-task, multi-set: `{test_set_1: {task_name_1: [values...], task_name_2: [values...]}, test_set_2: {task_name_1: [values...], task_name_2: [values...]}}` - For this method, we make the following assumptions: 1. There can be one or multiple test set(s); @@ -456,7 +454,8 @@ def evaluate( If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys. If there are multiple test sets, the predictions should be further wrapped in a dictionary with the test subset labels as keys. - y_prob: The predicted probabilities for the test set, as NumPy arrays. + y_prob: The predicted probabilities for the test set, formatted similarly to predictions, based on the + number of tasks and test sets. Returns: A `BenchmarkResults` object. This object can be directly submitted to the Polaris Hub. @@ -475,31 +474,22 @@ def evaluate( """ # Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves. - # The `evaluate_benchmark` function expects the benchmark labels to be of a certain structure which - # depends on the number of tasks and test sets defined for the benchmark. Below, we build the structure - # of the benchmark labels based on the aforementioned factors. 
- test = self._get_test_set(hide_targets=False) - if isinstance(test, dict): - # - # For multi-set benchmarks - y_true = {} - for test_set_name, values in test.items(): - y_true[test_set_name] = {} - if isinstance(values.targets, dict): - # - # For multi-task, multi-set benchmarks - for task_name, values in values.targets.items(): - y_true[test_set_name][task_name] = values - else: - # - # For single task, multi-set benchmarks - y_true[test_set_name][self.target_cols[0]] = values.targets - else: - # - # For single set benchmarks (single and multiple task) - y_true = test.targets + y_true_subset = self._get_test_set(hide_targets=False) + y_true_values = {k: v.targets for k, v in y_true_subset.items()} + + # Simplify the case where there is only one test set + if len(y_true_values) == 1: + y_true_values = y_true_values["test"] - scores = evaluate_benchmark(self.target_cols, self.metrics, y_true, y_pred=y_pred, y_prob=y_prob) + scores = evaluate_benchmark( + target_cols=self.target_cols, + test_set_labels=self.test_set_labels, + test_set_sizes=self.test_set_sizes, + metrics=self.metrics, + y_true=y_true_values, + y_pred=y_pred, + y_prob=y_prob, + ) return BenchmarkResults(results=scores, benchmark_name=self.name, benchmark_owner=self.owner) diff --git a/polaris/evaluate/__init__.py b/polaris/evaluate/__init__.py index 95e61efc..7ab8172a 100644 --- a/polaris/evaluate/__init__.py +++ b/polaris/evaluate/__init__.py @@ -1,11 +1,12 @@ from polaris.evaluate._metric import Metric, MetricInfo +from polaris.evaluate._predictions import BenchmarkPredictions from polaris.evaluate._results import ( BenchmarkResults, - ResultsType, - CompetitionResults, CompetitionPredictions, - ResultsMetadata, + CompetitionResults, EvaluationResult, + ResultsMetadata, + ResultsType, ) from polaris.evaluate.utils import evaluate_benchmark @@ -19,4 +20,5 @@ "ResultsType", "evaluate_benchmark", "CompetitionPredictions", + "BenchmarkPredictions", ] diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 048f61c9..652bd5ce 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -1,12 +1,12 @@ from enum import Enum -from typing import Callable, Literal, Optional +from typing import Callable, Literal import numpy as np from pydantic import BaseModel, Field - from sklearn.metrics import ( accuracy_score, average_precision_score, + balanced_accuracy_score, explained_variance_score, f1_score, matthews_corrcoef, @@ -14,17 +14,15 @@ mean_squared_error, r2_score, roc_auc_score, - balanced_accuracy_score, ) from polaris.evaluate.metrics import ( - cohen_kappa_score, absolute_average_fold_error, - spearman, + cohen_kappa_score, pearsonr, + spearman, ) from polaris.evaluate.metrics.docking_metrics import rmsd_coverage - from polaris.utils.types import DirectionType @@ -107,7 +105,7 @@ def y_type(self) -> bool: return self.value.y_type def score( - self, y_true: np.ndarray, y_pred: Optional[np.ndarray] = None, y_prob: Optional[np.ndarray] = None + self, y_true: np.ndarray, y_pred: np.ndarray | None = None, y_prob: np.ndarray | None = None ) -> float: """Endpoint for computing the metric. 
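+
+        For example (illustrative), `metric.score(y_true=y, y_pred=y_hat)` computes the metric value
+        for two numpy arrays; `y_prob` only needs to be provided for metrics that are computed from
+        predicted probabilities rather than predicted labels.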
@@ -134,7 +132,7 @@ def score( return self.fn(**kwargs, **self.value.kwargs) def __call__( - self, y_true: np.ndarray, y_pred: Optional[np.ndarray] = None, y_prob: Optional[np.ndarray] = None + self, y_true: np.ndarray, y_pred: np.ndarray | None = None, y_prob: np.ndarray | None = None ) -> float: """For convenience, make metrics callable""" return self.score(y_true, y_pred, y_prob) diff --git a/polaris/evaluate/_predictions.py b/polaris/evaluate/_predictions.py new file mode 100644 index 00000000..af8e94fa --- /dev/null +++ b/polaris/evaluate/_predictions.py @@ -0,0 +1,179 @@ +import numpy as np +from pydantic import ( + BaseModel, + ConfigDict, + TypeAdapter, + field_serializer, + field_validator, + model_validator, +) +from typing_extensions import Self + +from polaris.utils.misc import convert_lists_to_arrays +from polaris.utils.types import IncomingPredictionsType, PredictionsType + + +class BenchmarkPredictions(BaseModel): + """ + Base model to represent predictions in the Polaris code base. + + Guided by [Postel's Law](https://en.wikipedia.org/wiki/Robustness_principle), + this class normalizes different formats to a single, internal representation. + + Attributes: + predictions: The predictions for the benchmark. + target_labels: The target columns for the associated benchmark. + test_set_labels: The names of the test sets for the associated benchmark. + """ + + predictions: PredictionsType + target_labels: list[str] + test_set_labels: list[str] + test_set_sizes: dict[str, int] + + model_config = ConfigDict(arbitrary_types_allowed=True) + + @field_serializer("predictions") + def _serialize_predictions(self, predictions: PredictionsType): + """ + Recursively converts all numpy values in the predictions dictionary to lists + so they can be serialized. + """ + + def convert_to_list(v): + if isinstance(v, np.ndarray): + return v.tolist() + elif isinstance(v, dict): + return {k: convert_to_list(v) for k, v in v.items()} + + return convert_to_list(predictions) + + @field_validator("target_labels", "test_set_labels") + @classmethod + def _validate_labels(cls, v: list[str]) -> list[str]: + if len(set(v)) != len(v): + raise ValueError("The predictions contain duplicate columns") + return v + + @model_validator(mode="before") + @classmethod + def _validate_predictions(cls, data: dict) -> dict: + """Normalizes the predictions format to a standard representation we use internally""" + + # This model validator runs before any Pydantic internal validation. + # This way we can normalize the incoming data to a standard representation. + # However, this implies that the fields can theoretically be any type. 
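+        # For example, for a single-task benchmark with a single test set, a plain list such as
+        # [0.1, 0.2, 0.3] is normalized to {"test": {"my_target": np.array([0.1, 0.2, 0.3])}}
+        # (where "test" and "my_target" stand in for the benchmark's test set and target labels).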
+ + # Ensure the type of the incoming predictions is correct + validator = TypeAdapter(IncomingPredictionsType, config={"arbitrary_types_allowed": True}) + predictions = validator.validate_python(data.get("predictions")) + + # Ensure the type of the target_labels and test_set_labels is correct + validator = TypeAdapter(list[str]) + target_labels = validator.validate_python(data.get("target_labels")) + test_set_labels = validator.validate_python(data.get("test_set_labels")) + + validator = TypeAdapter(dict[str, int]) + test_set_sizes = validator.validate_python(data.get("test_set_sizes")) + + # Normalize the predictions to a standard representation + predictions = convert_lists_to_arrays(predictions) + predictions = cls._normalize_predictions(predictions, target_labels, test_set_labels) + + return { + "predictions": predictions, + "target_labels": target_labels, + "test_set_labels": test_set_labels, + "test_set_sizes": test_set_sizes, + } + + @model_validator(mode="after") + def check_test_set_size(self) -> Self: + """Verify that the size of all predictions""" + for test_set_label, test_set in self.predictions.items(): + for target in test_set.values(): + if test_set_label not in self.test_set_sizes: + raise ValueError(f"Expected size for test set '{test_set_label}' is not defined") + + if len(target) != self.test_set_sizes[test_set_label]: + raise ValueError( + f"Predictions size mismatch: The predictions for test set '{test_set_label}' " + f"should have a size of {self.test_set_sizes[test_set_label]}, but have a size of {len(target)}." + ) + return self + + @classmethod + def _normalize_predictions( + cls, predictions: IncomingPredictionsType, target_labels: list[str], test_set_labels: list[str] + ) -> PredictionsType: + """ + Normalizes the predictions to a standard representation we use internally. + This standard representation is a nested, two-level dictionary: + `{test_set_name: {target_column: np.ndarray}}` + """ + # (1) If the predictions are already fully specified, no need to do anything + if cls._is_fully_specified(predictions, target_labels, test_set_labels): + return predictions + + # If not fully specified, we distinguish 4 cases based on the type of benchmark. + is_single_task = len(target_labels) == 1 + is_single_test = len(test_set_labels) == 1 + + # (2) Single-task, single test set: We expect a numpy array as input. + if is_single_task and is_single_test: + if isinstance(predictions, dict): + raise ValueError( + "The predictions for single-task, single test set benchmarks should be a numpy array." + ) + predictions = {test_set_labels[0]: {target_labels[0]: predictions}} + + # (3) Single-task, multiple test sets: We expect a dictionary with the test set labels as keys. + elif is_single_task and not is_single_test: + if not isinstance(predictions, dict) or set(predictions.keys()) != set(test_set_labels): + raise ValueError( + "The predictions for single-task, multiple test sets benchmarks " + "should be a dictionary with the test set labels as keys." + ) + predictions = {k: {target_labels[0]: v} for k, v in predictions.items()} + + # (4) Multi-task, single test set: We expect a dictionary with the target labels as keys. + elif not is_single_task and is_single_test: + if not isinstance(predictions, dict) or set(predictions.keys()) != set(target_labels): + raise ValueError( + "The predictions for multi-task, single test set benchmarks " + "should be a dictionary with the target labels as keys." 
+ ) + predictions = {test_set_labels[0]: predictions} + + # (5) Multi-task, multi-test sets: The predictions should be fully-specified + else: + raise ValueError( + "The predictions for multi-task, multi-test sets benchmarks should be fully-specified " + "as a nested, two-level dictionary: { test_set_name: { target_column: np.ndarray } }" + ) + + return predictions + + @classmethod + def _is_fully_specified( + cls, predictions: IncomingPredictionsType, target_labels: list[str], test_set_labels: list[str] + ) -> bool: + """ + Check if the predictions are fully specified for the target columns and test set names. + """ + # Not a dictionary + if not isinstance(predictions, dict): + return False + + # Outer-level of the dictionary should correspond to the test set names + if set(predictions.keys()) != set(test_set_labels): + return False + + # Inner-level of the dictionary should correspond to the target columns + for test_set_predictions in predictions.values(): + if not isinstance(test_set_predictions, dict): + return False + if set(test_set_predictions.keys()) != set(target_labels): + return False + + return True diff --git a/polaris/evaluate/_results.py b/polaris/evaluate/_results.py index 4de807b6..217fd0e8 100644 --- a/polaris/evaluate/_results.py +++ b/polaris/evaluate/_results.py @@ -2,7 +2,6 @@ from datetime import datetime from typing import ClassVar, Optional, Union -import numpy as np import pandas as pd from pydantic import ( BaseModel, @@ -16,18 +15,16 @@ from pydantic.alias_generators import to_camel from polaris._artifact import BaseArtifactModel -from polaris.evaluate import Metric +from polaris.evaluate import BenchmarkPredictions, Metric from polaris.hub.settings import PolarisHubSettings from polaris.utils.dict2html import dict2html from polaris.utils.errors import InvalidResultError from polaris.utils.misc import slugify from polaris.utils.types import ( AccessType, - CompetitionPredictionsType, HttpUrlString, HubOwner, HubUser, - PredictionsType, SlugCompatibleStringType, ) @@ -261,44 +258,16 @@ def competition_artifact_id(self) -> str: return f"{self.competition_owner}/{slugify(self.competition_name)}" -class CompetitionPredictions(ResultsMetadata): - """Class specific to predictions for competition benchmarks. +class CompetitionPredictions(ResultsMetadata, BenchmarkPredictions): + """ + Predictions for competition benchmarks. This object is to be used as input to [`CompetitionSpecification.evaluate`][polaris.competition.CompetitionSpecification.evaluate]. It is used to ensure that the structure of the predictions are compatible with evaluation methods on the Polaris Hub. + In addition to the predictions, it contains additional meta-data to create a results object. Attributes: - predictions: The predictions created for a given competition's test set(s). + access: The access the returned results should have """ - predictions: Union[PredictionsType, CompetitionPredictionsType] access: Optional[AccessType] = "private" - - @field_validator("predictions") - @classmethod - def _convert_predictions(cls, value: Union[PredictionsType, CompetitionPredictionsType]): - """Convert prediction arrays from a list type to a numpy array. 
This is required for certain - operations during prediction evaluation""" - - if isinstance(value, list): - return np.array(value) - elif isinstance(value, np.ndarray): - return value - elif isinstance(value, dict): - for key, val in value.items(): - value[key] = cls._convert_predictions(val) - return value - - @field_serializer("predictions") - def _serialize_predictions(self, value: PredictionsType): - """Used to serialize a Predictions object such that it can be sent over the wire during - external evaluation for competitions""" - - if isinstance(value, np.ndarray): - return value.tolist() - elif isinstance(value, list): - return value - elif isinstance(value, dict): - for key, val in value.items(): - value[key] = self._serialize_predictions(val) - return value diff --git a/polaris/evaluate/metrics/__init__.py b/polaris/evaluate/metrics/__init__.py index d14bdf17..0b14ceec 100644 --- a/polaris/evaluate/metrics/__init__.py +++ b/polaris/evaluate/metrics/__init__.py @@ -1,7 +1,7 @@ +from polaris.evaluate.metrics.docking_metrics import rmsd_coverage from polaris.evaluate.metrics.generic_metrics import ( - cohen_kappa_score, absolute_average_fold_error, - spearman, + cohen_kappa_score, pearsonr, + spearman, ) -from polaris.evaluate.metrics.docking_metrics import rmsd_coverage diff --git a/polaris/evaluate/utils.py b/polaris/evaluate/utils.py index 65e8778e..b2c72280 100644 --- a/polaris/evaluate/utils.py +++ b/polaris/evaluate/utils.py @@ -1,45 +1,42 @@ import numpy as np import pandas as pd -from typing import Optional - -from polaris.evaluate import BenchmarkResults, ResultsType -from polaris.utils.types import PredictionsType -from polaris.evaluate import Metric from numpy.typing import NDArray - -def is_multi_task_single_test_set(vals: PredictionsType, target_cols: list[str]): - """Check if the given values are for a multiple-task benchmark with a single - test set. This is inferred by comparing the target names with the keys of the - given data. If all keys in the given data match the target column names, we - assume they are target names (as opposed to test set names for a single-task, - multiple test set benchmark).""" - return all(k in target_cols for k in vals) +from polaris.evaluate import BenchmarkPredictions, BenchmarkResults, Metric, ResultsType +from polaris.utils.types import IncomingPredictionsType -def normalize_predictions_type(vals: PredictionsType, target_cols: list[str]): - if isinstance(vals, dict): - if is_multi_task_single_test_set(vals, target_cols): - return {"test": vals} - else: - return vals - elif vals is None: +def _optionally_get(preds: BenchmarkPredictions | None, keys: list[str] | str) -> dict | None: + """ + Returns the value in a nested dictionary associated with a sequence of keys + if it exists, otherwise return None + """ + if preds is None: return None - else: - return {"test": {target_cols[0]: vals}} - -def safe_mask( - input_values: dict | dict[str, dict], test_label: str, target_label: str, mask: NDArray[np.bool_] -): - if ( - input_values is None - or input_values.get(test_label) is None - or input_values[test_label].get(target_label) is None - ): + if not isinstance(keys, list): + keys = [keys] + + d = preds.predictions + for k in keys: + d = d.get(k) + if d is None: + return None + return d + + +def _safe_mask( + preds: BenchmarkPredictions | None, + mask: NDArray[np.bool_], + keys: list[str], +) -> NDArray[np.float64] | None: + """ + Mask a prediction array if it exists in a nested array. 
Otherwise return None + """ + v = _optionally_get(preds, keys) + if v is None: return None - else: - return np.array(input_values[test_label][target_label])[mask] + return v[mask] def mask_index(input_values): @@ -58,44 +55,61 @@ def mask_index(input_values): def evaluate_benchmark( target_cols: list[str], + test_set_labels: list[str], + test_set_sizes: dict[str, int], metrics: list[Metric], - y_true: PredictionsType, - y_pred: Optional[PredictionsType] = None, - y_prob: Optional[PredictionsType] = None, + y_true: IncomingPredictionsType, + y_pred: IncomingPredictionsType | None = None, + y_prob: IncomingPredictionsType | None = None, ): - y_true = normalize_predictions_type(y_true, target_cols) - y_pred = normalize_predictions_type(y_pred, target_cols) - y_prob = normalize_predictions_type(y_prob, target_cols) - - if y_pred and set(y_true.keys()) != set(y_pred.keys()): - raise KeyError(f"Missing keys for at least one of the test sets. Expecting: {sorted(y_true.keys())}") - + """ + Utility function that contains the evaluation logic for a benchmark + """ + + # Normalize the ground truth and predictions to a consistent, internal representation. + # Format is a two-level dictionary: {test_set_label: {target_label: np.ndarray}} + y_true = BenchmarkPredictions( + predictions=y_true, + target_labels=target_cols, + test_set_labels=test_set_labels, + test_set_sizes=test_set_sizes, + ) + if y_pred is not None: + y_pred = BenchmarkPredictions( + predictions=y_pred, + target_labels=target_cols, + test_set_labels=test_set_labels, + test_set_sizes=test_set_sizes, + ) + if y_prob is not None: + y_prob = BenchmarkPredictions( + predictions=y_prob, + target_labels=target_cols, + test_set_labels=test_set_labels, + test_set_sizes=test_set_sizes, + ) + + # Compute the results # Results are saved in a tabular format. For more info, see the BenchmarkResults docs. scores: ResultsType = pd.DataFrame(columns=BenchmarkResults.RESULTS_COLUMNS) # For every test set... - for test_label, y_true_subset in y_true.items(): + for test_label, y_true_test in y_true.predictions.items(): # For every metric... for metric in metrics: if metric.is_multitask: # Multi-task but with a metric across targets score = metric( - y_true=y_true_subset, y_pred=y_pred.get(test_label), y_prob=y_prob.get(test_label) + y_true=y_true_test, + y_pred=_optionally_get(y_pred, test_label), + y_prob=_optionally_get(y_prob, test_label), ) scores.loc[len(scores)] = (test_label, "aggregated", metric, score) continue - if not isinstance(y_true_subset, dict): - # Single task - score = metric( - y_true=y_true_subset, y_pred=y_pred.get(test_label), y_prob=y_prob.get(test_label) - ) - scores.loc[len(scores)] = (test_label, target_cols[0], metric, score) - continue - # Otherwise, for every target... - for target_label, y_true_target in y_true_subset.items(): + for target_label, y_true_target in y_true_test.items(): # Single-task metrics for a multi-task benchmark # In such a setting, there can be NaN values, which we thus have to filter out. 
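+                # Note that `y_pred` and `y_prob` may be None here (e.g. `y_prob` is only supplied for
+                # probability-based metrics); in that case `_safe_mask` simply returns None as well.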
@@ -103,8 +117,8 @@ def evaluate_benchmark( score = metric( y_true=y_true_target[mask], - y_pred=safe_mask(y_pred, test_label, target_label, mask), - y_prob=safe_mask(y_prob, test_label, target_label, mask), + y_pred=_safe_mask(y_pred, mask, [test_label, target_label]), + y_prob=_safe_mask(y_prob, mask, [test_label, target_label]), ) scores.loc[len(scores)] = (test_label, target_label, metric, score) diff --git a/polaris/hub/client.py b/polaris/hub/client.py index d2278452..b7c3247d 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -23,9 +23,9 @@ SingleTaskBenchmarkSpecification, ) from polaris.competition import CompetitionSpecification + +from polaris.evaluate import BenchmarkResults, CompetitionPredictions, CompetitionResults from polaris.dataset import CompetitionDataset, Dataset, DatasetV1 -from polaris.evaluate import BenchmarkResults, CompetitionResults -from polaris.evaluate._results import CompetitionPredictions from polaris.experimental._dataset_v2 import DatasetV2 from polaris.hub.external_client import ExternalAuthClient from polaris.hub.oauth import CachedTokenAuth diff --git a/polaris/utils/misc.py b/polaris/utils/misc.py index b9156ea5..79b6583d 100644 --- a/polaris/utils/misc.py +++ b/polaris/utils/misc.py @@ -1,6 +1,8 @@ from typing import Any -from polaris.utils.types import SlugCompatibleStringType, SlugStringType +import numpy as np + +from polaris.utils.types import ListOrArrayType, SlugCompatibleStringType, SlugStringType def listit(t: Any): @@ -16,3 +18,19 @@ def slugify(sluggable: SlugCompatibleStringType) -> SlugStringType: Converts a slug-compatible string to a slug. """ return sluggable.lower().replace("_", "-").strip("-") + + +def convert_lists_to_arrays(predictions: ListOrArrayType | dict) -> np.ndarray | dict: + """ + Recursively converts all plain Python lists in the predictions object to numpy arrays + """ + + def convert_to_array(v): + if isinstance(v, np.ndarray): + return v + elif isinstance(v, list): + return np.array(v) + elif isinstance(v, dict): + return {k: convert_to_array(v) for k, v in v.items()} + + return convert_to_array(predictions) diff --git a/polaris/utils/types.py b/polaris/utils/types.py index 27c94992..86d8d4ef 100644 --- a/polaris/utils/types.py +++ b/polaris/utils/types.py @@ -25,19 +25,26 @@ The second item can either be a single test set or a dictionary with multiple, named test sets. """ -PredictionsType: TypeAlias = Union[np.ndarray, dict[str, Union[np.ndarray, dict[str, np.ndarray]]]] +ListOrArrayType: TypeAlias = list | np.ndarray """ -A prediction is one of three things: +A list of numbers or a numpy array. Predictions can be provided as either a list or a numpy array. +""" + +IncomingPredictionsType: TypeAlias = ListOrArrayType | dict[str, ListOrArrayType | dict[str, ListOrArrayType]] +""" +The type of the predictions that are ingested into the Polaris BenchmarkPredictions object. Can be one +of the following: - A single array (single-task, single test set) - A dictionary of arrays (single-task, multiple test sets) - A dictionary of dictionaries of arrays (multi-task, multiple test sets) """ -CompetitionPredictionsType: TypeAlias = Union[list, dict[str, Union[list, dict[str, list]]]] +PredictionsType: TypeAlias = dict[str, dict[str, np.ndarray]] """ -An additional type to represent the structure of predictions which are specific to competitions. This -type allows for the predictions to be sent over the wire for external evaluation. +The normalized format for predictions for internal use. 
Predictions are accepted in a generous +variety of representations and normalized into this standard format, a dictionary of dictionaries +that looks like {"test_set_name": {"target_name": np.ndarray}}. """ DatapointPartType = Union[Any, tuple[Any], dict[str, Any]] diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index a3b998ea..41bf5ce0 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -36,7 +36,7 @@ def test_split_verification(is_single_task, test_single_task_benchmark, test_mul cls(split=(train_split, {"test": []}), **default_kwargs) # Non-exclusive partitions with pytest.raises(ValidationError): - cls(split=(train_split, test_split + train_split[:1]), **default_kwargs) + cls(split=(train_split, test_split["test"] + train_split[:1]), **default_kwargs) with pytest.raises(ValidationError): cls(split=(train_split, {"test1": test_split, "test2": train_split[:1]}), **default_kwargs) # Invalid indices @@ -45,21 +45,22 @@ def test_split_verification(is_single_task, test_single_task_benchmark, test_mul with pytest.raises(ValidationError): cls(split=(train_split + [-1], test_split), **default_kwargs) with pytest.raises(ValidationError): - cls(split=(train_split, test_split + [len(obj.dataset)]), **default_kwargs) + cls(split=(train_split, test_split["test"] + [len(obj.dataset)]), **default_kwargs) with pytest.raises(ValidationError): - cls(split=(train_split, test_split + [-1]), **default_kwargs) + cls(split=(train_split, test_split["test"] + [-1]), **default_kwargs) # Duplicate indices with pytest.raises(ValidationError): cls(split=(train_split + train_split[:1], test_split), **default_kwargs) with pytest.raises(ValidationError): - cls(split=(train_split, test_split + test_split[:1]), **default_kwargs) + cls(split=(train_split, test_split["test"] + test_split["test"][:1]), **default_kwargs) with pytest.raises(ValidationError): cls( - split=(train_split, {"test1": test_split, "test2": test_split + test_split[:1]}), **default_kwargs + split=(train_split, {"test1": test_split, "test2": test_split["test"] + test_split["test"][:1]}), + **default_kwargs, ) # It should _not_ fail with duplicate indices across test partitions - cls(split=(train_split, {"test1": test_split, "test2": test_split}), **default_kwargs) + cls(split=(train_split, {"test1": test_split["test"], "test2": test_split["test"]}), **default_kwargs) # It should _not_ fail with missing indices cls(split=(train_split[:-1], test_split), **default_kwargs) # It should _not_ fail with an empty train set @@ -178,7 +179,7 @@ def _check_for_failure(_kwargs): _check_for_failure(kwargs) kwargs = obj.model_dump() - kwargs["split"] = kwargs["split"][0], kwargs["split"][1][1:] + kwargs["split"] = kwargs["split"][0], kwargs["split"][1]["test"][1:] _check_for_failure(kwargs) # Metrics diff --git a/tests/test_benchmark_predictions.py b/tests/test_benchmark_predictions.py new file mode 100644 index 00000000..a2112a5c --- /dev/null +++ b/tests/test_benchmark_predictions.py @@ -0,0 +1,247 @@ +import numpy as np +import pytest + +from polaris.evaluate import BenchmarkPredictions + + +def assert_deep_equal(result, expected): + assert isinstance(result, type(expected)), f"Types differ: {type(result)} != {type(expected)}" + + if isinstance(expected, dict): + assert result.keys() == expected.keys() + for key in expected: + assert_deep_equal(result[key], expected[key]) + elif isinstance(expected, np.ndarray): + assert np.array_equal(result, expected) + else: + assert result == expected + + +def 
test_benchmark_predictions_normalization(): + # Single task, single test set + assert_deep_equal( + {"test": {"col1": np.array([1, 2, 3])}}, + BenchmarkPredictions( + predictions=[1, 2, 3], + target_labels=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ).predictions, + ) + assert_deep_equal( + {"test": {"col1": np.array([1, 2, 3])}}, + BenchmarkPredictions( + predictions={"test": {"col1": [1, 2, 3]}}, + target_labels=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ).predictions, + ) + + # Single task, multiple test sets + assert_deep_equal( + {"test": {"col1": np.array([1, 2, 3])}, "test2": {"col1": np.array([4, 5, 6])}}, + BenchmarkPredictions( + predictions={"test": [1, 2, 3], "test2": [4, 5, 6]}, + target_labels=["col1"], + test_set_labels=["test", "test2"], + test_set_sizes={"test": 3, "test2": 3}, + ).predictions, + ) + assert_deep_equal( + {"test1": {"col1": np.array([1, 2, 3])}, "test2": {"col1": np.array([4, 5, 6])}}, + BenchmarkPredictions( + predictions={"test1": {"col1": [1, 2, 3]}, "test2": {"col1": [4, 5, 6]}}, + target_labels=["col1"], + test_set_labels=["test1", "test2"], + test_set_sizes={"test1": 3, "test2": 3}, + ).predictions, + ) + + # Multi-task, single test set + assert_deep_equal( + {"test": {"col1": np.array([1, 2, 3]), "col2": np.array([4, 5, 6])}}, + BenchmarkPredictions( + predictions={"col1": [1, 2, 3], "col2": [4, 5, 6]}, + target_labels=["col1", "col2"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ).predictions, + ) + assert_deep_equal( + {"test": {"col1": np.array([1, 2, 3]), "col2": np.array([4, 5, 6])}}, + BenchmarkPredictions( + predictions={"test": {"col1": [1, 2, 3], "col2": [4, 5, 6]}}, + target_labels=["col1", "col2"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ).predictions, + ) + + # Multi-task, multiple test sets + assert_deep_equal( + { + "test1": {"col1": np.array([1, 2, 3]), "col2": np.array([4, 5, 6])}, + "test2": {"col1": np.array([7, 8, 9]), "col2": np.array([10, 11, 12])}, + }, + BenchmarkPredictions( + predictions={ + "test1": {"col1": [1, 2, 3], "col2": [4, 5, 6]}, + "test2": {"col1": [7, 8, 9], "col2": [10, 11, 12]}, + }, + target_labels=["col1", "col2"], + test_set_labels=["test1", "test2"], + test_set_sizes={"test1": 3, "test2": 3}, + ).predictions, + ) + + +def test_benchmark_predictions_incorrect_keys(): + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions=[1, 2, 3], + target_labels=["col1"], + test_set_labels=["test1", "test2"], + test_set_sizes={"test1": 3}, + ) + + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions=[1, 2, 3], + target_labels=["col1", "col2"], + test_set_labels=["test1"], + test_set_sizes={"test1": 3}, + ) + + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"col1": [1, 2, 3]}, + target_labels=["col1"], + test_set_labels=["test1", "test2"], + test_set_sizes={"test1": 3, "test2": 3}, + ) + + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"test1": {"col1": [1, 2, 3]}, "test2": {"col1": [4, 5, 6]}}, + target_labels=["col1"], + test_set_labels=["test1", "test2", "test3"], + test_set_sizes={"test1": 3, "test2": 3, "test3": 3}, + ) + + +def test_benchmark_predictions_type_checking(): + v1 = {"test": {"col1": ["strings", "also", "valid"]}} + v2 = BenchmarkPredictions( + predictions=["strings", "also", "valid"], + target_labels=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ).predictions + + assert list(v1.keys()) == ["test"] + assert 
list(v2.keys()) == ["test"] + assert list(v1["test"].keys()) == ["col1"] + assert list(v2["test"].keys()) == ["col1"] + assert isinstance(v2["test"]["col1"], np.ndarray) + assert np.array_equal(v2["test"]["col1"], np.array(["strings", "also", "valid"])) + + +def test_invalid_benchmark_predictions_errors(): + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"test": {"col1": [1, 2, 3]}, "test2": [4, 5, 6]}, + target_cols=["col1", "col2"], + test_set_labels=["test", "test2"], + test_set_sizes={"test": 3, "test2": 3}, + ) + + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"test": {"col1": "not an array or list"}}, + target_cols=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 1}, + ) + + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"test": {"wrong column name": [1, 2, 3]}}, + target_cols=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ) + + # You should either fully or minimally specify the predictions. + # We don't allow in-between results. + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"col1": [1, 2, 3]}, + target_cols=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ) + + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"test": [1, 2, 3]}, + target_cols=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ) + + # You shouldn't specify more keys than expected. + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"test": {"col1": [1, 2, 3], "col2": [4, 5, 6]}}, + target_cols=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ) + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"test": {"col1": [1, 2, 3]}, "test2": {"col1": [1, 2, 3]}}, + target_cols=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3, "test2": 3}, + ) + + # Incorrect size + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"test": {"col": [1, 2, 3, 4]}}, + target_cols=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ) + + # Invalid test_set_sizes + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"test": {"col": [1, 2, 3]}}, + target_cols=["col1"], + test_set_labels=["test"], + test_set_sizes={"test1": 3}, + ) + + +def test_benchmark_predictions_serialization(): + predictions = BenchmarkPredictions( + predictions=[1, 2, 3], + target_labels=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ) + serialized = predictions.model_dump() + assert serialized["predictions"] == {"test": {"col1": [1, 2, 3]}} + assert serialized["target_labels"] == ["col1"] + assert serialized["test_set_labels"] == ["test"] + + deserialized = BenchmarkPredictions(**serialized) + assert set(deserialized.predictions.keys()) == {"test"} + assert set(deserialized.predictions["test"].keys()) == {"col1"} + assert np.array_equal(deserialized.predictions["test"]["col1"], np.array([1, 2, 3])) + assert deserialized.target_labels == ["col1"] + assert deserialized.test_set_labels == ["test"] + assert set(deserialized.test_set_sizes.keys()) == {"test"} + assert deserialized.test_set_sizes["test"] == 3 diff --git a/tests/test_competition.py b/tests/test_competition.py index e6421585..19311994 100644 --- a/tests/test_competition.py +++ b/tests/test_competition.py @@ -1,8 +1,8 @@ import numpy as np import pandas as pd -from polaris.evaluate.utils import evaluate_benchmark, normalize_predictions_type from polaris.competition 
import CompetitionSpecification +from polaris.evaluate.utils import evaluate_benchmark def test_competition_from_json(test_competition, tmpdir): @@ -15,13 +15,18 @@ def test_competition_from_json(test_competition, tmpdir): def test_multi_col_competition_evaluation(test_competition): """Test that multi-column competitions will be evaluated properly when when target labels are read as a pandas dataframe from a file.""" - data = np.random.randint(2, size=(6, 3)) + data = np.random.randint(2, size=(10, 3)) labels = pd.DataFrame(data, columns=["Column1", "Column2", "Column3"]) labels_as_from_hub = {col: np.array(labels[col]) for col in labels.columns} predictions = {target_col: np.random.randint(2, size=labels.shape[0]) for target_col in labels.columns} result = evaluate_benchmark( - ["Column1", "Column2", "Column3"], test_competition.metrics, labels_as_from_hub, y_pred=predictions + target_cols=["Column1", "Column2", "Column3"], + test_set_labels=["test"], + test_set_sizes=test_competition.test_set_sizes, + metrics=test_competition.metrics, + y_true=labels_as_from_hub, + y_pred=predictions, ) assert isinstance(result, pd.DataFrame) @@ -36,7 +41,7 @@ def test_multi_col_competition_evaluation(test_competition): def test_single_col_competition_evaluation(test_competition): """Test that multi-column competitions will be evaluated properly when when target labels are read as a pandas dataframe from a file.""" - data = np.array( + y_true = np.array( [ 1.15588236, 1.56414507, @@ -48,18 +53,18 @@ def test_single_col_competition_evaluation(test_competition): 0.86099644, 0.67568671, 2.28213589, - 1.06617679, - 1.05709529, - 0.67568671, - 0.67568671, - 0.67568671, ] ) - labels = {"LOG HLM_CLint (mL/min/kg)": data} - predictions = data + np.random.uniform(0, 3, size=len(data)) + + y_pred = y_true + np.random.uniform(0, 3, size=len(y_true)) result = evaluate_benchmark( - ["LOG HLM_CLint (mL/min/kg)"], test_competition.metrics, labels, y_pred=predictions + target_cols=["LOG HLM_CLint (mL/min/kg)"], + test_set_labels=["test"], + test_set_sizes=test_competition.test_set_sizes, + metrics=test_competition.metrics, + y_true=y_true, + y_pred=y_pred, ) assert isinstance(result, pd.DataFrame) @@ -69,35 +74,3 @@ def test_single_col_competition_evaluation(test_competition): "Metric", "Score", } - - -def test_normalize_predictions_type(): - "Single column, single test set" - assert {"test": {"col1": [1, 2, 3]}} == normalize_predictions_type([1, 2, 3], ["col1"]) - assert {"test": {"col1": [1, 2, 3]}} == normalize_predictions_type({"col1": [1, 2, 3]}, ["col1"]) - assert {"test": {"col1": [1, 2, 3]}} == normalize_predictions_type( - {"test": {"col1": [1, 2, 3]}}, ["col1"] - ) - - "Multi-column, single test set" - assert {"test": {"col1": [1, 2, 3], "col2": [4, 5, 6]}} == normalize_predictions_type( - {"col1": [1, 2, 3], "col2": [4, 5, 6]}, ["col1", "col2"] - ) - - assert {"test": {"col1": [1, 2, 3], "col2": [4, 5, 6]}} == normalize_predictions_type( - {"test": {"col1": [1, 2, 3], "col2": [4, 5, 6]}}, ["col1", "col2"] - ) - - "Single column, multi-test set" - assert {"test1": {"col1": [1, 2, 3]}, "test2": {"col1": [4, 5, 6]}} == normalize_predictions_type( - {"test1": {"col1": [1, 2, 3]}, "test2": {"col1": [4, 5, 6]}}, ["col1"] - ) - - "Multi-column, multi-test set" - assert { - "test1": {"col1": [1, 2, 3], "col2": [4, 5, 6]}, - "test2": {"col1": [7, 8, 9], "col2": [10, 11, 12]}, - } == normalize_predictions_type( - {"test1": {"col1": [1, 2, 3], "col2": [4, 5, 6]}, "test2": {"col1": [7, 8, 9], "col2": [10, 11, 12]}}, - 
["col1", "col2"], - )