diff --git a/docs/api/evaluation.md b/docs/api/evaluation.md index 848706bc..556ec7e8 100644 --- a/docs/api/evaluation.md +++ b/docs/api/evaluation.md @@ -1,3 +1,7 @@ +::: polaris.evaluate.BenchmarkPredictions + +--- + ::: polaris.evaluate.ResultsMetadata options: filters: ["!^_"] @@ -25,4 +29,4 @@ ::: polaris.evaluate.metrics.generic_metrics ::: polaris.evaluate.metrics.docking_metrics ---- \ No newline at end of file +--- diff --git a/docs/tutorials/competition.participate.ipynb b/docs/tutorials/competition.participate.ipynb index 0ee6f223..1301e1e9 100644 --- a/docs/tutorials/competition.participate.ipynb +++ b/docs/tutorials/competition.participate.ipynb @@ -190,6 +190,9 @@ "competition_predictions = CompetitionPredictions(\n", " name=\"hello-world-result\",\n", " predictions=predictions,\n", + " target_labels=competition.target_cols,\n", + " test_set_labels=competition.test_set_labels,\n", + " test_set_sizes=competition.test_set_sizes,\n", " github_url=\"https://github.com/polaris-hub/polaris-hub\",\n", " paper_url=\"https://polarishub.io/\",\n", " description=\"Hello, World!\",\n", diff --git a/mkdocs.yml b/mkdocs.yml index 0fd09511..d261228a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -24,8 +24,9 @@ nav: - PDB Datasets: tutorials/dataset_pdb.ipynb - SDF Datasets: tutorials/dataset_sdf.ipynb - Optimization: tutorials/optimization.ipynb - - Competitions: - - tutorials/competition.participate.ipynb + # NOTE (cwognum): Competitions are currently gated. + # - Competitions: + # - tutorials/competition.participate.ipynb - API Reference: - Load: api/load.md - Core: @@ -33,10 +34,11 @@ nav: - Benchmark: api/benchmark.md - Subset: api/subset.md - Evaluation: api/evaluation.md - - Competitions: - - Competition Dataset: api/competition.dataset.md - - Competition: api/competition.md - - Competiton Evaluation: api/competition.evaluation.md + # NOTE (cwognum): Competitions are currently gated. 
+ # - Competitions: + # - Competition Dataset: api/competition.dataset.md + # - Competition: api/competition.md + # - Competiton Evaluation: api/competition.evaluation.md - Hub: - Client: api/hub.client.md - External Auth Client: api/hub.external_client.md diff --git a/polaris/benchmark/__init__.py b/polaris/benchmark/__init__.py index b1f86863..416f6245 100644 --- a/polaris/benchmark/__init__.py +++ b/polaris/benchmark/__init__.py @@ -4,4 +4,8 @@ SingleTaskBenchmarkSpecification, ) -__all__ = ["BenchmarkSpecification", "SingleTaskBenchmarkSpecification", "MultiTaskBenchmarkSpecification"] +__all__ = [ + "BenchmarkSpecification", + "SingleTaskBenchmarkSpecification", + "MultiTaskBenchmarkSpecification", +] diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index ec22852e..66b6be17 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -30,7 +30,7 @@ from polaris.utils.types import ( AccessType, HubOwner, - PredictionsType, + IncomingPredictionsType, SplitType, TargetType, TaskType, @@ -131,7 +131,9 @@ def _validate_cols(cls, v, info: ValidationInfo): if info.data.get("dataset") is not None and not all( c in info.data["dataset"].table.columns for c in v ): - raise InvalidBenchmarkError("Not all specified target columns were found in the dataset.") + raise InvalidBenchmarkError("Not all specified columns were found in the dataset.") + if len(set(v)) != len(v): + raise InvalidBenchmarkError("The task specifies duplicate columns") return v @field_validator("metrics") @@ -173,19 +175,18 @@ def _validate_split(self) -> Self: 4) There is no overlap between the train and test set 5) No row exists in the test set where all labels are missing/empty """ + + if not isinstance(self.split[1], dict): + self.split = self.split[0], {"test": self.split[1]} split = self.split # Train partition can be empty (zero-shot) # Test partitions cannot be empty - if (isinstance(split[1], dict) and any(len(v) == 0 for v in split[1].values())) or ( - not isinstance(split[1], dict) and len(split[1]) == 0 - ): + if any(len(v) == 0 for v in split[1].values()): raise InvalidBenchmarkError("The predefined split contains empty test partitions") train_idx_list = split[0] - full_test_idx_list = ( - list(chain.from_iterable(split[1].values())) if isinstance(split[1], dict) else split[1] - ) + full_test_idx_list = list(chain.from_iterable(split[1].values())) if len(train_idx_list) == 0: logger.info( @@ -206,14 +207,11 @@ def _validate_split(self) -> Self: # Check for duplicate indices within a given test set. Because a user can specify # multiple test sets for a given benchmark and it is acceptable for indices to be shared # across test sets, we check for duplicates in each test set independently. 
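+        # At this point the split has already been normalized to (train_indices, {test_set_name: test_indices}),
+        # e.g. ([0, 1], {"test": [2, 3]}), so we can always iterate over the named test sets below.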
-        if isinstance(split[1], dict):
-            for test_set_name, test_set_idx_list in split[1].items():
-                if len(test_set_idx_list) != len(set(test_set_idx_list)):
-                    raise InvalidBenchmarkError(
-                        f'Test set with name "{test_set_name}" contains duplicate indices'
-                    )
-        elif len(full_test_idx_set) != len(full_test_idx_list):
-            raise InvalidBenchmarkError("The test set contains duplicate indices")
+        for test_set_name, test_set_idx_list in split[1].items():
+            if len(test_set_idx_list) != len(set(test_set_idx_list)):
+                raise InvalidBenchmarkError(
+                    f'Test set with name "{test_set_name}" contains duplicate indices'
+                )
 
         # All indices are valid given the dataset
         dataset = self.dataset
@@ -307,18 +305,13 @@ def _compute_checksum(self):
         for m in sorted(self.metrics, key=lambda k: k.name):
             hash_fn.update(m.name.encode("utf-8"))
 
-        if not isinstance(self.split[1], dict):
-            split = self.split[0], {"test": self.split[1]}
-        else:
-            split = self.split
-
         # Train set
-        s = json.dumps(sorted(split[0]))
+        s = json.dumps(sorted(self.split[0]))
         hash_fn.update(s.encode("utf-8"))
 
         # Test sets
-        for k in sorted(split[1].keys()):
-            s = json.dumps(sorted(split[1][k]))
+        for k in sorted(self.split[1].keys()):
+            s = json.dumps(sorted(self.split[1][k]))
             hash_fn.update(k.encode("utf-8"))
             hash_fn.update(s.encode("utf-8"))
 
@@ -335,7 +328,7 @@ def n_train_datapoints(self) -> int:
     @property
     def n_test_sets(self) -> int:
         """The number of test sets"""
-        return len(self.split[1]) if isinstance(self.split[1], dict) else 1
+        return len(self.split[1])
 
     @computed_field
     @property
@@ -370,6 +363,18 @@ def task_type(self) -> str:
         v = TaskType.MULTI_TASK if len(self.target_cols) > 1 else TaskType.SINGLE_TASK
         return v.value
 
+    @computed_field
+    @property
+    def test_set_labels(self) -> list[str]:
+        """The labels of the test sets."""
+        return sorted(list(self.split[1].keys()))
+
+    @computed_field
+    @property
+    def test_set_sizes(self) -> dict[str, int]:
+        """The sizes of the test sets."""
+        return {k: len(v) for k, v in self.split[1].items()}
+
     def _get_subset(self, indices, hide_targets=True, featurization_fn=None):
         """Returns a [`Subset`][polaris.dataset.Subset] using the given indices. Used
         internally to construct the train and test sets."""
@@ -393,10 +398,7 @@ def make_test_subset(vals):
             return self._get_subset(vals, hide_targets=hide_targets, featurization_fn=featurization_fn)
 
         test_split = self.split[1]
-        if isinstance(test_split, dict):
-            test = {k: make_test_subset(v) for k, v in test_split.items()}
-        else:
-            test = make_test_subset(test_split)
+        test = {k: make_test_subset(v) for k, v in test_split.items()}
 
         return test
 
@@ -422,10 +424,16 @@ def get_train_test_split(
         train = self._get_subset(self.split[0], hide_targets=False, featurization_fn=featurization_fn)
         test = self._get_test_set(hide_targets=True, featurization_fn=featurization_fn)
 
+        # For improved UX, we return the object instead of the dictionary if there is only one test set.
+        # Internally, however, assuming that the test set is always a dictionary simplifies the code.
+        if len(test) == 1:
+            test = test["test"]
         return train, test
 
     def evaluate(
-        self, y_pred: Optional[PredictionsType] = None, y_prob: Optional[PredictionsType] = None
+        self,
+        y_pred: IncomingPredictionsType | None = None,
+        y_prob: IncomingPredictionsType | None = None,
     ) -> BenchmarkResults:
         """Execute the evaluation protocol for the benchmark, given a set of predictions.
@@ -433,16 +441,6 @@ def evaluate( Contrary to other frameworks that you might be familiar with, we opted for a signature that includes just the predictions. This reduces the chance of accidentally using the test targets during training. - info: Expected structure for `y_pred` and `y_prob` arguments - The supplied `y_pred` and `y_prob` arguments must adhere to a certain structure depending on the number of - tasks and test sets included in the benchmark. Refer to the following for guidance on the correct structure when - creating your `y_pred` and `y_prod` objects: - - - Single task, single set: `[values...]` - - Multi-task, single set: `{task_name_1: [values...], task_name_2: [values...]}` - - Single task, multi-set: `{test_set_1: {task_name: [values...]}, test_set_2: {task_name: [values...]}}` - - Multi-task, multi-set: `{test_set_1: {task_name_1: [values...], task_name_2: [values...]}, test_set_2: {task_name_1: [values...], task_name_2: [values...]}}` - For this method, we make the following assumptions: 1. There can be one or multiple test set(s); @@ -456,7 +454,8 @@ def evaluate( If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys. If there are multiple test sets, the predictions should be further wrapped in a dictionary with the test subset labels as keys. - y_prob: The predicted probabilities for the test set, as NumPy arrays. + y_prob: The predicted probabilities for the test set, formatted similarly to predictions, based on the + number of tasks and test sets. Returns: A `BenchmarkResults` object. This object can be directly submitted to the Polaris Hub. @@ -475,31 +474,22 @@ def evaluate( """ # Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves. - # The `evaluate_benchmark` function expects the benchmark labels to be of a certain structure which - # depends on the number of tasks and test sets defined for the benchmark. Below, we build the structure - # of the benchmark labels based on the aforementioned factors. 
- test = self._get_test_set(hide_targets=False) - if isinstance(test, dict): - # - # For multi-set benchmarks - y_true = {} - for test_set_name, values in test.items(): - y_true[test_set_name] = {} - if isinstance(values.targets, dict): - # - # For multi-task, multi-set benchmarks - for task_name, values in values.targets.items(): - y_true[test_set_name][task_name] = values - else: - # - # For single task, multi-set benchmarks - y_true[test_set_name][self.target_cols[0]] = values.targets - else: - # - # For single set benchmarks (single and multiple task) - y_true = test.targets + y_true_subset = self._get_test_set(hide_targets=False) + y_true_values = {k: v.targets for k, v in y_true_subset.items()} + + # Simplify the case where there is only one test set + if len(y_true_values) == 1: + y_true_values = y_true_values["test"] - scores = evaluate_benchmark(self.target_cols, self.metrics, y_true, y_pred=y_pred, y_prob=y_prob) + scores = evaluate_benchmark( + target_cols=self.target_cols, + test_set_labels=self.test_set_labels, + test_set_sizes=self.test_set_sizes, + metrics=self.metrics, + y_true=y_true_values, + y_pred=y_pred, + y_prob=y_prob, + ) return BenchmarkResults(results=scores, benchmark_name=self.name, benchmark_owner=self.owner) diff --git a/polaris/evaluate/__init__.py b/polaris/evaluate/__init__.py index 95e61efc..7ab8172a 100644 --- a/polaris/evaluate/__init__.py +++ b/polaris/evaluate/__init__.py @@ -1,11 +1,12 @@ from polaris.evaluate._metric import Metric, MetricInfo +from polaris.evaluate._predictions import BenchmarkPredictions from polaris.evaluate._results import ( BenchmarkResults, - ResultsType, - CompetitionResults, CompetitionPredictions, - ResultsMetadata, + CompetitionResults, EvaluationResult, + ResultsMetadata, + ResultsType, ) from polaris.evaluate.utils import evaluate_benchmark @@ -19,4 +20,5 @@ "ResultsType", "evaluate_benchmark", "CompetitionPredictions", + "BenchmarkPredictions", ] diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 048f61c9..652bd5ce 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -1,12 +1,12 @@ from enum import Enum -from typing import Callable, Literal, Optional +from typing import Callable, Literal import numpy as np from pydantic import BaseModel, Field - from sklearn.metrics import ( accuracy_score, average_precision_score, + balanced_accuracy_score, explained_variance_score, f1_score, matthews_corrcoef, @@ -14,17 +14,15 @@ mean_squared_error, r2_score, roc_auc_score, - balanced_accuracy_score, ) from polaris.evaluate.metrics import ( - cohen_kappa_score, absolute_average_fold_error, - spearman, + cohen_kappa_score, pearsonr, + spearman, ) from polaris.evaluate.metrics.docking_metrics import rmsd_coverage - from polaris.utils.types import DirectionType @@ -107,7 +105,7 @@ def y_type(self) -> bool: return self.value.y_type def score( - self, y_true: np.ndarray, y_pred: Optional[np.ndarray] = None, y_prob: Optional[np.ndarray] = None + self, y_true: np.ndarray, y_pred: np.ndarray | None = None, y_prob: np.ndarray | None = None ) -> float: """Endpoint for computing the metric. 
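+
+        For example (illustrative), `metric.score(y_true=y, y_pred=y_hat)` computes the metric value
+        for two numpy arrays; `y_prob` only needs to be provided for metrics that are computed from
+        predicted probabilities rather than predicted labels.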
@@ -134,7 +132,7 @@ def score( return self.fn(**kwargs, **self.value.kwargs) def __call__( - self, y_true: np.ndarray, y_pred: Optional[np.ndarray] = None, y_prob: Optional[np.ndarray] = None + self, y_true: np.ndarray, y_pred: np.ndarray | None = None, y_prob: np.ndarray | None = None ) -> float: """For convenience, make metrics callable""" return self.score(y_true, y_pred, y_prob) diff --git a/polaris/evaluate/_predictions.py b/polaris/evaluate/_predictions.py new file mode 100644 index 00000000..af8e94fa --- /dev/null +++ b/polaris/evaluate/_predictions.py @@ -0,0 +1,179 @@ +import numpy as np +from pydantic import ( + BaseModel, + ConfigDict, + TypeAdapter, + field_serializer, + field_validator, + model_validator, +) +from typing_extensions import Self + +from polaris.utils.misc import convert_lists_to_arrays +from polaris.utils.types import IncomingPredictionsType, PredictionsType + + +class BenchmarkPredictions(BaseModel): + """ + Base model to represent predictions in the Polaris code base. + + Guided by [Postel's Law](https://en.wikipedia.org/wiki/Robustness_principle), + this class normalizes different formats to a single, internal representation. + + Attributes: + predictions: The predictions for the benchmark. + target_labels: The target columns for the associated benchmark. + test_set_labels: The names of the test sets for the associated benchmark. + """ + + predictions: PredictionsType + target_labels: list[str] + test_set_labels: list[str] + test_set_sizes: dict[str, int] + + model_config = ConfigDict(arbitrary_types_allowed=True) + + @field_serializer("predictions") + def _serialize_predictions(self, predictions: PredictionsType): + """ + Recursively converts all numpy values in the predictions dictionary to lists + so they can be serialized. + """ + + def convert_to_list(v): + if isinstance(v, np.ndarray): + return v.tolist() + elif isinstance(v, dict): + return {k: convert_to_list(v) for k, v in v.items()} + + return convert_to_list(predictions) + + @field_validator("target_labels", "test_set_labels") + @classmethod + def _validate_labels(cls, v: list[str]) -> list[str]: + if len(set(v)) != len(v): + raise ValueError("The predictions contain duplicate columns") + return v + + @model_validator(mode="before") + @classmethod + def _validate_predictions(cls, data: dict) -> dict: + """Normalizes the predictions format to a standard representation we use internally""" + + # This model validator runs before any Pydantic internal validation. + # This way we can normalize the incoming data to a standard representation. + # However, this implies that the fields can theoretically be any type. 
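+        # For example, for a single-task benchmark with a single test set, a plain list such as
+        # [0.1, 0.2, 0.3] is normalized to {"test": {"my_target": np.array([0.1, 0.2, 0.3])}}
+        # (where "test" and "my_target" stand in for the benchmark's test set and target labels).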
+ + # Ensure the type of the incoming predictions is correct + validator = TypeAdapter(IncomingPredictionsType, config={"arbitrary_types_allowed": True}) + predictions = validator.validate_python(data.get("predictions")) + + # Ensure the type of the target_labels and test_set_labels is correct + validator = TypeAdapter(list[str]) + target_labels = validator.validate_python(data.get("target_labels")) + test_set_labels = validator.validate_python(data.get("test_set_labels")) + + validator = TypeAdapter(dict[str, int]) + test_set_sizes = validator.validate_python(data.get("test_set_sizes")) + + # Normalize the predictions to a standard representation + predictions = convert_lists_to_arrays(predictions) + predictions = cls._normalize_predictions(predictions, target_labels, test_set_labels) + + return { + "predictions": predictions, + "target_labels": target_labels, + "test_set_labels": test_set_labels, + "test_set_sizes": test_set_sizes, + } + + @model_validator(mode="after") + def check_test_set_size(self) -> Self: + """Verify that the size of all predictions""" + for test_set_label, test_set in self.predictions.items(): + for target in test_set.values(): + if test_set_label not in self.test_set_sizes: + raise ValueError(f"Expected size for test set '{test_set_label}' is not defined") + + if len(target) != self.test_set_sizes[test_set_label]: + raise ValueError( + f"Predictions size mismatch: The predictions for test set '{test_set_label}' " + f"should have a size of {self.test_set_sizes[test_set_label]}, but have a size of {len(target)}." + ) + return self + + @classmethod + def _normalize_predictions( + cls, predictions: IncomingPredictionsType, target_labels: list[str], test_set_labels: list[str] + ) -> PredictionsType: + """ + Normalizes the predictions to a standard representation we use internally. + This standard representation is a nested, two-level dictionary: + `{test_set_name: {target_column: np.ndarray}}` + """ + # (1) If the predictions are already fully specified, no need to do anything + if cls._is_fully_specified(predictions, target_labels, test_set_labels): + return predictions + + # If not fully specified, we distinguish 4 cases based on the type of benchmark. + is_single_task = len(target_labels) == 1 + is_single_test = len(test_set_labels) == 1 + + # (2) Single-task, single test set: We expect a numpy array as input. + if is_single_task and is_single_test: + if isinstance(predictions, dict): + raise ValueError( + "The predictions for single-task, single test set benchmarks should be a numpy array." + ) + predictions = {test_set_labels[0]: {target_labels[0]: predictions}} + + # (3) Single-task, multiple test sets: We expect a dictionary with the test set labels as keys. + elif is_single_task and not is_single_test: + if not isinstance(predictions, dict) or set(predictions.keys()) != set(test_set_labels): + raise ValueError( + "The predictions for single-task, multiple test sets benchmarks " + "should be a dictionary with the test set labels as keys." + ) + predictions = {k: {target_labels[0]: v} for k, v in predictions.items()} + + # (4) Multi-task, single test set: We expect a dictionary with the target labels as keys. + elif not is_single_task and is_single_test: + if not isinstance(predictions, dict) or set(predictions.keys()) != set(target_labels): + raise ValueError( + "The predictions for multi-task, single test set benchmarks " + "should be a dictionary with the target labels as keys." 
+ ) + predictions = {test_set_labels[0]: predictions} + + # (5) Multi-task, multi-test sets: The predictions should be fully-specified + else: + raise ValueError( + "The predictions for multi-task, multi-test sets benchmarks should be fully-specified " + "as a nested, two-level dictionary: { test_set_name: { target_column: np.ndarray } }" + ) + + return predictions + + @classmethod + def _is_fully_specified( + cls, predictions: IncomingPredictionsType, target_labels: list[str], test_set_labels: list[str] + ) -> bool: + """ + Check if the predictions are fully specified for the target columns and test set names. + """ + # Not a dictionary + if not isinstance(predictions, dict): + return False + + # Outer-level of the dictionary should correspond to the test set names + if set(predictions.keys()) != set(test_set_labels): + return False + + # Inner-level of the dictionary should correspond to the target columns + for test_set_predictions in predictions.values(): + if not isinstance(test_set_predictions, dict): + return False + if set(test_set_predictions.keys()) != set(target_labels): + return False + + return True diff --git a/polaris/evaluate/_results.py b/polaris/evaluate/_results.py index 4de807b6..217fd0e8 100644 --- a/polaris/evaluate/_results.py +++ b/polaris/evaluate/_results.py @@ -2,7 +2,6 @@ from datetime import datetime from typing import ClassVar, Optional, Union -import numpy as np import pandas as pd from pydantic import ( BaseModel, @@ -16,18 +15,16 @@ from pydantic.alias_generators import to_camel from polaris._artifact import BaseArtifactModel -from polaris.evaluate import Metric +from polaris.evaluate import BenchmarkPredictions, Metric from polaris.hub.settings import PolarisHubSettings from polaris.utils.dict2html import dict2html from polaris.utils.errors import InvalidResultError from polaris.utils.misc import slugify from polaris.utils.types import ( AccessType, - CompetitionPredictionsType, HttpUrlString, HubOwner, HubUser, - PredictionsType, SlugCompatibleStringType, ) @@ -261,44 +258,16 @@ def competition_artifact_id(self) -> str: return f"{self.competition_owner}/{slugify(self.competition_name)}" -class CompetitionPredictions(ResultsMetadata): - """Class specific to predictions for competition benchmarks. +class CompetitionPredictions(ResultsMetadata, BenchmarkPredictions): + """ + Predictions for competition benchmarks. This object is to be used as input to [`CompetitionSpecification.evaluate`][polaris.competition.CompetitionSpecification.evaluate]. It is used to ensure that the structure of the predictions are compatible with evaluation methods on the Polaris Hub. + In addition to the predictions, it contains additional meta-data to create a results object. Attributes: - predictions: The predictions created for a given competition's test set(s). + access: The access the returned results should have """ - predictions: Union[PredictionsType, CompetitionPredictionsType] access: Optional[AccessType] = "private" - - @field_validator("predictions") - @classmethod - def _convert_predictions(cls, value: Union[PredictionsType, CompetitionPredictionsType]): - """Convert prediction arrays from a list type to a numpy array. 
This is required for certain - operations during prediction evaluation""" - - if isinstance(value, list): - return np.array(value) - elif isinstance(value, np.ndarray): - return value - elif isinstance(value, dict): - for key, val in value.items(): - value[key] = cls._convert_predictions(val) - return value - - @field_serializer("predictions") - def _serialize_predictions(self, value: PredictionsType): - """Used to serialize a Predictions object such that it can be sent over the wire during - external evaluation for competitions""" - - if isinstance(value, np.ndarray): - return value.tolist() - elif isinstance(value, list): - return value - elif isinstance(value, dict): - for key, val in value.items(): - value[key] = self._serialize_predictions(val) - return value diff --git a/polaris/evaluate/metrics/__init__.py b/polaris/evaluate/metrics/__init__.py index d14bdf17..0b14ceec 100644 --- a/polaris/evaluate/metrics/__init__.py +++ b/polaris/evaluate/metrics/__init__.py @@ -1,7 +1,7 @@ +from polaris.evaluate.metrics.docking_metrics import rmsd_coverage from polaris.evaluate.metrics.generic_metrics import ( - cohen_kappa_score, absolute_average_fold_error, - spearman, + cohen_kappa_score, pearsonr, + spearman, ) -from polaris.evaluate.metrics.docking_metrics import rmsd_coverage diff --git a/polaris/evaluate/utils.py b/polaris/evaluate/utils.py index 65e8778e..b2c72280 100644 --- a/polaris/evaluate/utils.py +++ b/polaris/evaluate/utils.py @@ -1,45 +1,42 @@ import numpy as np import pandas as pd -from typing import Optional - -from polaris.evaluate import BenchmarkResults, ResultsType -from polaris.utils.types import PredictionsType -from polaris.evaluate import Metric from numpy.typing import NDArray - -def is_multi_task_single_test_set(vals: PredictionsType, target_cols: list[str]): - """Check if the given values are for a multiple-task benchmark with a single - test set. This is inferred by comparing the target names with the keys of the - given data. If all keys in the given data match the target column names, we - assume they are target names (as opposed to test set names for a single-task, - multiple test set benchmark).""" - return all(k in target_cols for k in vals) +from polaris.evaluate import BenchmarkPredictions, BenchmarkResults, Metric, ResultsType +from polaris.utils.types import IncomingPredictionsType -def normalize_predictions_type(vals: PredictionsType, target_cols: list[str]): - if isinstance(vals, dict): - if is_multi_task_single_test_set(vals, target_cols): - return {"test": vals} - else: - return vals - elif vals is None: +def _optionally_get(preds: BenchmarkPredictions | None, keys: list[str] | str) -> dict | None: + """ + Returns the value in a nested dictionary associated with a sequence of keys + if it exists, otherwise return None + """ + if preds is None: return None - else: - return {"test": {target_cols[0]: vals}} - -def safe_mask( - input_values: dict | dict[str, dict], test_label: str, target_label: str, mask: NDArray[np.bool_] -): - if ( - input_values is None - or input_values.get(test_label) is None - or input_values[test_label].get(target_label) is None - ): + if not isinstance(keys, list): + keys = [keys] + + d = preds.predictions + for k in keys: + d = d.get(k) + if d is None: + return None + return d + + +def _safe_mask( + preds: BenchmarkPredictions | None, + mask: NDArray[np.bool_], + keys: list[str], +) -> NDArray[np.float64] | None: + """ + Mask a prediction array if it exists in a nested array. 
Otherwise return None + """ + v = _optionally_get(preds, keys) + if v is None: return None - else: - return np.array(input_values[test_label][target_label])[mask] + return v[mask] def mask_index(input_values): @@ -58,44 +55,61 @@ def mask_index(input_values): def evaluate_benchmark( target_cols: list[str], + test_set_labels: list[str], + test_set_sizes: dict[str, int], metrics: list[Metric], - y_true: PredictionsType, - y_pred: Optional[PredictionsType] = None, - y_prob: Optional[PredictionsType] = None, + y_true: IncomingPredictionsType, + y_pred: IncomingPredictionsType | None = None, + y_prob: IncomingPredictionsType | None = None, ): - y_true = normalize_predictions_type(y_true, target_cols) - y_pred = normalize_predictions_type(y_pred, target_cols) - y_prob = normalize_predictions_type(y_prob, target_cols) - - if y_pred and set(y_true.keys()) != set(y_pred.keys()): - raise KeyError(f"Missing keys for at least one of the test sets. Expecting: {sorted(y_true.keys())}") - + """ + Utility function that contains the evaluation logic for a benchmark + """ + + # Normalize the ground truth and predictions to a consistent, internal representation. + # Format is a two-level dictionary: {test_set_label: {target_label: np.ndarray}} + y_true = BenchmarkPredictions( + predictions=y_true, + target_labels=target_cols, + test_set_labels=test_set_labels, + test_set_sizes=test_set_sizes, + ) + if y_pred is not None: + y_pred = BenchmarkPredictions( + predictions=y_pred, + target_labels=target_cols, + test_set_labels=test_set_labels, + test_set_sizes=test_set_sizes, + ) + if y_prob is not None: + y_prob = BenchmarkPredictions( + predictions=y_prob, + target_labels=target_cols, + test_set_labels=test_set_labels, + test_set_sizes=test_set_sizes, + ) + + # Compute the results # Results are saved in a tabular format. For more info, see the BenchmarkResults docs. scores: ResultsType = pd.DataFrame(columns=BenchmarkResults.RESULTS_COLUMNS) # For every test set... - for test_label, y_true_subset in y_true.items(): + for test_label, y_true_test in y_true.predictions.items(): # For every metric... for metric in metrics: if metric.is_multitask: # Multi-task but with a metric across targets score = metric( - y_true=y_true_subset, y_pred=y_pred.get(test_label), y_prob=y_prob.get(test_label) + y_true=y_true_test, + y_pred=_optionally_get(y_pred, test_label), + y_prob=_optionally_get(y_prob, test_label), ) scores.loc[len(scores)] = (test_label, "aggregated", metric, score) continue - if not isinstance(y_true_subset, dict): - # Single task - score = metric( - y_true=y_true_subset, y_pred=y_pred.get(test_label), y_prob=y_prob.get(test_label) - ) - scores.loc[len(scores)] = (test_label, target_cols[0], metric, score) - continue - # Otherwise, for every target... - for target_label, y_true_target in y_true_subset.items(): + for target_label, y_true_target in y_true_test.items(): # Single-task metrics for a multi-task benchmark # In such a setting, there can be NaN values, which we thus have to filter out. 
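+                # Note that `y_pred` and `y_prob` may be None here (e.g. `y_prob` is only supplied for
+                # probability-based metrics); in that case `_safe_mask` simply returns None as well.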
@@ -103,8 +117,8 @@ def evaluate_benchmark( score = metric( y_true=y_true_target[mask], - y_pred=safe_mask(y_pred, test_label, target_label, mask), - y_prob=safe_mask(y_prob, test_label, target_label, mask), + y_pred=_safe_mask(y_pred, mask, [test_label, target_label]), + y_prob=_safe_mask(y_prob, mask, [test_label, target_label]), ) scores.loc[len(scores)] = (test_label, target_label, metric, score) diff --git a/polaris/hub/client.py b/polaris/hub/client.py index d2278452..b7c3247d 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -23,9 +23,9 @@ SingleTaskBenchmarkSpecification, ) from polaris.competition import CompetitionSpecification + +from polaris.evaluate import BenchmarkResults, CompetitionPredictions, CompetitionResults from polaris.dataset import CompetitionDataset, Dataset, DatasetV1 -from polaris.evaluate import BenchmarkResults, CompetitionResults -from polaris.evaluate._results import CompetitionPredictions from polaris.experimental._dataset_v2 import DatasetV2 from polaris.hub.external_client import ExternalAuthClient from polaris.hub.oauth import CachedTokenAuth diff --git a/polaris/utils/misc.py b/polaris/utils/misc.py index b9156ea5..79b6583d 100644 --- a/polaris/utils/misc.py +++ b/polaris/utils/misc.py @@ -1,6 +1,8 @@ from typing import Any -from polaris.utils.types import SlugCompatibleStringType, SlugStringType +import numpy as np + +from polaris.utils.types import ListOrArrayType, SlugCompatibleStringType, SlugStringType def listit(t: Any): @@ -16,3 +18,19 @@ def slugify(sluggable: SlugCompatibleStringType) -> SlugStringType: Converts a slug-compatible string to a slug. """ return sluggable.lower().replace("_", "-").strip("-") + + +def convert_lists_to_arrays(predictions: ListOrArrayType | dict) -> np.ndarray | dict: + """ + Recursively converts all plain Python lists in the predictions object to numpy arrays + """ + + def convert_to_array(v): + if isinstance(v, np.ndarray): + return v + elif isinstance(v, list): + return np.array(v) + elif isinstance(v, dict): + return {k: convert_to_array(v) for k, v in v.items()} + + return convert_to_array(predictions) diff --git a/polaris/utils/types.py b/polaris/utils/types.py index 27c94992..86d8d4ef 100644 --- a/polaris/utils/types.py +++ b/polaris/utils/types.py @@ -25,19 +25,26 @@ The second item can either be a single test set or a dictionary with multiple, named test sets. """ -PredictionsType: TypeAlias = Union[np.ndarray, dict[str, Union[np.ndarray, dict[str, np.ndarray]]]] +ListOrArrayType: TypeAlias = list | np.ndarray """ -A prediction is one of three things: +A list of numbers or a numpy array. Predictions can be provided as either a list or a numpy array. +""" + +IncomingPredictionsType: TypeAlias = ListOrArrayType | dict[str, ListOrArrayType | dict[str, ListOrArrayType]] +""" +The type of the predictions that are ingested into the Polaris BenchmarkPredictions object. Can be one +of the following: - A single array (single-task, single test set) - A dictionary of arrays (single-task, multiple test sets) - A dictionary of dictionaries of arrays (multi-task, multiple test sets) """ -CompetitionPredictionsType: TypeAlias = Union[list, dict[str, Union[list, dict[str, list]]]] +PredictionsType: TypeAlias = dict[str, dict[str, np.ndarray]] """ -An additional type to represent the structure of predictions which are specific to competitions. This -type allows for the predictions to be sent over the wire for external evaluation. +The normalized format for predictions for internal use. 
Predictions are accepted in a generous +variety of representations and normalized into this standard format, a dictionary of dictionaries +that looks like {"test_set_name": {"target_name": np.ndarray}}. """ DatapointPartType = Union[Any, tuple[Any], dict[str, Any]] diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index a3b998ea..41bf5ce0 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -36,7 +36,7 @@ def test_split_verification(is_single_task, test_single_task_benchmark, test_mul cls(split=(train_split, {"test": []}), **default_kwargs) # Non-exclusive partitions with pytest.raises(ValidationError): - cls(split=(train_split, test_split + train_split[:1]), **default_kwargs) + cls(split=(train_split, test_split["test"] + train_split[:1]), **default_kwargs) with pytest.raises(ValidationError): cls(split=(train_split, {"test1": test_split, "test2": train_split[:1]}), **default_kwargs) # Invalid indices @@ -45,21 +45,22 @@ def test_split_verification(is_single_task, test_single_task_benchmark, test_mul with pytest.raises(ValidationError): cls(split=(train_split + [-1], test_split), **default_kwargs) with pytest.raises(ValidationError): - cls(split=(train_split, test_split + [len(obj.dataset)]), **default_kwargs) + cls(split=(train_split, test_split["test"] + [len(obj.dataset)]), **default_kwargs) with pytest.raises(ValidationError): - cls(split=(train_split, test_split + [-1]), **default_kwargs) + cls(split=(train_split, test_split["test"] + [-1]), **default_kwargs) # Duplicate indices with pytest.raises(ValidationError): cls(split=(train_split + train_split[:1], test_split), **default_kwargs) with pytest.raises(ValidationError): - cls(split=(train_split, test_split + test_split[:1]), **default_kwargs) + cls(split=(train_split, test_split["test"] + test_split["test"][:1]), **default_kwargs) with pytest.raises(ValidationError): cls( - split=(train_split, {"test1": test_split, "test2": test_split + test_split[:1]}), **default_kwargs + split=(train_split, {"test1": test_split, "test2": test_split["test"] + test_split["test"][:1]}), + **default_kwargs, ) # It should _not_ fail with duplicate indices across test partitions - cls(split=(train_split, {"test1": test_split, "test2": test_split}), **default_kwargs) + cls(split=(train_split, {"test1": test_split["test"], "test2": test_split["test"]}), **default_kwargs) # It should _not_ fail with missing indices cls(split=(train_split[:-1], test_split), **default_kwargs) # It should _not_ fail with an empty train set @@ -178,7 +179,7 @@ def _check_for_failure(_kwargs): _check_for_failure(kwargs) kwargs = obj.model_dump() - kwargs["split"] = kwargs["split"][0], kwargs["split"][1][1:] + kwargs["split"] = kwargs["split"][0], kwargs["split"][1]["test"][1:] _check_for_failure(kwargs) # Metrics diff --git a/tests/test_benchmark_predictions.py b/tests/test_benchmark_predictions.py new file mode 100644 index 00000000..a2112a5c --- /dev/null +++ b/tests/test_benchmark_predictions.py @@ -0,0 +1,247 @@ +import numpy as np +import pytest + +from polaris.evaluate import BenchmarkPredictions + + +def assert_deep_equal(result, expected): + assert isinstance(result, type(expected)), f"Types differ: {type(result)} != {type(expected)}" + + if isinstance(expected, dict): + assert result.keys() == expected.keys() + for key in expected: + assert_deep_equal(result[key], expected[key]) + elif isinstance(expected, np.ndarray): + assert np.array_equal(result, expected) + else: + assert result == expected + + +def 
test_benchmark_predictions_normalization(): + # Single task, single test set + assert_deep_equal( + {"test": {"col1": np.array([1, 2, 3])}}, + BenchmarkPredictions( + predictions=[1, 2, 3], + target_labels=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ).predictions, + ) + assert_deep_equal( + {"test": {"col1": np.array([1, 2, 3])}}, + BenchmarkPredictions( + predictions={"test": {"col1": [1, 2, 3]}}, + target_labels=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ).predictions, + ) + + # Single task, multiple test sets + assert_deep_equal( + {"test": {"col1": np.array([1, 2, 3])}, "test2": {"col1": np.array([4, 5, 6])}}, + BenchmarkPredictions( + predictions={"test": [1, 2, 3], "test2": [4, 5, 6]}, + target_labels=["col1"], + test_set_labels=["test", "test2"], + test_set_sizes={"test": 3, "test2": 3}, + ).predictions, + ) + assert_deep_equal( + {"test1": {"col1": np.array([1, 2, 3])}, "test2": {"col1": np.array([4, 5, 6])}}, + BenchmarkPredictions( + predictions={"test1": {"col1": [1, 2, 3]}, "test2": {"col1": [4, 5, 6]}}, + target_labels=["col1"], + test_set_labels=["test1", "test2"], + test_set_sizes={"test1": 3, "test2": 3}, + ).predictions, + ) + + # Multi-task, single test set + assert_deep_equal( + {"test": {"col1": np.array([1, 2, 3]), "col2": np.array([4, 5, 6])}}, + BenchmarkPredictions( + predictions={"col1": [1, 2, 3], "col2": [4, 5, 6]}, + target_labels=["col1", "col2"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ).predictions, + ) + assert_deep_equal( + {"test": {"col1": np.array([1, 2, 3]), "col2": np.array([4, 5, 6])}}, + BenchmarkPredictions( + predictions={"test": {"col1": [1, 2, 3], "col2": [4, 5, 6]}}, + target_labels=["col1", "col2"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ).predictions, + ) + + # Multi-task, multiple test sets + assert_deep_equal( + { + "test1": {"col1": np.array([1, 2, 3]), "col2": np.array([4, 5, 6])}, + "test2": {"col1": np.array([7, 8, 9]), "col2": np.array([10, 11, 12])}, + }, + BenchmarkPredictions( + predictions={ + "test1": {"col1": [1, 2, 3], "col2": [4, 5, 6]}, + "test2": {"col1": [7, 8, 9], "col2": [10, 11, 12]}, + }, + target_labels=["col1", "col2"], + test_set_labels=["test1", "test2"], + test_set_sizes={"test1": 3, "test2": 3}, + ).predictions, + ) + + +def test_benchmark_predictions_incorrect_keys(): + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions=[1, 2, 3], + target_labels=["col1"], + test_set_labels=["test1", "test2"], + test_set_sizes={"test1": 3}, + ) + + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions=[1, 2, 3], + target_labels=["col1", "col2"], + test_set_labels=["test1"], + test_set_sizes={"test1": 3}, + ) + + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"col1": [1, 2, 3]}, + target_labels=["col1"], + test_set_labels=["test1", "test2"], + test_set_sizes={"test1": 3, "test2": 3}, + ) + + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"test1": {"col1": [1, 2, 3]}, "test2": {"col1": [4, 5, 6]}}, + target_labels=["col1"], + test_set_labels=["test1", "test2", "test3"], + test_set_sizes={"test1": 3, "test2": 3, "test3": 3}, + ) + + +def test_benchmark_predictions_type_checking(): + v1 = {"test": {"col1": ["strings", "also", "valid"]}} + v2 = BenchmarkPredictions( + predictions=["strings", "also", "valid"], + target_labels=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ).predictions + + assert list(v1.keys()) == ["test"] + assert 
list(v2.keys()) == ["test"] + assert list(v1["test"].keys()) == ["col1"] + assert list(v2["test"].keys()) == ["col1"] + assert isinstance(v2["test"]["col1"], np.ndarray) + assert np.array_equal(v2["test"]["col1"], np.array(["strings", "also", "valid"])) + + +def test_invalid_benchmark_predictions_errors(): + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"test": {"col1": [1, 2, 3]}, "test2": [4, 5, 6]}, + target_cols=["col1", "col2"], + test_set_labels=["test", "test2"], + test_set_sizes={"test": 3, "test2": 3}, + ) + + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"test": {"col1": "not an array or list"}}, + target_cols=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 1}, + ) + + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"test": {"wrong column name": [1, 2, 3]}}, + target_cols=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ) + + # You should either fully or minimally specify the predictions. + # We don't allow in-between results. + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"col1": [1, 2, 3]}, + target_cols=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ) + + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"test": [1, 2, 3]}, + target_cols=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ) + + # You shouldn't specify more keys than expected. + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"test": {"col1": [1, 2, 3], "col2": [4, 5, 6]}}, + target_cols=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ) + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"test": {"col1": [1, 2, 3]}, "test2": {"col1": [1, 2, 3]}}, + target_cols=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3, "test2": 3}, + ) + + # Incorrect size + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"test": {"col": [1, 2, 3, 4]}}, + target_cols=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ) + + # Invalid test_set_sizes + with pytest.raises(ValueError): + BenchmarkPredictions( + predictions={"test": {"col": [1, 2, 3]}}, + target_cols=["col1"], + test_set_labels=["test"], + test_set_sizes={"test1": 3}, + ) + + +def test_benchmark_predictions_serialization(): + predictions = BenchmarkPredictions( + predictions=[1, 2, 3], + target_labels=["col1"], + test_set_labels=["test"], + test_set_sizes={"test": 3}, + ) + serialized = predictions.model_dump() + assert serialized["predictions"] == {"test": {"col1": [1, 2, 3]}} + assert serialized["target_labels"] == ["col1"] + assert serialized["test_set_labels"] == ["test"] + + deserialized = BenchmarkPredictions(**serialized) + assert set(deserialized.predictions.keys()) == {"test"} + assert set(deserialized.predictions["test"].keys()) == {"col1"} + assert np.array_equal(deserialized.predictions["test"]["col1"], np.array([1, 2, 3])) + assert deserialized.target_labels == ["col1"] + assert deserialized.test_set_labels == ["test"] + assert set(deserialized.test_set_sizes.keys()) == {"test"} + assert deserialized.test_set_sizes["test"] == 3 diff --git a/tests/test_competition.py b/tests/test_competition.py index e6421585..19311994 100644 --- a/tests/test_competition.py +++ b/tests/test_competition.py @@ -1,8 +1,8 @@ import numpy as np import pandas as pd -from polaris.evaluate.utils import evaluate_benchmark, normalize_predictions_type from polaris.competition 
import CompetitionSpecification +from polaris.evaluate.utils import evaluate_benchmark def test_competition_from_json(test_competition, tmpdir): @@ -15,13 +15,18 @@ def test_competition_from_json(test_competition, tmpdir): def test_multi_col_competition_evaluation(test_competition): """Test that multi-column competitions will be evaluated properly when when target labels are read as a pandas dataframe from a file.""" - data = np.random.randint(2, size=(6, 3)) + data = np.random.randint(2, size=(10, 3)) labels = pd.DataFrame(data, columns=["Column1", "Column2", "Column3"]) labels_as_from_hub = {col: np.array(labels[col]) for col in labels.columns} predictions = {target_col: np.random.randint(2, size=labels.shape[0]) for target_col in labels.columns} result = evaluate_benchmark( - ["Column1", "Column2", "Column3"], test_competition.metrics, labels_as_from_hub, y_pred=predictions + target_cols=["Column1", "Column2", "Column3"], + test_set_labels=["test"], + test_set_sizes=test_competition.test_set_sizes, + metrics=test_competition.metrics, + y_true=labels_as_from_hub, + y_pred=predictions, ) assert isinstance(result, pd.DataFrame) @@ -36,7 +41,7 @@ def test_multi_col_competition_evaluation(test_competition): def test_single_col_competition_evaluation(test_competition): """Test that multi-column competitions will be evaluated properly when when target labels are read as a pandas dataframe from a file.""" - data = np.array( + y_true = np.array( [ 1.15588236, 1.56414507, @@ -48,18 +53,18 @@ def test_single_col_competition_evaluation(test_competition): 0.86099644, 0.67568671, 2.28213589, - 1.06617679, - 1.05709529, - 0.67568671, - 0.67568671, - 0.67568671, ] ) - labels = {"LOG HLM_CLint (mL/min/kg)": data} - predictions = data + np.random.uniform(0, 3, size=len(data)) + + y_pred = y_true + np.random.uniform(0, 3, size=len(y_true)) result = evaluate_benchmark( - ["LOG HLM_CLint (mL/min/kg)"], test_competition.metrics, labels, y_pred=predictions + target_cols=["LOG HLM_CLint (mL/min/kg)"], + test_set_labels=["test"], + test_set_sizes=test_competition.test_set_sizes, + metrics=test_competition.metrics, + y_true=y_true, + y_pred=y_pred, ) assert isinstance(result, pd.DataFrame) @@ -69,35 +74,3 @@ def test_single_col_competition_evaluation(test_competition): "Metric", "Score", } - - -def test_normalize_predictions_type(): - "Single column, single test set" - assert {"test": {"col1": [1, 2, 3]}} == normalize_predictions_type([1, 2, 3], ["col1"]) - assert {"test": {"col1": [1, 2, 3]}} == normalize_predictions_type({"col1": [1, 2, 3]}, ["col1"]) - assert {"test": {"col1": [1, 2, 3]}} == normalize_predictions_type( - {"test": {"col1": [1, 2, 3]}}, ["col1"] - ) - - "Multi-column, single test set" - assert {"test": {"col1": [1, 2, 3], "col2": [4, 5, 6]}} == normalize_predictions_type( - {"col1": [1, 2, 3], "col2": [4, 5, 6]}, ["col1", "col2"] - ) - - assert {"test": {"col1": [1, 2, 3], "col2": [4, 5, 6]}} == normalize_predictions_type( - {"test": {"col1": [1, 2, 3], "col2": [4, 5, 6]}}, ["col1", "col2"] - ) - - "Single column, multi-test set" - assert {"test1": {"col1": [1, 2, 3]}, "test2": {"col1": [4, 5, 6]}} == normalize_predictions_type( - {"test1": {"col1": [1, 2, 3]}, "test2": {"col1": [4, 5, 6]}}, ["col1"] - ) - - "Multi-column, multi-test set" - assert { - "test1": {"col1": [1, 2, 3], "col2": [4, 5, 6]}, - "test2": {"col1": [7, 8, 9], "col2": [10, 11, 12]}, - } == normalize_predictions_type( - {"test1": {"col1": [1, 2, 3], "col2": [4, 5, 6]}, "test2": {"col1": [7, 8, 9], "col2": [10, 11, 12]}}, - 
["col1", "col2"], - )